From 305d83efa89418ce71809a08a1aca773edab751d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20Bobbio=20=28Lunar=29?= <lunar@softwareheritage.org> Date: Thu, 20 Jun 2024 15:20:24 +0200 Subject: [PATCH] Handle missing objects in revision_log() and revision_shortlog() A revision in a log can be missing from the storage. While holes are unusual, they can happen. Dedicated tests were added for the issue. TODO: - [ ] A decision needs to be made on what should be returned by `revision_short_log()` in case of a missing revision. --- swh/storage/cassandra/storage.py | 11 +++-- swh/storage/postgresql/storage.py | 12 +++-- swh/storage/proxies/masking/__init__.py | 13 ++--- swh/storage/sql/40-funcs.sql | 8 ++-- swh/storage/sql/upgrades/194.sql | 32 +++++++++++++ swh/storage/tests/storage_tests.py | 63 ++++++++++++++++++++++++- 6 files changed, 122 insertions(+), 17 deletions(-) create mode 100644 swh/storage/sql/upgrades/194.sql diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py index 0ea476ec3..5e5eee7b8 100644 --- a/swh/storage/cassandra/storage.py +++ b/swh/storage/cassandra/storage.py @@ -876,15 +876,15 @@ class CassandraStorage: limit: Optional[int], short: bool, ) -> Union[ - Iterable[Dict[str, Any]], + Iterable[Optional[Dict[str, Any]]], Iterable[Tuple[Sha1Git, Tuple[Sha1Git, ...]]], ]: if limit and len(seen) >= limit: return - rev_ids = [id_ for id_ in rev_ids if id_ not in seen] + rev_ids = set(rev_ids) - seen if not rev_ids: return - seen |= set(rev_ids) + seen |= rev_ids # We need this query, even if short=True, to return consistent # results (ie. not return only a subset of a revision's parents @@ -906,7 +906,9 @@ class CassandraStorage: else: rows = self._cql_runner.revision_get(rev_ids) + seen_ids = set() for row in rows: + seen_ids.add(row.id) # TODO: use a single query to get all parents? # (it might have less latency, but requires less code and more # bandwidth (because revision id would be part of each returned @@ -919,6 +921,9 @@ class CassandraStorage: rev = converters.revision_from_db(row, parents=parents) yield rev.to_dict() yield from self._get_parent_revs(parents, seen, limit, short) + # handle missing revisions + for _ in rev_ids - seen_ids: + yield None def revision_get_partition( self, diff --git a/swh/storage/postgresql/storage.py b/swh/storage/postgresql/storage.py index 47143cea2..9466dd7f6 100644 --- a/swh/storage/postgresql/storage.py +++ b/swh/storage/postgresql/storage.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2023 The Software Heritage developers +# Copyright (C) 2015-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -927,7 +927,9 @@ class Storage: for line in db.revision_log( revisions, ignore_displayname=ignore_displayname, limit=limit, cur=cur ): - data = converters.db_to_revision(dict(zip(db.revision_get_cols, line))) + data = converters.db_to_optional_revision( + dict(zip(db.revision_get_cols, line)) + ) if not data: yield None continue @@ -937,7 +939,11 @@ class Storage: def revision_shortlog( self, revisions: List[Sha1Git], limit: Optional[int] = None, *, db: Db, cur=None ) -> Iterable[Optional[Tuple[Sha1Git, Tuple[Sha1Git, ...]]]]: - yield from db.revision_shortlog(revisions, limit, cur) + for rev_id, parents in db.revision_shortlog(revisions, limit, cur): + if rev_id is None: + yield None + else: + yield (rev_id, parents) @db_transaction() def revision_get_random(self, *, db: Db, cur=None) -> Sha1Git: diff --git a/swh/storage/proxies/masking/__init__.py b/swh/storage/proxies/masking/__init__.py index d36943c99..f72b60ca7 100644 --- a/swh/storage/proxies/masking/__init__.py +++ b/swh/storage/proxies/masking/__init__.py @@ -595,17 +595,18 @@ class MaskingProxyStorage: revision_batches = grouper( self.storage.revision_log(revisions, limit=limit), BATCH_SIZE ) - yield from map( - Revision.to_dict, - itertools.chain.from_iterable( + yield from [ + Revision.to_dict(r) if r else None + for r in itertools.chain.from_iterable( self._apply_revision_display_names( self._raise_if_masked_result_in_list( - "revision_log", list(map(Revision.from_dict, revision_batch)) + "revision_log", + [Revision.from_dict(d) if d else None for d in revision_batch], ) ) for revision_batch in revision_batches - ), - ) + ) + ] def revision_get_partition( self, diff --git a/swh/storage/sql/40-funcs.sql b/swh/storage/sql/40-funcs.sql index 803d23735..53612fd91 100644 --- a/swh/storage/sql/40-funcs.sql +++ b/swh/storage/sql/40-funcs.sql @@ -456,10 +456,12 @@ create or replace function swh_revision_list(root_revisions bytea[], num_revs bi language sql stable as $$ - with recursive full_rev_list(id) as ( - (select id from revision where id = ANY(root_revisions)) + with recursive full_rev_list(_index, id) as ( + (select root_index, id + from generate_subscripts(root_revisions, 1) as s(root_index) + left join revision on id = root_revisions[root_index]) union - (select h.parent_id + (select null, h.parent_id from revision_history as h join full_rev_list on h.id = full_rev_list.id) ), diff --git a/swh/storage/sql/upgrades/194.sql b/swh/storage/sql/upgrades/194.sql new file mode 100644 index 000000000..5a9028664 --- /dev/null +++ b/swh/storage/sql/upgrades/194.sql @@ -0,0 +1,32 @@ +-- SWH DB schema upgrade +-- from_version: 193 +-- to_version: 194 +-- description: Fix swh_revision_list to return NULL when given missing revisions + +insert into dbversion(version, release, description) + values(194, now(), 'Work In Progress'); + + +create or replace function swh_revision_list(root_revisions bytea[], num_revs bigint default NULL) + returns table (id sha1_git, parents bytea[]) + language sql + stable +as $$ + with recursive full_rev_list(_index, id) as ( + (select root_index, id + from generate_subscripts(root_revisions, 1) as s(root_index) + left join revision on id = root_revisions[root_index]) + union + (select null, h.parent_id + from revision_history as h + join full_rev_list on h.id = full_rev_list.id) + ), + rev_list as (select id from full_rev_list limit num_revs) + select rev_list.id as id, + array(select rh.parent_id::bytea + from revision_history rh + where rh.id = rev_list.id + order by rh.parent_rank + ) as parent + from rev_list; +$$; diff --git a/swh/storage/tests/storage_tests.py b/swh/storage/tests/storage_tests.py index d6e71bc51..9045de662 100644 --- a/swh/storage/tests/storage_tests.py +++ b/swh/storage/tests/storage_tests.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2023 The Software Heritage developers +# Copyright (C) 2015-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -1669,7 +1669,24 @@ class TestStorage: def test_revision_log_unknown_revision(self, swh_storage, sample_data): revision = sample_data.revision rev_log = list(swh_storage.revision_log([revision.id])) - assert rev_log == [] + assert rev_log == [None] + + def test_revision_log_missing_revision(self, swh_storage, sample_data): + revision1, revision2, revision3, revision4 = sample_data.revisions[:4] + + # rev4 -is-child-of-> rev3 -> rev1, (rev2 -> rev1) + # but we purposely not insert rev1 + swh_storage.revision_add([revision2, revision3, revision4]) + + # when + results = list(swh_storage.revision_log([revision4.id])) + + # for comparison purposes + actual_results = [Revision.from_dict(r) if r else None for r in results] + assert len(actual_results) == 4 + assert actual_results[0:2] == [revision4, revision3] + # ordering is not guaranteed between parents + assert set(actual_results[2:4]) == {revision2, None} def test_revision_shortlog(self, swh_storage, sample_data): revision1, revision2, revision3, revision4 = sample_data.revisions[:4] @@ -1688,6 +1705,11 @@ class TestStorage: [revision2.id, revision2.parents], ] + def test_revision_shortlog_unknown_revision(self, swh_storage, sample_data): + revision = sample_data.revision + results = list(swh_storage.revision_log([revision.id])) + assert results == [None] + def test_revision_shortlog_with_limit(self, swh_storage, sample_data): revision1, revision2, revision3, revision4 = sample_data.revisions[:4] @@ -1699,6 +1721,43 @@ class TestStorage: assert len(actual_results) == 1 assert list(actual_results[0]) == [revision4.id, revision4.parents] + def test_revision_shortlog_missing_revision(self, swh_storage, sample_data): + revision1, revision2, revision3, revision4 = sample_data.revisions[:4] + + # rev4 -is-child-of-> rev3 -> (rev1, rev2); rev2 -> rev1 + # but we purposely not insert rev2 + swh_storage.revision_add([revision1, revision3, revision4]) + + results = list(swh_storage.revision_shortlog([revision4.id])) + + pytest.skip("XXX: We need to decide on what the output should be!") + + import pprint + + pprint.pp( + [ + (r_id.hex(), [r_parent.hex() for r_parent in r_parents]) + for r_id, r_parents in results + ] + ) + + # Cassandra currently gives us: + assert len(results) == 3 + assert results == [ + [revision4.id, revision4.parents], + [revision3.id, revision3.parents], + [revision1.id, revision1.parents], + ] + + # PostgreSQL currently gives us: + assert len(results) == 4 + assert results == [ + [revision4.id, revision4.parents], + [revision3.id, revision3.parents], + [revision1.id, revision1.parents], + [revision2.id, []], + ] + def test_revision_get(self, swh_storage, sample_data): revision, revision2 = sample_data.revisions[:2] -- GitLab