From 305d83efa89418ce71809a08a1aca773edab751d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9my=20Bobbio=20=28Lunar=29?=
 <lunar@softwareheritage.org>
Date: Thu, 20 Jun 2024 15:20:24 +0200
Subject: [PATCH] Handle missing objects in revision_log() and
 revision_shortlog()

A revision in a log can be missing from the storage. While holes are
unusual, they can happen. Dedicated tests were added for the issue.

TODO:
- [ ] A decision needs to be made on what should be returned by
      `revision_short_log()` in case of a missing revision.
---
 swh/storage/cassandra/storage.py        | 11 +++--
 swh/storage/postgresql/storage.py       | 12 +++--
 swh/storage/proxies/masking/__init__.py | 13 ++---
 swh/storage/sql/40-funcs.sql            |  8 ++--
 swh/storage/sql/upgrades/194.sql        | 32 +++++++++++++
 swh/storage/tests/storage_tests.py      | 63 ++++++++++++++++++++++++-
 6 files changed, 122 insertions(+), 17 deletions(-)
 create mode 100644 swh/storage/sql/upgrades/194.sql

diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
index 0ea476ec3..5e5eee7b8 100644
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -876,15 +876,15 @@ class CassandraStorage:
         limit: Optional[int],
         short: bool,
     ) -> Union[
-        Iterable[Dict[str, Any]],
+        Iterable[Optional[Dict[str, Any]]],
         Iterable[Tuple[Sha1Git, Tuple[Sha1Git, ...]]],
     ]:
         if limit and len(seen) >= limit:
             return
-        rev_ids = [id_ for id_ in rev_ids if id_ not in seen]
+        rev_ids = set(rev_ids) - seen
         if not rev_ids:
             return
-        seen |= set(rev_ids)
+        seen |= rev_ids
 
         # We need this query, even if short=True, to return consistent
         # results (ie. not return only a subset of a revision's parents
@@ -906,7 +906,9 @@ class CassandraStorage:
         else:
             rows = self._cql_runner.revision_get(rev_ids)
 
+            seen_ids = set()
             for row in rows:
+                seen_ids.add(row.id)
                 # TODO: use a single query to get all parents?
                 # (it might have less latency, but requires less code and more
                 # bandwidth (because revision id would be part of each returned
@@ -919,6 +921,9 @@ class CassandraStorage:
                 rev = converters.revision_from_db(row, parents=parents)
                 yield rev.to_dict()
                 yield from self._get_parent_revs(parents, seen, limit, short)
+            # handle missing revisions
+            for _ in rev_ids - seen_ids:
+                yield None
 
     def revision_get_partition(
         self,
diff --git a/swh/storage/postgresql/storage.py b/swh/storage/postgresql/storage.py
index 47143cea2..9466dd7f6 100644
--- a/swh/storage/postgresql/storage.py
+++ b/swh/storage/postgresql/storage.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2023  The Software Heritage developers
+# Copyright (C) 2015-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -927,7 +927,9 @@ class Storage:
         for line in db.revision_log(
             revisions, ignore_displayname=ignore_displayname, limit=limit, cur=cur
         ):
-            data = converters.db_to_revision(dict(zip(db.revision_get_cols, line)))
+            data = converters.db_to_optional_revision(
+                dict(zip(db.revision_get_cols, line))
+            )
             if not data:
                 yield None
                 continue
@@ -937,7 +939,11 @@ class Storage:
     def revision_shortlog(
         self, revisions: List[Sha1Git], limit: Optional[int] = None, *, db: Db, cur=None
     ) -> Iterable[Optional[Tuple[Sha1Git, Tuple[Sha1Git, ...]]]]:
-        yield from db.revision_shortlog(revisions, limit, cur)
+        for rev_id, parents in db.revision_shortlog(revisions, limit, cur):
+            if rev_id is None:
+                yield None
+            else:
+                yield (rev_id, parents)
 
     @db_transaction()
     def revision_get_random(self, *, db: Db, cur=None) -> Sha1Git:
diff --git a/swh/storage/proxies/masking/__init__.py b/swh/storage/proxies/masking/__init__.py
index d36943c99..f72b60ca7 100644
--- a/swh/storage/proxies/masking/__init__.py
+++ b/swh/storage/proxies/masking/__init__.py
@@ -595,17 +595,18 @@ class MaskingProxyStorage:
         revision_batches = grouper(
             self.storage.revision_log(revisions, limit=limit), BATCH_SIZE
         )
-        yield from map(
-            Revision.to_dict,
-            itertools.chain.from_iterable(
+        yield from [
+            Revision.to_dict(r) if r else None
+            for r in itertools.chain.from_iterable(
                 self._apply_revision_display_names(
                     self._raise_if_masked_result_in_list(
-                        "revision_log", list(map(Revision.from_dict, revision_batch))
+                        "revision_log",
+                        [Revision.from_dict(d) if d else None for d in revision_batch],
                     )
                 )
                 for revision_batch in revision_batches
-            ),
-        )
+            )
+        ]
 
     def revision_get_partition(
         self,
diff --git a/swh/storage/sql/40-funcs.sql b/swh/storage/sql/40-funcs.sql
index 803d23735..53612fd91 100644
--- a/swh/storage/sql/40-funcs.sql
+++ b/swh/storage/sql/40-funcs.sql
@@ -456,10 +456,12 @@ create or replace function swh_revision_list(root_revisions bytea[], num_revs bi
     language sql
     stable
 as $$
-    with recursive full_rev_list(id) as (
-        (select id from revision where id = ANY(root_revisions))
+    with recursive full_rev_list(_index, id) as (
+        (select root_index, id
+         from generate_subscripts(root_revisions, 1) as s(root_index)
+         left join revision on id = root_revisions[root_index])
         union
-        (select h.parent_id
+        (select null, h.parent_id
          from revision_history as h
          join full_rev_list on h.id = full_rev_list.id)
     ),
diff --git a/swh/storage/sql/upgrades/194.sql b/swh/storage/sql/upgrades/194.sql
new file mode 100644
index 000000000..5a9028664
--- /dev/null
+++ b/swh/storage/sql/upgrades/194.sql
@@ -0,0 +1,32 @@
+-- SWH DB schema upgrade
+-- from_version: 193
+-- to_version: 194
+-- description: Fix swh_revision_list to return NULL when given missing revisions
+
+insert into dbversion(version, release, description)
+    values(194, now(), 'Work In Progress');
+
+
+create or replace function swh_revision_list(root_revisions bytea[], num_revs bigint default NULL)
+    returns table (id sha1_git, parents bytea[])
+    language sql
+    stable
+as $$
+    with recursive full_rev_list(_index, id) as (
+        (select root_index, id
+         from generate_subscripts(root_revisions, 1) as s(root_index)
+         left join revision on id = root_revisions[root_index])
+        union
+        (select null, h.parent_id
+         from revision_history as h
+         join full_rev_list on h.id = full_rev_list.id)
+    ),
+    rev_list as (select id from full_rev_list limit num_revs)
+    select rev_list.id as id,
+           array(select rh.parent_id::bytea
+                 from revision_history rh
+                 where rh.id = rev_list.id
+                 order by rh.parent_rank
+                ) as parent
+    from rev_list;
+$$;
diff --git a/swh/storage/tests/storage_tests.py b/swh/storage/tests/storage_tests.py
index d6e71bc51..9045de662 100644
--- a/swh/storage/tests/storage_tests.py
+++ b/swh/storage/tests/storage_tests.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2023  The Software Heritage developers
+# Copyright (C) 2015-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -1669,7 +1669,24 @@ class TestStorage:
     def test_revision_log_unknown_revision(self, swh_storage, sample_data):
         revision = sample_data.revision
         rev_log = list(swh_storage.revision_log([revision.id]))
-        assert rev_log == []
+        assert rev_log == [None]
+
+    def test_revision_log_missing_revision(self, swh_storage, sample_data):
+        revision1, revision2, revision3, revision4 = sample_data.revisions[:4]
+
+        # rev4 -is-child-of-> rev3 -> rev1, (rev2 -> rev1)
+        # but we purposely not insert rev1
+        swh_storage.revision_add([revision2, revision3, revision4])
+
+        # when
+        results = list(swh_storage.revision_log([revision4.id]))
+
+        # for comparison purposes
+        actual_results = [Revision.from_dict(r) if r else None for r in results]
+        assert len(actual_results) == 4
+        assert actual_results[0:2] == [revision4, revision3]
+        # ordering is not guaranteed between parents
+        assert set(actual_results[2:4]) == {revision2, None}
 
     def test_revision_shortlog(self, swh_storage, sample_data):
         revision1, revision2, revision3, revision4 = sample_data.revisions[:4]
@@ -1688,6 +1705,11 @@ class TestStorage:
             [revision2.id, revision2.parents],
         ]
 
+    def test_revision_shortlog_unknown_revision(self, swh_storage, sample_data):
+        revision = sample_data.revision
+        results = list(swh_storage.revision_log([revision.id]))
+        assert results == [None]
+
     def test_revision_shortlog_with_limit(self, swh_storage, sample_data):
         revision1, revision2, revision3, revision4 = sample_data.revisions[:4]
 
@@ -1699,6 +1721,43 @@ class TestStorage:
         assert len(actual_results) == 1
         assert list(actual_results[0]) == [revision4.id, revision4.parents]
 
+    def test_revision_shortlog_missing_revision(self, swh_storage, sample_data):
+        revision1, revision2, revision3, revision4 = sample_data.revisions[:4]
+
+        # rev4 -is-child-of-> rev3 -> (rev1, rev2); rev2 -> rev1
+        # but we purposely not insert rev2
+        swh_storage.revision_add([revision1, revision3, revision4])
+
+        results = list(swh_storage.revision_shortlog([revision4.id]))
+
+        pytest.skip("XXX: We need to decide on what the output should be!")
+
+        import pprint
+
+        pprint.pp(
+            [
+                (r_id.hex(), [r_parent.hex() for r_parent in r_parents])
+                for r_id, r_parents in results
+            ]
+        )
+
+        # Cassandra currently gives us:
+        assert len(results) == 3
+        assert results == [
+            [revision4.id, revision4.parents],
+            [revision3.id, revision3.parents],
+            [revision1.id, revision1.parents],
+        ]
+
+        # PostgreSQL currently gives us:
+        assert len(results) == 4
+        assert results == [
+            [revision4.id, revision4.parents],
+            [revision3.id, revision3.parents],
+            [revision1.id, revision1.parents],
+            [revision2.id, []],
+        ]
+
     def test_revision_get(self, swh_storage, sample_data):
         revision, revision2 = sample_data.revisions[:2]
 
-- 
GitLab