From 9c759bbe476b45c3352f3fc455edda82ad07fe87 Mon Sep 17 00:00:00 2001
From: John Ericson <John.Ericson@Obsidian.Systems>
Date: Thu, 5 May 2022 14:48:07 -0400
Subject: [PATCH] Overhaul `lookup_missing_hashes`

Keep hashes separated by type to make bugs less likely.
---
 swh/web/api/views/identifiers.py     |  4 +--
 swh/web/common/archive.py            | 39 ++++++++++++++--------------
 swh/web/tests/common/test_archive.py | 16 +++++++-----
 3 files changed, 31 insertions(+), 28 deletions(-)

diff --git a/swh/web/api/views/identifiers.py b/swh/web/api/views/identifiers.py
index 8d7dc2cfb..3f9491672 100644
--- a/swh/web/api/views/identifiers.py
+++ b/swh/web/api/views/identifiers.py
@@ -110,9 +110,7 @@ def api_swhid_known(request):
     # group swhids by their type
     swhids_by_type = group_swhids(swhids)
     # search for hashes not present in the storage
-    missing_hashes = {
-        k: set(archive.lookup_missing_hashes({k: v})) for k, v in swhids_by_type.items()
-    }
+    missing_hashes = archive.lookup_missing_hashes(swhids_by_type)
 
     for ty, missing_hashes in missing_hashes.items():
         for hash in missing_hashes.iter():
diff --git a/swh/web/common/archive.py b/swh/web/common/archive.py
index a523f2811..7719f77bf 100644
--- a/swh/web/common/archive.py
+++ b/swh/web/common/archive.py
@@ -1406,7 +1406,21 @@ def lookup_object(object_type: ObjectType, object_id: str) -> Dict[str, Any]:
         raise ValueError(f"Unexpected object type variant: {object_type}")
 
 
-def lookup_missing_hashes(grouped_swhids: Dict[str, List[bytes]]) -> Set[bytes]:
+# TODO factored out into swh-storage in D7751. Use that version once it
+# lands.
+def _identifiers_missing(obj_type: ObjectType, obj_ids: List[bytes]) -> Iterable[bytes]:
+    return {
+        ObjectType.CONTENT: storage.content_missing_per_sha1_git,
+        ObjectType.DIRECTORY: storage.directory_missing,
+        ObjectType.REVISION: storage.revision_missing,
+        ObjectType.RELEASE: storage.release_missing,
+        ObjectType.SNAPSHOT: storage.snapshot_missing,
+    }[obj_type](obj_ids)
+
+
+def lookup_missing_hashes(
+    grouped_swhids: Dict[ObjectType, List[bytes]]
+) -> Dict[ObjectType, Set[bytes]]:
     """Lookup missing Software Heritage persistent identifier hash, using
     batch processing.
 
@@ -1415,25 +1429,12 @@ def lookup_missing_hashes(grouped_swhids: Dict[str, List[bytes]]) -> Set[bytes]:
         keys: object types
         values: object hashes
     Returns:
-        A set(bytes) of the hashes not found in the storage
+        A dictionary per type with set(bytes) of the hashes not found in the storage
     """
-    missing_hashes = []
-
-    for obj_type, obj_ids in grouped_swhids.items():
-        if obj_type == ObjectType.CONTENT:
-            missing_hashes.append(storage.content_missing_per_sha1_git(obj_ids))
-        elif obj_type == ObjectType.DIRECTORY:
-            missing_hashes.append(storage.directory_missing(obj_ids))
-        elif obj_type == ObjectType.REVISION:
-            missing_hashes.append(storage.revision_missing(obj_ids))
-        elif obj_type == ObjectType.RELEASE:
-            missing_hashes.append(storage.release_missing(obj_ids))
-        elif obj_type == ObjectType.SNAPSHOT:
-            missing_hashes.append(storage.snapshot_missing(obj_ids))
-
-    missing = set(itertools.chain(*missing_hashes))
-
-    return missing
+    return {
+        obj_type: set(_identifiers_missing(obj_type, obj_ids))
+        for obj_type, obj_ids in grouped_swhids.items()
+    }
 
 
 def lookup_origins_by_sha1s(sha1s: List[str]) -> Iterator[Optional[OriginInfo]]:
diff --git a/swh/web/tests/common/test_archive.py b/swh/web/tests/common/test_archive.py
index fdaa61d65..fe7969bf0 100644
--- a/swh/web/tests/common/test_archive.py
+++ b/swh/web/tests/common/test_archive.py
@@ -958,11 +958,11 @@ def test_lookup_missing_hashes_non_present():
     actual_result = archive.lookup_missing_hashes(grouped_swhids)
 
     assert actual_result == {
-        missing_cnt,
-        missing_dir,
-        missing_rev,
-        missing_rel,
-        missing_snp,
+        ObjectType.CONTENT: {missing_cnt},
+        ObjectType.DIRECTORY: {missing_dir},
+        ObjectType.REVISION: {missing_rev},
+        ObjectType.RELEASE: {missing_rel},
+        ObjectType.SNAPSHOT: {missing_snp},
     }
 
 
@@ -981,7 +981,11 @@ def test_lookup_missing_hashes_some_present(content, directory):
 
     actual_result = archive.lookup_missing_hashes(grouped_swhids)
 
-    assert actual_result == {missing_rev, missing_rel, missing_snp}
+    assert actual_result == {
+        ObjectType.REVISION: {missing_rev},
+        ObjectType.RELEASE: {missing_rel},
+        ObjectType.SNAPSHOT: {missing_snp},
+    }
 
 
 def test_lookup_origin_extra_trailing_slash(origin):
-- 
GitLab