From 9c759bbe476b45c3352f3fc455edda82ad07fe87 Mon Sep 17 00:00:00 2001 From: John Ericson <John.Ericson@Obsidian.Systems> Date: Thu, 5 May 2022 14:48:07 -0400 Subject: [PATCH] Overhaul `lookup_missing_hashes` Keep hashes separated by type to make bugs less likely. --- swh/web/api/views/identifiers.py | 4 +-- swh/web/common/archive.py | 39 ++++++++++++++-------------- swh/web/tests/common/test_archive.py | 16 +++++++----- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/swh/web/api/views/identifiers.py b/swh/web/api/views/identifiers.py index 8d7dc2cfb..3f9491672 100644 --- a/swh/web/api/views/identifiers.py +++ b/swh/web/api/views/identifiers.py @@ -110,9 +110,7 @@ def api_swhid_known(request): # group swhids by their type swhids_by_type = group_swhids(swhids) # search for hashes not present in the storage - missing_hashes = { - k: set(archive.lookup_missing_hashes({k: v})) for k, v in swhids_by_type.items() - } + missing_hashes = archive.lookup_missing_hashes(swhids_by_type) for ty, missing_hashes in missing_hashes.items(): for hash in missing_hashes.iter(): diff --git a/swh/web/common/archive.py b/swh/web/common/archive.py index a523f2811..7719f77bf 100644 --- a/swh/web/common/archive.py +++ b/swh/web/common/archive.py @@ -1406,7 +1406,21 @@ def lookup_object(object_type: ObjectType, object_id: str) -> Dict[str, Any]: raise ValueError(f"Unexpected object type variant: {object_type}") -def lookup_missing_hashes(grouped_swhids: Dict[str, List[bytes]]) -> Set[bytes]: +# TODO factored out into swh-storage in D7751. Use that version once it +# lands. +def _identifiers_missing(obj_type: ObjectType, obj_ids: List[bytes]) -> Iterable[bytes]: + return { + ObjectType.CONTENT: storage.content_missing_per_sha1_git, + ObjectType.DIRECTORY: storage.directory_missing, + ObjectType.REVISION: storage.revision_missing, + ObjectType.RELEASE: storage.release_missing, + ObjectType.SNAPSHOT: storage.snapshot_missing, + }[obj_type](obj_ids) + + +def lookup_missing_hashes( + grouped_swhids: Dict[ObjectType, List[bytes]] +) -> Dict[ObjectType, Set[bytes]]: """Lookup missing Software Heritage persistent identifier hash, using batch processing. @@ -1415,25 +1429,12 @@ def lookup_missing_hashes(grouped_swhids: Dict[str, List[bytes]]) -> Set[bytes]: keys: object types values: object hashes Returns: - A set(bytes) of the hashes not found in the storage + A dictionary per type with set(bytes) of the hashes not found in the storage """ - missing_hashes = [] - - for obj_type, obj_ids in grouped_swhids.items(): - if obj_type == ObjectType.CONTENT: - missing_hashes.append(storage.content_missing_per_sha1_git(obj_ids)) - elif obj_type == ObjectType.DIRECTORY: - missing_hashes.append(storage.directory_missing(obj_ids)) - elif obj_type == ObjectType.REVISION: - missing_hashes.append(storage.revision_missing(obj_ids)) - elif obj_type == ObjectType.RELEASE: - missing_hashes.append(storage.release_missing(obj_ids)) - elif obj_type == ObjectType.SNAPSHOT: - missing_hashes.append(storage.snapshot_missing(obj_ids)) - - missing = set(itertools.chain(*missing_hashes)) - - return missing + return { + obj_type: set(_identifiers_missing(obj_type, obj_ids)) + for obj_type, obj_ids in grouped_swhids.items() + } def lookup_origins_by_sha1s(sha1s: List[str]) -> Iterator[Optional[OriginInfo]]: diff --git a/swh/web/tests/common/test_archive.py b/swh/web/tests/common/test_archive.py index fdaa61d65..fe7969bf0 100644 --- a/swh/web/tests/common/test_archive.py +++ b/swh/web/tests/common/test_archive.py @@ -958,11 +958,11 @@ def test_lookup_missing_hashes_non_present(): actual_result = archive.lookup_missing_hashes(grouped_swhids) assert actual_result == { - missing_cnt, - missing_dir, - missing_rev, - missing_rel, - missing_snp, + ObjectType.CONTENT: {missing_cnt}, + ObjectType.DIRECTORY: {missing_dir}, + ObjectType.REVISION: {missing_rev}, + ObjectType.RELEASE: {missing_rel}, + ObjectType.SNAPSHOT: {missing_snp}, } @@ -981,7 +981,11 @@ def test_lookup_missing_hashes_some_present(content, directory): actual_result = archive.lookup_missing_hashes(grouped_swhids) - assert actual_result == {missing_rev, missing_rel, missing_snp} + assert actual_result == { + ObjectType.REVISION: {missing_rev}, + ObjectType.RELEASE: {missing_rel}, + ObjectType.SNAPSHOT: {missing_snp}, + } def test_lookup_origin_extra_trailing_slash(origin): -- GitLab