Commit 9c759bbe authored by John Ericson, committed by Phabricator Migration user

Overhaul `lookup_missing_hashes`

Keep hashes separated by type to make bugs less likely.
parent 940c4aa0
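In effect, the change narrows the archive API from a flat set of hashes to a mapping keyed by object type. A minimal before/after sketch of the signatures as they read in the diff below (the `_old`/`_new` suffixes are for illustration only, and `ObjectType` is assumed to come from `swh.model.swhids`):

    from typing import Dict, List, Set

    from swh.model.swhids import ObjectType  # assumed import path

    # Before: one flat set; the type of each missing hash is lost.
    def lookup_missing_hashes_old(
        grouped_swhids: Dict[str, List[bytes]],
    ) -> Set[bytes]: ...

    # After: results stay keyed by type, so a hash can never be read as
    # missing for the wrong kind of object.
    def lookup_missing_hashes_new(
        grouped_swhids: Dict[ObjectType, List[bytes]],
    ) -> Dict[ObjectType, Set[bytes]]: ...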
@@ -110,9 +110,7 @@ def api_swhid_known(request):
     # group swhids by their type
     swhids_by_type = group_swhids(swhids)
     # search for hashes not present in the storage
-    missing_hashes = {
-        k: set(archive.lookup_missing_hashes({k: v})) for k, v in swhids_by_type.items()
-    }
+    missing_hashes = archive.lookup_missing_hashes(swhids_by_type)
 
     for ty, missing_hashes in missing_hashes.items():
         for hash in missing_hashes:
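The grouped shape is what makes the commit message's "bugs less likely" concrete: a membership test against a flat set cannot say which type a hash was missing as. A hypothetical illustration (hash value invented, `ObjectType` again assumed from `swh.model.swhids`):

    from swh.model.swhids import ObjectType  # assumed import path

    rev_hash = bytes.fromhex("aa" * 20)  # invented sha1_git value

    # Old, flat shape: a hash missing as a revision also looks missing
    # when the caller meant to ask about a directory.
    missing_flat = {rev_hash}
    assert rev_hash in missing_flat  # missing as *what*?

    # New, grouped shape: each check is scoped to one object type.
    missing_by_type = {
        ObjectType.REVISION: {rev_hash},
        ObjectType.DIRECTORY: set(),
    }
    assert rev_hash in missing_by_type[ObjectType.REVISION]
    assert rev_hash not in missing_by_type[ObjectType.DIRECTORY]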
@@ -1406,7 +1406,21 @@ def lookup_object(object_type: ObjectType, object_id: str) -> Dict[str, Any]:
     raise ValueError(f"Unexpected object type variant: {object_type}")
 
 
+# TODO factored out into swh-storage in D7751. Use that version once it
+# lands.
+def _identifiers_missing(obj_type: ObjectType, obj_ids: List[bytes]) -> Iterable[bytes]:
+    return {
+        ObjectType.CONTENT: storage.content_missing_per_sha1_git,
+        ObjectType.DIRECTORY: storage.directory_missing,
+        ObjectType.REVISION: storage.revision_missing,
+        ObjectType.RELEASE: storage.release_missing,
+        ObjectType.SNAPSHOT: storage.snapshot_missing,
+    }[obj_type](obj_ids)
+
+
-def lookup_missing_hashes(grouped_swhids: Dict[str, List[bytes]]) -> Set[bytes]:
+def lookup_missing_hashes(
+    grouped_swhids: Dict[ObjectType, List[bytes]]
+) -> Dict[ObjectType, Set[bytes]]:
     """Lookup missing Software Heritage persistent identifier hash, using
     batch processing.
 
@@ -1415,25 +1429,12 @@ def lookup_missing_hashes(grouped_swhids: Dict[str, List[bytes]]) -> Set[bytes]:
             keys: object types
            values: object hashes
 
     Returns:
-        A set(bytes) of the hashes not found in the storage
+        A dictionary per type with set(bytes) of the hashes not found in the storage
 
     """
-    missing_hashes = []
-    for obj_type, obj_ids in grouped_swhids.items():
-        if obj_type == ObjectType.CONTENT:
-            missing_hashes.append(storage.content_missing_per_sha1_git(obj_ids))
-        elif obj_type == ObjectType.DIRECTORY:
-            missing_hashes.append(storage.directory_missing(obj_ids))
-        elif obj_type == ObjectType.REVISION:
-            missing_hashes.append(storage.revision_missing(obj_ids))
-        elif obj_type == ObjectType.RELEASE:
-            missing_hashes.append(storage.release_missing(obj_ids))
-        elif obj_type == ObjectType.SNAPSHOT:
-            missing_hashes.append(storage.snapshot_missing(obj_ids))
-
-    missing = set(itertools.chain(*missing_hashes))
-
-    return missing
+    return {
+        obj_type: set(_identifiers_missing(obj_type, obj_ids))
+        for obj_type, obj_ids in grouped_swhids.items()
+    }
 
 
 def lookup_origins_by_sha1s(sha1s: List[str]) -> Iterator[Optional[OriginInfo]]:
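The `_identifiers_missing` helper swaps the old if/elif chain for a dispatch dict: adding an object type is one entry, and an unhandled type fails fast with a KeyError instead of being silently skipped, as it was by the chain's missing else branch. A hedged usage sketch of the reworked `lookup_missing_hashes` (sample hashes invented; actual results depend on what the archive holds):

    from swh.model.swhids import ObjectType  # assumed import path

    grouped_swhids = {
        ObjectType.CONTENT: [bytes.fromhex("11" * 20)],  # invented hashes
        ObjectType.REVISION: [bytes.fromhex("22" * 20)],
    }

    # `archive` here is the module patched in the diff above; its exact
    # import path is not shown, so treat this as a sketch.
    missing = archive.lookup_missing_hashes(grouped_swhids)
    # e.g. {ObjectType.CONTENT: set(), ObjectType.REVISION: {...}}
    for obj_type, hashes in missing.items():
        for h in hashes:
            print(f"{obj_type.name}: {h.hex()} not found in the archive")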
@@ -958,11 +958,11 @@ def test_lookup_missing_hashes_non_present():
     actual_result = archive.lookup_missing_hashes(grouped_swhids)
 
     assert actual_result == {
-        missing_cnt,
-        missing_dir,
-        missing_rev,
-        missing_rel,
-        missing_snp,
+        ObjectType.CONTENT: {missing_cnt},
+        ObjectType.DIRECTORY: {missing_dir},
+        ObjectType.REVISION: {missing_rev},
+        ObjectType.RELEASE: {missing_rel},
+        ObjectType.SNAPSHOT: {missing_snp},
     }
@@ -981,7 +981,11 @@ def test_lookup_missing_hashes_some_present(content, directory):
     actual_result = archive.lookup_missing_hashes(grouped_swhids)
 
-    assert actual_result == {missing_rev, missing_rel, missing_snp}
+    assert actual_result == {
+        ObjectType.REVISION: {missing_rev},
+        ObjectType.RELEASE: {missing_rel},
+        ObjectType.SNAPSHOT: {missing_snp},
+    }
 
 
 def test_lookup_origin_extra_trailing_slash(origin):