diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py index d22b46107dc8a02176b5b52240514a5ca9062b0c..9ca86d2b8e2a456fdd59642e70aea30f6eb55636 100644 --- a/swh/loader/package/loader.py +++ b/swh/loader/package/loader.py @@ -254,7 +254,7 @@ class PackageLoader(BaseLoader, Generic[TPackageInfo]): # For each extid type, call extid_get_from_extid() with all the extids of # that type, and store them in the '(type, extid) -> target' map. known_extids: Dict[PartialExtID, List[CoreSWHID]] = {} - for ((extid_type, extid_version), extids) in new_extids.items(): + for (extid_type, extid_version), extids in new_extids.items(): for extid in self.storage.extid_get_from_extid( extid_type, extids, version=extid_version ): @@ -343,7 +343,6 @@ class PackageLoader(BaseLoader, Generic[TPackageInfo]): ) } if missing_releases: - err_message = "Found ExtIDs pointing to missing releases" logger.error(err_message + ": %s", missing_releases) @@ -654,17 +653,18 @@ class PackageLoader(BaseLoader, Generic[TPackageInfo]): } new_extids: Set[ExtID] = set() + extid_to_swhid: Dict[Optional[PartialExtID], CoreSWHID] = {} tmp_releases: Dict[str, List[Tuple[str, Sha1Git]]] = { version: [] for version in versions } - for (branch_name, p_info) in packages_info: + for branch_name, p_info in packages_info: logger.debug("package_info: %s", p_info) # Check if the package was already loaded, using its ExtID swhid = self.resolve_object_from_extids( known_extids, p_info, last_snapshot_targets - ) + ) or extid_to_swhid.get(p_info.extid()) if swhid is not None and swhid.object_type == ObjectType.REVISION: # This package was already loaded, but by an older version @@ -740,6 +740,7 @@ class PackageLoader(BaseLoader, Generic[TPackageInfo]): release_swhid = CoreSWHID( object_type=ObjectType.RELEASE, object_id=release_id ) + extid_to_swhid[partial_extid] = release_swhid new_extids.add( ExtID( extid_type=extid_type, diff --git a/swh/loader/package/tests/data/https_example.org/package_example_example-v1.tar.gz b/swh/loader/package/tests/data/https_example.org/package_example_example-v1.tar.gz new file mode 120000 index 0000000000000000000000000000000000000000..ccebc093e022a3c6eb93aeafd2b3df4028ec8ce0 --- /dev/null +++ b/swh/loader/package/tests/data/https_example.org/package_example_example-v1.tar.gz @@ -0,0 +1 @@ +package_example_example-v1.0.tar.gz \ No newline at end of file diff --git a/swh/loader/package/tests/data/https_example.org/package_example_example-v2.tar.gz b/swh/loader/package/tests/data/https_example.org/package_example_example-v2.tar.gz new file mode 120000 index 0000000000000000000000000000000000000000..7efe17b1eff03089c63408df9a5af94bdd478936 --- /dev/null +++ b/swh/loader/package/tests/data/https_example.org/package_example_example-v2.tar.gz @@ -0,0 +1 @@ +package_example_example-v2.0.tar.gz \ No newline at end of file diff --git a/swh/loader/package/tests/data/https_example.org/package_example_example-v3.tar.gz b/swh/loader/package/tests/data/https_example.org/package_example_example-v3.tar.gz new file mode 120000 index 0000000000000000000000000000000000000000..1f80aa1a1ca253b0889f1af75ac4121f4b0c69cb --- /dev/null +++ b/swh/loader/package/tests/data/https_example.org/package_example_example-v3.tar.gz @@ -0,0 +1 @@ +package_example_example-v3.0.tar.gz \ No newline at end of file diff --git a/swh/loader/package/tests/data/https_example.org/package_example_example-v4.tar.gz b/swh/loader/package/tests/data/https_example.org/package_example_example-v4.tar.gz new file mode 120000 index 0000000000000000000000000000000000000000..8665dacc26bab477688f55de0b3f4a1e3dbdf372 --- /dev/null +++ b/swh/loader/package/tests/data/https_example.org/package_example_example-v4.tar.gz @@ -0,0 +1 @@ +package_example_example-v4.0.tar.gz \ No newline at end of file diff --git a/swh/loader/package/tests/test_loader.py b/swh/loader/package/tests/test_loader.py index cb9b8cbf066d51318dee6440ece5bf5befdb92d2..f02aa6b0963b4468eeadf6d0ac1da1d88d593ff3 100644 --- a/swh/loader/package/tests/test_loader.py +++ b/swh/loader/package/tests/test_loader.py @@ -6,6 +6,7 @@ import datetime import hashlib import logging +import os import string from typing import Optional from unittest.mock import Mock, call, patch @@ -107,7 +108,6 @@ class StubPackageLoader(PackageLoader[StubPackageInfo]): def test_loader_origin_visit_success(swh_storage, requests_mock_datadir): - loader = StubPackageLoader(swh_storage, ORIGIN_URL) assert loader.load() == { @@ -621,7 +621,6 @@ class StubPackageLoaderWithPackageInfoFailure(StubPackageLoader): def test_loader_origin_with_package_info_failure(swh_storage, requests_mock_datadir): - loader = StubPackageLoaderWithPackageInfoFailure(swh_storage, ORIGIN_URL) assert loader.load() == { @@ -640,7 +639,6 @@ def test_loader_origin_with_package_info_failure(swh_storage, requests_mock_data def test_loader_with_dangling_branch_in_last_snapshot( swh_storage, requests_mock_datadir ): - loader = StubPackageLoader(swh_storage, ORIGIN_URL) assert loader.load() == { @@ -662,3 +660,63 @@ def test_loader_with_dangling_branch_in_last_snapshot( "snapshot_id": "dcb9ecef64af73f2cdac7f5463cb6dece6b1db61", "status": "eventful", } + + +class StubPackageLoaderDuplicatedReleases(StubPackageLoader): + def get_versions(self): + return ["v1.0", "v1", "v2.0", "v2", "v3.0", "v3", "v4.0", "v4"] + + def get_package_info(self, version): + filename = f"example-{version}.tar.gz" + filepath = os.path.join( + os.path.dirname(__file__), + "data", + "https_example.org", + f"package_example_{filename}", + ) + with open(filepath, "rb") as file: + sha256_checksum = hashlib.sha256(file.read()) + + p_info = StubPackageInfo( + f"{ORIGIN_URL}/{filename}", + filename, + version=version, + checksums={"sha256": sha256_checksum.hexdigest()}, + ) + extid_type = "extid-stub-sha256" + + patch.object( + p_info, + "extid", + return_value=(extid_type, 1, sha256_checksum.digest()), + autospec=True, + ).start() + yield (f"branch-{version}", p_info) + + +def test_loader_with_duplicated_releases(swh_storage, requests_mock_datadir, mocker): + """Check each package release is downloaded and processed only once.""" + loader = StubPackageLoaderDuplicatedReleases(swh_storage, ORIGIN_URL) + + load_release = mocker.spy(loader, "_load_release") + + assert loader.load() == { + "status": "eventful", + "snapshot_id": "a01fb6280bedbfc3609d3b6ae06e70bf2b4186fa", + } + + # versions v{i}.0 and v{i} target the same release so load_release + # should have been called once per unique release + assert len(load_release.mock_calls) == len(loader.get_versions()) / 2 + + snasphot = loader.last_snapshot() + + # all referenced versions should be found in the snapshot + assert len(snasphot.branches) == len(loader.get_versions()) + + # check branch-v{i}.0 and branch-v{i} target the same release + for i in range(1, 5): + assert ( + snasphot.branches[f"branch-v{i}.0".encode()] + == snasphot.branches[f"branch-v{i}".encode()] + )