From 54d8c7dfea8801987ccc935d8caf8f9d34370f83 Mon Sep 17 00:00:00 2001 From: Antoine Lambert <anlambert@softwareheritage.org> Date: Mon, 6 Nov 2023 15:19:07 +0100 Subject: [PATCH] package: Avoid processing duplicated package release multiple times It might exist cases where multiple versions of a package target the same release object. For instance a rpm package has one specific version for each distribution release but they can target the same intrinsic version and source package contents are exactly the same. So avoid downloading and processing a package version if the corresponding extid has already been encountered during the current loading by maintaining a mapping between extids and release swhids. --- swh/loader/package/loader.py | 9 +-- .../package_example_example-v1.tar.gz | 1 + .../package_example_example-v2.tar.gz | 1 + .../package_example_example-v3.tar.gz | 1 + .../package_example_example-v4.tar.gz | 1 + swh/loader/package/tests/test_loader.py | 64 ++++++++++++++++++- 6 files changed, 70 insertions(+), 7 deletions(-) create mode 120000 swh/loader/package/tests/data/https_example.org/package_example_example-v1.tar.gz create mode 120000 swh/loader/package/tests/data/https_example.org/package_example_example-v2.tar.gz create mode 120000 swh/loader/package/tests/data/https_example.org/package_example_example-v3.tar.gz create mode 120000 swh/loader/package/tests/data/https_example.org/package_example_example-v4.tar.gz diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py index d22b4610..9ca86d2b 100644 --- a/swh/loader/package/loader.py +++ b/swh/loader/package/loader.py @@ -254,7 +254,7 @@ class PackageLoader(BaseLoader, Generic[TPackageInfo]): # For each extid type, call extid_get_from_extid() with all the extids of # that type, and store them in the '(type, extid) -> target' map. known_extids: Dict[PartialExtID, List[CoreSWHID]] = {} - for ((extid_type, extid_version), extids) in new_extids.items(): + for (extid_type, extid_version), extids in new_extids.items(): for extid in self.storage.extid_get_from_extid( extid_type, extids, version=extid_version ): @@ -343,7 +343,6 @@ class PackageLoader(BaseLoader, Generic[TPackageInfo]): ) } if missing_releases: - err_message = "Found ExtIDs pointing to missing releases" logger.error(err_message + ": %s", missing_releases) @@ -654,17 +653,18 @@ class PackageLoader(BaseLoader, Generic[TPackageInfo]): } new_extids: Set[ExtID] = set() + extid_to_swhid: Dict[Optional[PartialExtID], CoreSWHID] = {} tmp_releases: Dict[str, List[Tuple[str, Sha1Git]]] = { version: [] for version in versions } - for (branch_name, p_info) in packages_info: + for branch_name, p_info in packages_info: logger.debug("package_info: %s", p_info) # Check if the package was already loaded, using its ExtID swhid = self.resolve_object_from_extids( known_extids, p_info, last_snapshot_targets - ) + ) or extid_to_swhid.get(p_info.extid()) if swhid is not None and swhid.object_type == ObjectType.REVISION: # This package was already loaded, but by an older version @@ -740,6 +740,7 @@ class PackageLoader(BaseLoader, Generic[TPackageInfo]): release_swhid = CoreSWHID( object_type=ObjectType.RELEASE, object_id=release_id ) + extid_to_swhid[partial_extid] = release_swhid new_extids.add( ExtID( extid_type=extid_type, diff --git a/swh/loader/package/tests/data/https_example.org/package_example_example-v1.tar.gz b/swh/loader/package/tests/data/https_example.org/package_example_example-v1.tar.gz new file mode 120000 index 00000000..ccebc093 --- /dev/null +++ b/swh/loader/package/tests/data/https_example.org/package_example_example-v1.tar.gz @@ -0,0 +1 @@ +package_example_example-v1.0.tar.gz \ No newline at end of file diff --git a/swh/loader/package/tests/data/https_example.org/package_example_example-v2.tar.gz b/swh/loader/package/tests/data/https_example.org/package_example_example-v2.tar.gz new file mode 120000 index 00000000..7efe17b1 --- /dev/null +++ b/swh/loader/package/tests/data/https_example.org/package_example_example-v2.tar.gz @@ -0,0 +1 @@ +package_example_example-v2.0.tar.gz \ No newline at end of file diff --git a/swh/loader/package/tests/data/https_example.org/package_example_example-v3.tar.gz b/swh/loader/package/tests/data/https_example.org/package_example_example-v3.tar.gz new file mode 120000 index 00000000..1f80aa1a --- /dev/null +++ b/swh/loader/package/tests/data/https_example.org/package_example_example-v3.tar.gz @@ -0,0 +1 @@ +package_example_example-v3.0.tar.gz \ No newline at end of file diff --git a/swh/loader/package/tests/data/https_example.org/package_example_example-v4.tar.gz b/swh/loader/package/tests/data/https_example.org/package_example_example-v4.tar.gz new file mode 120000 index 00000000..8665dacc --- /dev/null +++ b/swh/loader/package/tests/data/https_example.org/package_example_example-v4.tar.gz @@ -0,0 +1 @@ +package_example_example-v4.0.tar.gz \ No newline at end of file diff --git a/swh/loader/package/tests/test_loader.py b/swh/loader/package/tests/test_loader.py index cb9b8cbf..f02aa6b0 100644 --- a/swh/loader/package/tests/test_loader.py +++ b/swh/loader/package/tests/test_loader.py @@ -6,6 +6,7 @@ import datetime import hashlib import logging +import os import string from typing import Optional from unittest.mock import Mock, call, patch @@ -107,7 +108,6 @@ class StubPackageLoader(PackageLoader[StubPackageInfo]): def test_loader_origin_visit_success(swh_storage, requests_mock_datadir): - loader = StubPackageLoader(swh_storage, ORIGIN_URL) assert loader.load() == { @@ -621,7 +621,6 @@ class StubPackageLoaderWithPackageInfoFailure(StubPackageLoader): def test_loader_origin_with_package_info_failure(swh_storage, requests_mock_datadir): - loader = StubPackageLoaderWithPackageInfoFailure(swh_storage, ORIGIN_URL) assert loader.load() == { @@ -640,7 +639,6 @@ def test_loader_origin_with_package_info_failure(swh_storage, requests_mock_data def test_loader_with_dangling_branch_in_last_snapshot( swh_storage, requests_mock_datadir ): - loader = StubPackageLoader(swh_storage, ORIGIN_URL) assert loader.load() == { @@ -662,3 +660,63 @@ def test_loader_with_dangling_branch_in_last_snapshot( "snapshot_id": "dcb9ecef64af73f2cdac7f5463cb6dece6b1db61", "status": "eventful", } + + +class StubPackageLoaderDuplicatedReleases(StubPackageLoader): + def get_versions(self): + return ["v1.0", "v1", "v2.0", "v2", "v3.0", "v3", "v4.0", "v4"] + + def get_package_info(self, version): + filename = f"example-{version}.tar.gz" + filepath = os.path.join( + os.path.dirname(__file__), + "data", + "https_example.org", + f"package_example_{filename}", + ) + with open(filepath, "rb") as file: + sha256_checksum = hashlib.sha256(file.read()) + + p_info = StubPackageInfo( + f"{ORIGIN_URL}/{filename}", + filename, + version=version, + checksums={"sha256": sha256_checksum.hexdigest()}, + ) + extid_type = "extid-stub-sha256" + + patch.object( + p_info, + "extid", + return_value=(extid_type, 1, sha256_checksum.digest()), + autospec=True, + ).start() + yield (f"branch-{version}", p_info) + + +def test_loader_with_duplicated_releases(swh_storage, requests_mock_datadir, mocker): + """Check each package release is downloaded and processed only once.""" + loader = StubPackageLoaderDuplicatedReleases(swh_storage, ORIGIN_URL) + + load_release = mocker.spy(loader, "_load_release") + + assert loader.load() == { + "status": "eventful", + "snapshot_id": "a01fb6280bedbfc3609d3b6ae06e70bf2b4186fa", + } + + # versions v{i}.0 and v{i} target the same release so load_release + # should have been called once per unique release + assert len(load_release.mock_calls) == len(loader.get_versions()) / 2 + + snasphot = loader.last_snapshot() + + # all referenced versions should be found in the snapshot + assert len(snasphot.branches) == len(loader.get_versions()) + + # check branch-v{i}.0 and branch-v{i} target the same release + for i in range(1, 5): + assert ( + snasphot.branches[f"branch-v{i}.0".encode()] + == snasphot.branches[f"branch-v{i}".encode()] + ) -- GitLab