diff --git a/swh/loader/core/loader.py b/swh/loader/core/loader.py index dfb28b7a52c8894134a807af9cdebd0dadb4a2e1..244cfaae820ae2378ad8a4f915491dd077306b8f 100644 --- a/swh/loader/core/loader.py +++ b/swh/loader/core/loader.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2023 The Software Heritage developers +# Copyright (C) 2015-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -654,6 +654,10 @@ class NodeLoader(BaseLoader, ABC): """ + # Bump version when incompatible changes occur: + # - 20240215: Tarball directory from a leaf class changed the tarball ingestion + extid_version = 1 + def __init__( self, storage: StorageInterface, @@ -750,6 +754,7 @@ class NodeLoader(BaseLoader, ABC): extid_type=extid_type % hash_algo, extid=extid, target=node.swhid(), + extid_version=self.extid_version, ) for hash_algo, extid in checksums.items() } @@ -1140,15 +1145,16 @@ class TarballDirectoryLoader(BaseDirectoryLoader): if directory_path: found_directory_path = True + # Yield the top-level directory as-is + yield directory_path + # If there is a mismatch between the computed NAR hash and the one + # we should obtain, retry its computation by not including single + # top level directory if there is such a layout (as nix does). # Check whether a top-level directory exists listing = list(directory_path.iterdir()) if len(listing) == 1: - # Top-level directory, we provide it - dir_path = listing[0] - else: - # otherwise, provide the directory_path as is - dir_path = directory_path - yield dir_path + # Top-level directory exists, we provide it, nix depends on it + yield listing[0] # To catch 'standard' hash mismatch issues raise by the 'download' method. if not found_directory_path and errors: diff --git a/swh/loader/core/tests/conftest.py b/swh/loader/core/tests/conftest.py index 4bcf2d6d451e34ec63d870015739ab01ae60c6bb..771ad00aa4f8573d089ffabce539923a63f717c5 100644 --- a/swh/loader/core/tests/conftest.py +++ b/swh/loader/core/tests/conftest.py @@ -58,9 +58,23 @@ def tarball_with_std_hashes(tarball_path): def tarball_with_nar_hashes(tarball_path): nar_hashes = compute_nar_hashes(tarball_path, ["sha256"]) # Ensure it's the same hash as the initial one computed from the cli + + assert ( + nar_hashes["sha256"] + == "45db8a27ccfae60b5233003c54c2d6b5ed6f0a1299dd9bbebc8f06cf649bc9c0" + ) + return (tarball_path, nar_hashes) + + +@pytest.fixture +def tarball_with_nar_hashes_first_directory(tarball_path): + # Compute the nar hashes from the first directory inside the tarball + nar_hashes = compute_nar_hashes(tarball_path, ["sha256"], top_level=False) + # Ensure it's the same hash as the initial one computed from the cli + assert ( nar_hashes["sha256"] - == "23fb1fe278aeb2de899f7d7f10cf892f63136cea2c07146da2200da4de54b7e4" + == "45db8a27ccfae60b5233003c54c2d6b5ed6f0a1299dd9bbebc8f06cf649bc9c0" ) return (tarball_path, nar_hashes) @@ -71,7 +85,7 @@ def tarball_with_executable_with_nar_hashes(tarball_with_executable_path): # Ensure it's the same hash as the initial one computed from the cli assert ( nar_hashes["sha256"] - == "1d3407e5ad740331f928c2a864c7a8e0796f9da982a858c151c9b77506ec10a8" + == "2c2b619d2dc235bff286762550c7f86eb34c9f88ec83a8ae426d75604d3a815b" ) return (tarball_with_executable_path, nar_hashes) diff --git a/swh/loader/core/tests/test_loader.py b/swh/loader/core/tests/test_loader.py index e207a09811c33b68b3cf9221962853ff4261c16a..42652aba6f785a68df3ddb07b16b01745e7f71b8 100644 --- a/swh/loader/core/tests/test_loader.py +++ b/swh/loader/core/tests/test_loader.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2023 The Software Heritage developers +# Copyright (C) 2018-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -453,6 +453,18 @@ def test_content_loader_missing_field(swh_storage): ContentLoader(swh_storage, origin.url) +@pytest.mark.parametrize("loader_class", [ContentLoader, TarballDirectoryLoader]) +def test_loader_extid_version(swh_storage, loader_class, content_path): + """Ensure the extid_version is the expected one""" + unknown_origin = Origin(f"{CONTENT_MIRROR}/project/asdf/archives/unknown.lisp") + loader = loader_class( + swh_storage, + unknown_origin.url, + checksums=compute_hashes(content_path), + ) + assert loader.extid_version == 1 + + @pytest.mark.parametrize("loader_class", [ContentLoader, TarballDirectoryLoader]) def test_node_loader_missing_field(swh_storage, loader_class): """It should raise if the ContentLoader is missing checksums field""" @@ -578,7 +590,9 @@ def test_content_loader_ok_simple( assert result == {"status": "eventful"} - extids = fetch_extids_from_checksums(loader.storage, checksum_layout, checksums) + extids = fetch_extids_from_checksums( + loader.storage, checksum_layout, checksums, extid_version=loader.extid_version + ) assert len(extids) == len(checksums) visit_status = assert_last_visit_matches( @@ -803,7 +817,77 @@ def test_directory_loader_ok_simple( assert result == {"status": "eventful"} - extids = fetch_extids_from_checksums(loader.storage, checksum_layout, checksums) + extids = fetch_extids_from_checksums( + loader.storage, checksum_layout, checksums, extid_version=loader.extid_version + ) + assert len(extids) == len(checksums) + + visit_status = assert_last_visit_matches( + swh_storage, origin.url, status="full", type="tarball-directory" + ) + assert visit_status.snapshot is not None + + result2 = loader.load() + + assert result2 == {"status": "uneventful"} + + +def test_directory_loader_nar_hash_without_top_level_dir(swh_storage, tarball_path): + """It should be an eventful visit when the expected NAR hash omits tarball top level + directory in its computation.""" + + origin = Origin(f"file://{tarball_path}") + + expected_nar_checksums = { + "sha256": "23fb1fe278aeb2de899f7d7f10cf892f63136cea2c07146da2200da4de54b7e4" + } + + loader = TarballDirectoryLoader( + swh_storage, + origin.url, + checksums=expected_nar_checksums, + checksum_layout="nar", + ) + result = loader.load() + + assert result == {"status": "eventful"} + + +@pytest.mark.parametrize("tarball_top_level_directory", [True, False]) +def test_directory_loader_nar_hash_tarball_with_or_without_top_level( + swh_storage, requests_mock_datadir, tarball_path, tarball_top_level_directory +): + """It should be eventful to ingest tarball when the expected NAR hash omits tarball + top level directory in its computation (as nixpkgs computes the nar on tarball) or + when the top level directory is provided directly (as guix computes the nar on + tarball) + + """ + origin = Origin(DIRECTORY_URL) + # Compute nar hashes out of: + # - the uncompressed directory top-level (top_level=True) + # - the first directory (top_level=False) within the tarball + # In any case, the ingestion should be ok + checksums = compute_nar_hashes( + tarball_path, + ["sha1", "sha256", "sha512"], + is_tarball=True, + top_level=tarball_top_level_directory, + ) + + loader = TarballDirectoryLoader( + swh_storage, + origin.url, + checksums=checksums, + checksum_layout="nar", + ) + result = loader.load() + + assert result == {"status": "eventful"} + + extids = fetch_extids_from_checksums( + loader.storage, "nar", checksums, extid_version=loader.extid_version + ) assert len(extids) == len(checksums) visit_status = assert_last_visit_matches( diff --git a/swh/loader/core/tests/test_nar.py b/swh/loader/core/tests/test_nar.py index 817d5bd94f83ed7f90239b6001590501a345279e..f9c1aa7ec31cff75891e82dbbd3195f4616a6b4f 100644 --- a/swh/loader/core/tests/test_nar.py +++ b/swh/loader/core/tests/test_nar.py @@ -1,4 +1,4 @@ -# Copyright (C) 2023 The Software Heritage developers +# Copyright (C) 2023-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -18,10 +18,9 @@ def test_nar_tarball(tmpdir, tarball_with_nar_hashes): directory_path = Path(tmpdir) directory_path.mkdir(parents=True, exist_ok=True) uncompress(str(tarball_path), dest=str(directory_path)) - path_on_disk = next(directory_path.iterdir()) nar = Nar(hash_names=list(nar_hashes.keys())) - nar.serialize(path_on_disk) + nar.serialize(directory_path) assert nar.hexdigest() == nar_hashes @@ -32,10 +31,9 @@ def test_nar_tarball_with_executable(tmpdir, tarball_with_executable_with_nar_ha directory_path = Path(tmpdir) directory_path.mkdir(parents=True, exist_ok=True) uncompress(str(tarball_path), dest=str(directory_path)) - path_on_disk = next(directory_path.iterdir()) nar = Nar(hash_names=list(nar_hashes.keys())) - nar.serialize(path_on_disk) + nar.serialize(directory_path) assert nar.hexdigest() == nar_hashes @@ -135,11 +133,10 @@ def test_nar_cli_tarball(cli_runner, cli_nar, tmpdir, tarball_with_nar_hashes): directory_path = Path(tmpdir) directory_path.mkdir(parents=True, exist_ok=True) uncompress(str(tarball_path), dest=str(directory_path)) - path_on_disk = next(directory_path.iterdir()) assert list(nar_hashes.keys()) == ["sha256"] - result = cli_runner.invoke(cli_nar, ["--hash-algo", "sha256", str(path_on_disk)]) + result = cli_runner.invoke(cli_nar, ["--hash-algo", "sha256", str(directory_path)]) assert result.exit_code == 0 assert_output_contains(result.output, nar_hashes["sha256"]) diff --git a/swh/loader/core/utils.py b/swh/loader/core/utils.py index d7bc64e5136f6f0e028bf589ce8092dc770b4ccc..170d13841c052dcb9988dab6166c95f6c7a8b3e4 100644 --- a/swh/loader/core/utils.py +++ b/swh/loader/core/utils.py @@ -136,6 +136,7 @@ def compute_nar_hashes( filepath: Path, hash_names: List[str] = ["sha256"], is_tarball=True, + top_level=True, ) -> Dict[str, str]: """Compute nar checksums dict out of a filepath (tarball or plain file). @@ -146,6 +147,8 @@ def compute_nar_hashes( filepath: The tarball (if is_tarball is True) or a filepath hash_names: The list of checksums to compute is_tarball: Whether filepath represents a tarball or not + top_level: Whether we want to compute the top-level directory (of the tarball) + hashes. This is only useful when used with 'is_tarball' at True. Returns: The dict of checksums values whose keys are present in hash_names. @@ -156,7 +159,13 @@ def compute_nar_hashes( directory_path = Path(tmpdir) directory_path.mkdir(parents=True, exist_ok=True) uncompress(str(filepath), dest=str(directory_path)) - path_on_disk = next(directory_path.iterdir()) + + if top_level: + # Default behavior, pass the extracted tarball path root directory + path_on_disk = directory_path + else: + # Pass along the first directory of the tarball + path_on_disk = next(iter(directory_path.iterdir())) else: path_on_disk = filepath diff --git a/swh/loader/tests/__init__.py b/swh/loader/tests/__init__.py index 1a951b0d69aa84dbe304ce94453da7fc0cbde0bc..0046cfeef48797ef01d59602b021478fb26f64f4 100644 --- a/swh/loader/tests/__init__.py +++ b/swh/loader/tests/__init__.py @@ -264,7 +264,10 @@ def get_stats(storage) -> Dict: def fetch_extids_from_checksums( - storage: StorageInterface, checksum_layout: str, checksums: Dict[str, str] + storage: StorageInterface, + checksum_layout: str, + checksums: Dict[str, str], + extid_version: int = 0, ) -> List[ExtID]: extids = [] extid_type = None @@ -272,7 +275,6 @@ def fetch_extids_from_checksums( extid_type = "nar-%s" elif checksum_layout == "standard": extid_type = "checksum-%s" - extid_version = 0 if extid_type: for hash_algo, checksum in checksums.items():