From c9b51f8b94af3805151f1f049760726c72f2bc1c Mon Sep 17 00:00:00 2001
From: "Antoine R. Dumont (@ardumont)" <ardumont@softwareheritage.org>
Date: Thu, 15 Feb 2024 15:34:55 +0100
Subject: [PATCH] TarballDirectoryLoader: Ingest tarball like package loaders

This now matches the tarball loader behavior (top-level directory included
[1]). This also matches what's expected by the guix dataset.

As the nix hashes computed are done from the first directory included in the
tarball though, we must also provide that directory. That way, the hashes
checks done during ingestion can match appropriately. That was the initial
implementation.

In terms of data, as this will change the visit snapshot and the extid
mappings, the core loaders (NodeLoader, ...) now declares an extid_version
bumped to 1 (it was 0 by default). Which means that all extid mappings will be
recomputed.

[1] https://gitlab.softwareheritage.org/swh/devel/swh-loader-core/-/blob/master/swh/loader/package/loader.py?ref_type=heads#L829-837

Refs. swh/infra/sysadm-environment#5222
---
 swh/loader/core/loader.py            | 20 ++++---
 swh/loader/core/tests/conftest.py    | 18 +++++-
 swh/loader/core/tests/test_loader.py | 90 +++++++++++++++++++++++++++-
 swh/loader/core/tests/test_nar.py    | 11 ++--
 swh/loader/core/utils.py             | 11 +++-
 swh/loader/tests/__init__.py         |  6 +-
 6 files changed, 134 insertions(+), 22 deletions(-)

diff --git a/swh/loader/core/loader.py b/swh/loader/core/loader.py
index dfb28b7a..244cfaae 100644
--- a/swh/loader/core/loader.py
+++ b/swh/loader/core/loader.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2023 The Software Heritage developers
+# Copyright (C) 2015-2024 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -654,6 +654,10 @@ class NodeLoader(BaseLoader, ABC):
 
     """
 
+    # Bump version when incompatible changes occur:
+    # - 20240215: Tarball directory from a leaf class changed the tarball ingestion
+    extid_version = 1
+
     def __init__(
         self,
         storage: StorageInterface,
@@ -750,6 +754,7 @@ class NodeLoader(BaseLoader, ABC):
                     extid_type=extid_type % hash_algo,
                     extid=extid,
                     target=node.swhid(),
+                    extid_version=self.extid_version,
                 )
                 for hash_algo, extid in checksums.items()
             }
@@ -1140,15 +1145,16 @@ class TarballDirectoryLoader(BaseDirectoryLoader):
 
                 if directory_path:
                     found_directory_path = True
+                    # Yield the top-level directory as-is
+                    yield directory_path
+                    # If there is a mismatch between the computed NAR hash and the one
+                    # we should obtain, retry its computation by not including single
+                    # top level directory if there is such a layout (as nix does).
                     # Check whether a top-level directory exists
                     listing = list(directory_path.iterdir())
                     if len(listing) == 1:
-                        # Top-level directory, we provide it
-                        dir_path = listing[0]
-                    else:
-                        # otherwise, provide the directory_path as is
-                        dir_path = directory_path
-                    yield dir_path
+                        # Top-level directory exists, we provide it, nix depends on it
+                        yield listing[0]
 
         # To catch 'standard' hash mismatch issues raise by the 'download' method.
         if not found_directory_path and errors:
diff --git a/swh/loader/core/tests/conftest.py b/swh/loader/core/tests/conftest.py
index 4bcf2d6d..771ad00a 100644
--- a/swh/loader/core/tests/conftest.py
+++ b/swh/loader/core/tests/conftest.py
@@ -58,9 +58,23 @@ def tarball_with_std_hashes(tarball_path):
 def tarball_with_nar_hashes(tarball_path):
     nar_hashes = compute_nar_hashes(tarball_path, ["sha256"])
     # Ensure it's the same hash as the initial one computed from the cli
+
+    assert (
+        nar_hashes["sha256"]
+        == "45db8a27ccfae60b5233003c54c2d6b5ed6f0a1299dd9bbebc8f06cf649bc9c0"
+    )
+    return (tarball_path, nar_hashes)
+
+
+@pytest.fixture
+def tarball_with_nar_hashes_first_directory(tarball_path):
+    # Compute the nar hashes from the first directory inside the tarball
+    nar_hashes = compute_nar_hashes(tarball_path, ["sha256"], top_level=False)
+    # Ensure it's the same hash as the initial one computed from the cli
+
     assert (
         nar_hashes["sha256"]
-        == "23fb1fe278aeb2de899f7d7f10cf892f63136cea2c07146da2200da4de54b7e4"
+        == "45db8a27ccfae60b5233003c54c2d6b5ed6f0a1299dd9bbebc8f06cf649bc9c0"
     )
     return (tarball_path, nar_hashes)
 
@@ -71,7 +85,7 @@ def tarball_with_executable_with_nar_hashes(tarball_with_executable_path):
     # Ensure it's the same hash as the initial one computed from the cli
     assert (
         nar_hashes["sha256"]
-        == "1d3407e5ad740331f928c2a864c7a8e0796f9da982a858c151c9b77506ec10a8"
+        == "2c2b619d2dc235bff286762550c7f86eb34c9f88ec83a8ae426d75604d3a815b"
     )
     return (tarball_with_executable_path, nar_hashes)
 
diff --git a/swh/loader/core/tests/test_loader.py b/swh/loader/core/tests/test_loader.py
index e207a098..42652aba 100644
--- a/swh/loader/core/tests/test_loader.py
+++ b/swh/loader/core/tests/test_loader.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2023  The Software Heritage developers
+# Copyright (C) 2018-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -453,6 +453,18 @@ def test_content_loader_missing_field(swh_storage):
         ContentLoader(swh_storage, origin.url)
 
 
+@pytest.mark.parametrize("loader_class", [ContentLoader, TarballDirectoryLoader])
+def test_loader_extid_version(swh_storage, loader_class, content_path):
+    """Ensure the extid_version is the expected one"""
+    unknown_origin = Origin(f"{CONTENT_MIRROR}/project/asdf/archives/unknown.lisp")
+    loader = loader_class(
+        swh_storage,
+        unknown_origin.url,
+        checksums=compute_hashes(content_path),
+    )
+    assert loader.extid_version == 1
+
+
 @pytest.mark.parametrize("loader_class", [ContentLoader, TarballDirectoryLoader])
 def test_node_loader_missing_field(swh_storage, loader_class):
     """It should raise if the ContentLoader is missing checksums field"""
@@ -578,7 +590,9 @@ def test_content_loader_ok_simple(
 
     assert result == {"status": "eventful"}
 
-    extids = fetch_extids_from_checksums(loader.storage, checksum_layout, checksums)
+    extids = fetch_extids_from_checksums(
+        loader.storage, checksum_layout, checksums, extid_version=loader.extid_version
+    )
     assert len(extids) == len(checksums)
 
     visit_status = assert_last_visit_matches(
@@ -803,7 +817,77 @@ def test_directory_loader_ok_simple(
 
     assert result == {"status": "eventful"}
 
-    extids = fetch_extids_from_checksums(loader.storage, checksum_layout, checksums)
+    extids = fetch_extids_from_checksums(
+        loader.storage, checksum_layout, checksums, extid_version=loader.extid_version
+    )
+    assert len(extids) == len(checksums)
+
+    visit_status = assert_last_visit_matches(
+        swh_storage, origin.url, status="full", type="tarball-directory"
+    )
+    assert visit_status.snapshot is not None
+
+    result2 = loader.load()
+
+    assert result2 == {"status": "uneventful"}
+
+
+def test_directory_loader_nar_hash_without_top_level_dir(swh_storage, tarball_path):
+    """It should be an eventful visit when the expected NAR hash omits tarball top level
+    directory in its computation."""
+
+    origin = Origin(f"file://{tarball_path}")
+
+    expected_nar_checksums = {
+        "sha256": "23fb1fe278aeb2de899f7d7f10cf892f63136cea2c07146da2200da4de54b7e4"
+    }
+
+    loader = TarballDirectoryLoader(
+        swh_storage,
+        origin.url,
+        checksums=expected_nar_checksums,
+        checksum_layout="nar",
+    )
+    result = loader.load()
+
+    assert result == {"status": "eventful"}
+
+
+@pytest.mark.parametrize("tarball_top_level_directory", [True, False])
+def test_directory_loader_nar_hash_tarball_with_or_without_top_level(
+    swh_storage, requests_mock_datadir, tarball_path, tarball_top_level_directory
+):
+    """It should be eventful to ingest tarball when the expected NAR hash omits tarball
+    top level directory in its computation (as nixpkgs computes the nar on tarball) or
+    when the top level directory is provided directly (as guix computes the nar on
+    tarball)
+
+    """
+    origin = Origin(DIRECTORY_URL)
+    # Compute nar hashes out of:
+    # - the uncompressed directory top-level (top_level=True)
+    # - the first directory (top_level=False) within the tarball
+    # In any case, the ingestion should be ok
+    checksums = compute_nar_hashes(
+        tarball_path,
+        ["sha1", "sha256", "sha512"],
+        is_tarball=True,
+        top_level=tarball_top_level_directory,
+    )
+
+    loader = TarballDirectoryLoader(
+        swh_storage,
+        origin.url,
+        checksums=checksums,
+        checksum_layout="nar",
+    )
+    result = loader.load()
+
+    assert result == {"status": "eventful"}
+
+    extids = fetch_extids_from_checksums(
+        loader.storage, "nar", checksums, extid_version=loader.extid_version
+    )
     assert len(extids) == len(checksums)
 
     visit_status = assert_last_visit_matches(
diff --git a/swh/loader/core/tests/test_nar.py b/swh/loader/core/tests/test_nar.py
index 817d5bd9..f9c1aa7e 100644
--- a/swh/loader/core/tests/test_nar.py
+++ b/swh/loader/core/tests/test_nar.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023  The Software Heritage developers
+# Copyright (C) 2023-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -18,10 +18,9 @@ def test_nar_tarball(tmpdir, tarball_with_nar_hashes):
     directory_path = Path(tmpdir)
     directory_path.mkdir(parents=True, exist_ok=True)
     uncompress(str(tarball_path), dest=str(directory_path))
-    path_on_disk = next(directory_path.iterdir())
 
     nar = Nar(hash_names=list(nar_hashes.keys()))
-    nar.serialize(path_on_disk)
+    nar.serialize(directory_path)
     assert nar.hexdigest() == nar_hashes
 
 
@@ -32,10 +31,9 @@ def test_nar_tarball_with_executable(tmpdir, tarball_with_executable_with_nar_ha
     directory_path = Path(tmpdir)
     directory_path.mkdir(parents=True, exist_ok=True)
     uncompress(str(tarball_path), dest=str(directory_path))
-    path_on_disk = next(directory_path.iterdir())
 
     nar = Nar(hash_names=list(nar_hashes.keys()))
-    nar.serialize(path_on_disk)
+    nar.serialize(directory_path)
     assert nar.hexdigest() == nar_hashes
 
 
@@ -135,11 +133,10 @@ def test_nar_cli_tarball(cli_runner, cli_nar, tmpdir, tarball_with_nar_hashes):
     directory_path = Path(tmpdir)
     directory_path.mkdir(parents=True, exist_ok=True)
     uncompress(str(tarball_path), dest=str(directory_path))
-    path_on_disk = next(directory_path.iterdir())
 
     assert list(nar_hashes.keys()) == ["sha256"]
 
-    result = cli_runner.invoke(cli_nar, ["--hash-algo", "sha256", str(path_on_disk)])
+    result = cli_runner.invoke(cli_nar, ["--hash-algo", "sha256", str(directory_path)])
 
     assert result.exit_code == 0
     assert_output_contains(result.output, nar_hashes["sha256"])
diff --git a/swh/loader/core/utils.py b/swh/loader/core/utils.py
index d7bc64e5..170d1384 100644
--- a/swh/loader/core/utils.py
+++ b/swh/loader/core/utils.py
@@ -136,6 +136,7 @@ def compute_nar_hashes(
     filepath: Path,
     hash_names: List[str] = ["sha256"],
     is_tarball=True,
+    top_level=True,
 ) -> Dict[str, str]:
     """Compute nar checksums dict out of a filepath (tarball or plain file).
 
@@ -146,6 +147,8 @@ def compute_nar_hashes(
         filepath: The tarball (if is_tarball is True) or a filepath
         hash_names: The list of checksums to compute
         is_tarball: Whether filepath represents a tarball or not
+        top_level: Whether we want to compute the top-level directory (of the tarball)
+            hashes. This is only useful when used with 'is_tarball' at True.
 
     Returns:
         The dict of checksums values whose keys are present in hash_names.
@@ -156,7 +159,13 @@ def compute_nar_hashes(
             directory_path = Path(tmpdir)
             directory_path.mkdir(parents=True, exist_ok=True)
             uncompress(str(filepath), dest=str(directory_path))
-            path_on_disk = next(directory_path.iterdir())
+
+            if top_level:
+                # Default behavior, pass the extracted tarball path root directory
+                path_on_disk = directory_path
+            else:
+                # Pass along the first directory of the tarball
+                path_on_disk = next(iter(directory_path.iterdir()))
         else:
             path_on_disk = filepath
 
diff --git a/swh/loader/tests/__init__.py b/swh/loader/tests/__init__.py
index 1a951b0d..0046cfee 100644
--- a/swh/loader/tests/__init__.py
+++ b/swh/loader/tests/__init__.py
@@ -264,7 +264,10 @@ def get_stats(storage) -> Dict:
 
 
 def fetch_extids_from_checksums(
-    storage: StorageInterface, checksum_layout: str, checksums: Dict[str, str]
+    storage: StorageInterface,
+    checksum_layout: str,
+    checksums: Dict[str, str],
+    extid_version: int = 0,
 ) -> List[ExtID]:
     extids = []
     extid_type = None
@@ -272,7 +275,6 @@ def fetch_extids_from_checksums(
         extid_type = "nar-%s"
     elif checksum_layout == "standard":
         extid_type = "checksum-%s"
-    extid_version = 0
 
     if extid_type:
         for hash_algo, checksum in checksums.items():
-- 
GitLab