From fdeb086f7743498744ca2aa95a5c1c1e658c811e Mon Sep 17 00:00:00 2001
From: Antoine Lambert <anlambert@softwareheritage.org>
Date: Tue, 12 Mar 2024 17:53:15 +0100
Subject: [PATCH] nixguix: Handle creation of svn-export visit types on svn
 sub-trees

Some Guix packages correspond to exports of a subset of a Subversion
source tree at a given revision, typically the TeX Live ones.

In that case, we must pass an extra parameter to the svn-export loader
to specify the sub-paths to export, and also use a unique origin URL
for each package to archive; otherwise the same URL would be used for
all of them and only a single package would be archived.

Related to swh/infra/sysadm-environment#5263.
---
 swh/lister/nixguix/lister.py                  | 19 +++++++-
 .../nixguix/tests/data/sources-texlive.json   | 44 +++++++++++++++++++
 swh/lister/nixguix/tests/test_lister.py       | 34 ++++++++++++++
 3 files changed, 96 insertions(+), 1 deletion(-)
 create mode 100644 swh/lister/nixguix/tests/data/sources-texlive.json

diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py
index f20c9020..89bf4a84 100644
--- a/swh/lister/nixguix/lister.py
+++ b/swh/lister/nixguix/lister.py
@@ -113,6 +113,9 @@ class Artifact:
     """Optional reference on the artifact (git commit, branch, svn commit, tag, ...)"""
     submodules: bool
     """Indicates if submodules should be retrieved for a git-checkout visit type"""
+    svn_paths: Optional[List[str]]
+    """Optional list of paths for the svn-export loader, only those will be exported
+    and loaded into the archive"""
 
 
 @dataclass
@@ -472,14 +475,23 @@ class NixGuixLister(StatelessLister[PageResult]):
                     if not checksums:
                         continue
 
+                    origin_url = plain_url
+                    svn_paths = artifact.get("svn_files")
+                    if svn_paths:
+                        # as multiple svn-export visit types can use the same base svn URL
+                        # we modify the origin URL to ensure it is unique by appending the
+                        # NAR hash value as a query parameter
+                        origin_url += f"?nar={integrity}"
+
                     yield ArtifactType.ARTIFACT, Artifact(
-                        origin=plain_url,
+                        origin=origin_url,
                         fallback_urls=[],
                         checksums=checksums,
                         checksum_layout=MAPPING_CHECKSUM_LAYOUT[outputHashMode],
                         visit_type=VCS_ARTIFACT_TYPE_TO_VISIT_TYPE[artifact_type],
                         ref=plain_ref,
                         submodules=artifact.get("submodule", False),
+                        svn_paths=svn_paths,
                     )
 
             elif artifact_type == "url":
@@ -614,6 +626,7 @@ class NixGuixLister(StatelessLister[PageResult]):
                     visit_type="tarball-directory" if is_tar else "content",
                     ref=None,
                     submodules=False,
+                    svn_paths=None,
                 )
             else:
                 logger.warning(
@@ -644,6 +657,10 @@ class NixGuixLister(StatelessLister[PageResult]):
             loader_arguments["ref"] = artifact.ref
         if artifact.submodules:
             loader_arguments["submodules"] = artifact.submodules
+        if artifact.svn_paths:
+            # extract the base svn url from the modified origin URL (see get_pages method)
+            loader_arguments["svn_url"] = artifact.origin.rsplit("?", maxsplit=1)[0]
+            loader_arguments["svn_paths"] = artifact.svn_paths
         yield ListedOrigin(
             lister_id=self.lister_obj.id,
             url=artifact.origin,
diff --git a/swh/lister/nixguix/tests/data/sources-texlive.json b/swh/lister/nixguix/tests/data/sources-texlive.json
new file mode 100644
index 00000000..4894db78
--- /dev/null
+++ b/swh/lister/nixguix/tests/data/sources-texlive.json
@@ -0,0 +1,44 @@
+{
+  "sources": [
+    {
+      "type": "svn",
+      "svn_url": "svn://www.tug.org/texlive/tags/texlive-2023.0/Master/texmf-dist/",
+      "svn_files": [
+        "bibtex/bib/oberdiek/",
+        "doc/latex/oberdiek/",
+        "source/latex/oberdiek/",
+        "tex/generic/oberdiek/",
+        "tex/latex/oberdiek/"
+      ],
+      "integrity": "sha256-n9ZrKjR0JYOsbFtKby7UWykYjVY0f1hgInyR3DNbpro=",
+      "outputHashAlgo": "sha256",
+      "outputHashMode": "recursive",
+      "svn_revision": 66594
+    },
+    {
+      "type": "svn",
+      "svn_url": "svn://www.tug.org/texlive/tags/texlive-2023.0/Master/texmf-dist/",
+      "svn_files": [
+        "fonts/source/public/knuth-lib/",
+        "fonts/tfm/public/knuth-lib/",
+        "tex/generic/knuth-lib/",
+        "tex/plain/knuth-lib/"
+      ],
+      "integrity": "sha256-it3vOYZ4VrsXcBY6QSeIuHsNcoQkZsWP7aYaC8j4iDY=",
+      "outputHashAlgo": "sha256",
+      "outputHashMode": "recursive",
+      "svn_revision": 66594
+    },
+    {
+      "type": "svn",
+      "svn_url": "svn://www.tug.org/texlive/tags/texlive-2023.0/Master/texmf-dist/",
+      "svn_files": [
+        "doc/latex/etdipa/"
+      ],
+      "integrity": "sha256-H25frh/nt438g8lsUCcfNuytvPrBrkYIpEEpqq4q48o=",
+      "outputHashAlgo": "sha256",
+      "outputHashMode": "recursive",
+      "svn_revision": 66594
+    }
+  ]
+}
\ No newline at end of file
diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py
index 8dab20f2..180fe4fe 100644
--- a/swh/lister/nixguix/tests/test_lister.py
+++ b/swh/lister/nixguix/tests/test_lister.py
@@ -457,3 +457,37 @@ def test_lister_nixguix_fail(datadir, swh_scheduler, requests_mock):
 
     scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
     assert len(scheduler_origins) == 0
+
+
+def test_lister_nixguix_svn_export_sub_trees(datadir, swh_scheduler, requests_mock):
+    """NixGuixLister should handle svn-export visit types exporting a subset of
+    a subversion source tree (e.g. TeX Live packages for Guix)"""
+    url = SOURCES["guix"]["manifest"]
+    origin_upstream = SOURCES["guix"]["repo"]
+    lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream)
+
+    response = page_response(datadir, "texlive")
+    requests_mock.get(url, [{"json": response}])
+
+    listed_result = lister.run()
+
+    # the upstream repository is listed as an origin, plus one origin per svn
+    # source of the fixture (3 of them), each one being yielded in its own page
+    assert listed_result == ListerStats(pages=4, origins=4)
+
+    scheduler_origins = {
+        origin.url: origin
+        for origin in lister.scheduler.get_listed_origins(lister.lister_obj.id).results
+    }
+
+    for source in response["sources"]:
+        svn_url = source["svn_url"]
+        # origin URL is made unique by appending the NAR hash as query parameter
+        origin_url = f"{svn_url}?nar={source['integrity']}"
+        assert origin_url in scheduler_origins
+        loader_arguments = scheduler_origins[origin_url].extra_loader_arguments
+        # the loader must receive the base svn URL and the sub-paths to export
+        assert "svn_url" in loader_arguments
+        assert loader_arguments["svn_url"] == svn_url
+        assert "svn_paths" in loader_arguments
+        assert loader_arguments["svn_paths"] == source["svn_files"]
-- 
GitLab