From e8699422d757eb968779a9e44fc5017fb5a6dd97 Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <vlorentz@softwareheritage.org>
Date: Fri, 4 Nov 2022 13:50:25 +0100
Subject: [PATCH] nixguix: Reject Git SSH URLs and pseudo-URLs

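This is for consistency with the Maven and Packagist listers.
Scheme-less Git pseudo-URLs (e.g. git@example.org:path) no longer get the
"http://" fallback scheme prepended.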
---
 swh/lister/nixguix/lister.py                       |  2 +-
 swh/lister/nixguix/tests/data/sources-failure.json | 10 ++++++++++
 swh/lister/nixguix/tests/test_lister.py            | 13 ++++++++++---
 3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py
index 9ebe82ea..3e410aa2 100644
--- a/swh/lister/nixguix/lister.py
+++ b/swh/lister/nixguix/lister.py
@@ -402,7 +402,7 @@ class NixGuixLister(StatelessLister[PageResult]):
                 urls = []
                 for url in origin_urls:
                     urlparsed = urlparse(url)
-                    if urlparsed.scheme == "":
+                    if urlparsed.scheme == "" and not re.match(r"^\w+@[^/]+:", url):
                         logger.warning("Missing scheme for <%s>: fallback to http", url)
                         fixed_url = f"http://{url}"
                     else:
diff --git a/swh/lister/nixguix/tests/data/sources-failure.json b/swh/lister/nixguix/tests/data/sources-failure.json
index 237a0186..86b34a8d 100644
--- a/swh/lister/nixguix/tests/data/sources-failure.json
+++ b/swh/lister/nixguix/tests/data/sources-failure.json
@@ -53,6 +53,16 @@
       "urls": [ "unknown://example.org/wrong-scheme-so-skipped.txt" ],
       "integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI="
     },
+    {
+      "type": "url",
+      "urls": [ "ssh://git@example.org:wrong-scheme-so-skipped.txt" ],
+      "integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI="
+    },
+    {
+      "type": "url",
+      "urls": [ "git@example.org:git-pseudourl/so-skipped" ],
+      "integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI="
+    },
     {
       "type": "url",
       "urls": [ "https://code.9front.org/hg/plan9front" ],
diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py
index fdb7210e..a00a5f61 100644
--- a/swh/lister/nixguix/tests/test_lister.py
+++ b/swh/lister/nixguix/tests/test_lister.py
@@ -353,13 +353,20 @@ def test_lister_nixguix_mostly_noop(datadir, swh_scheduler, requests_mock):
     )
 
     listed_result = lister.run()
-    # only the origin upstream is listed, every other entries are unsupported or incomplete
-    assert listed_result == ListerStats(pages=1, origins=1)
 
+    expected_origins = ["https://github.com/NixOS/nixpkgs"]
     scheduler_origins = lister.scheduler.get_listed_origins(
         lister.lister_obj.id
     ).results
-    assert len(scheduler_origins) == 1
+    scheduler_origin_urls = [orig.url for orig in scheduler_origins]
+
+    assert scheduler_origin_urls == expected_origins
+
+    # only the upstream origin is listed; every other entry is unsupported or incomplete
+    assert listed_result == ListerStats(pages=1, origins=1), (
+        f"Expected origins: {' '.join(expected_origins)}, got: "
+        f"{' '.join(scheduler_origin_urls)}"
+    )
 
     assert scheduler_origins[0].visit_type == "git"
 
-- 
GitLab