From 81688ca17e667c693fbc46abadf9275bb99a54f1 Mon Sep 17 00:00:00 2001
From: "Antoine R. Dumont (@ardumont)" <ardumont@softwareheritage.org>
Date: Tue, 25 Oct 2022 17:46:30 +0200
Subject: [PATCH] nixguix: Use content-disposition from http head request if
 provided

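When the HEAD response carries no Content-Type header, fall back to the
filename advertised in its Content-Disposition header to decide whether
the artifact is a tarball, instead of raising immediately.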

Related to T3781
---
 swh/lister/nixguix/lister.py                  | 25 +++++++++++++++++--
 .../nixguix/tests/data/sources-success.json   | 14 +++++++++++
 swh/lister/nixguix/tests/test_lister.py       | 13 ++++++++++
 3 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py
index 21cae67c..0b8e8bef 100644
--- a/swh/lister/nixguix/lister.py
+++ b/swh/lister/nixguix/lister.py
@@ -242,12 +242,33 @@ def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, st
                     url,
                 )
 
+        origin = urls[0]
+
         content_type = response.headers.get("Content-Type")
         if content_type:
             logger.debug("Content-Type: %s", content_type)
             if content_type == "application/json":
-                return False, urls[0]
-            return content_type.startswith(POSSIBLE_TARBALL_MIMETYPES), urls[0]
+                return False, origin
+            return content_type.startswith(POSSIBLE_TARBALL_MIMETYPES), origin
+
+        content_disposition = response.headers.get("Content-Disposition")
+        if content_disposition:
+            logger.debug("Content-Disposition: %s", content_disposition)
+            if "filename=" in content_disposition:
+                fields = content_disposition.split("; ")
+                for field in fields:
+                    if "filename=" in field:
+                        _, filename = field.split("filename=")
+                        break
+
+                return (
+                    url_endswith(
+                        urlparse(filename),
+                        TARBALL_EXTENSIONS,
+                        raise_when_no_extension=False,
+                    ),
+                    origin,
+                )
 
         raise ArtifactNatureUndetected(
             f"Cannot determine artifact type from url <{url}>"
diff --git a/swh/lister/nixguix/tests/data/sources-success.json b/swh/lister/nixguix/tests/data/sources-success.json
index 3178159c..05fdd796 100644
--- a/swh/lister/nixguix/tests/data/sources-success.json
+++ b/swh/lister/nixguix/tests/data/sources-success.json
@@ -272,6 +272,20 @@
         "https://codeload.github.com/fifengine/fifechan/tar.gz/0.1.5"
       ],
       "integrity": "sha256-Kb5f9LN54vxPiO99i8FyNCEw3T53owYfZMinXv5OunM="
+    },
+    {
+      "type": "url",
+      "urls": [
+        "https://codeload.github.com/unknown-horizons/unknown-horizons/tar.gz/2019.1"
+      ],
+      "integrity": "sha256-pBf9PTQiEv0ZDk8hvoLvE8EOHtfCiPu+RuRiAM895Ng="
+    },
+    {
+      "type": "url",
+      "urls": [
+        "https://codeload.github.com/fifengine/fifengine/tar.gz/0.4.2"
+      ],
+      "integrity": "sha256-6IK1W++jauLxqJraFq8PgUobePfL5gIexbFgVgTPj/g="
     }
   ],
   "version": "1",
diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py
index 13ee1160..fdb7210e 100644
--- a/swh/lister/nixguix/tests/test_lister.py
+++ b/swh/lister/nixguix/tests/test_lister.py
@@ -240,6 +240,19 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
             "Content-Type": "application/x-gzip",
         },
     )
+    requests_mock.head(
+        "https://codeload.github.com/unknown-horizons/unknown-horizons/tar.gz/2019.1",
+        headers={
+            "Content-Disposition": "attachment; filename=unknown-horizons-2019.1.tar.gz",
+        },
+    )
+    requests_mock.head(
+        "https://codeload.github.com/fifengine/fifengine/tar.gz/0.4.2",
+        headers={
+            "Content-Disposition": "attachment; name=fieldName; "
+            "filename=fifengine-0.4.2.tar.gz; other=stuff",
+        },
+    )
 
     expected_visit_types = defaultdict(int)
     # origin upstream is added as origin
-- 
GitLab