diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py index 21cae67cce0b9c006ad0d10cf077b21c09fc9ad7..0b8e8bef46193d22f2d4f2063b804f76608e4589 100644 --- a/swh/lister/nixguix/lister.py +++ b/swh/lister/nixguix/lister.py @@ -242,12 +242,33 @@ def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, st url, ) + origin = urls[0] + content_type = response.headers.get("Content-Type") if content_type: logger.debug("Content-Type: %s", content_type) if content_type == "application/json": - return False, urls[0] - return content_type.startswith(POSSIBLE_TARBALL_MIMETYPES), urls[0] + return False, origin + return content_type.startswith(POSSIBLE_TARBALL_MIMETYPES), origin + + content_disposition = response.headers.get("Content-Disposition") + if content_disposition: + logger.debug("Content-Disposition: %s", content_disposition) + if "filename=" in content_disposition: + fields = content_disposition.split("; ") + for field in fields: + if "filename=" in field: + _, filename = field.split("filename=") + break + + return ( + url_endswith( + urlparse(filename), + TARBALL_EXTENSIONS, + raise_when_no_extension=False, + ), + origin, + ) raise ArtifactNatureUndetected( f"Cannot determine artifact type from url <{url}>" diff --git a/swh/lister/nixguix/tests/data/sources-success.json b/swh/lister/nixguix/tests/data/sources-success.json index 3178159c8050631174eb5208153166cd5c6b9bf6..05fdd796a7b9f4589c9b0f3e1196444da19a2995 100644 --- a/swh/lister/nixguix/tests/data/sources-success.json +++ b/swh/lister/nixguix/tests/data/sources-success.json @@ -272,6 +272,20 @@ "https://codeload.github.com/fifengine/fifechan/tar.gz/0.1.5" ], "integrity": "sha256-Kb5f9LN54vxPiO99i8FyNCEw3T53owYfZMinXv5OunM=" + }, + { + "type": "url", + "urls": [ + "https://codeload.github.com/unknown-horizons/unknown-horizons/tar.gz/2019.1" + ], + "integrity": "sha256-pBf9PTQiEv0ZDk8hvoLvE8EOHtfCiPu+RuRiAM895Ng=" + }, + { + "type": "url", + "urls": [ + "https://codeload.github.com/fifengine/fifengine/tar.gz/0.4.2" + ], + "integrity": "sha256-6IK1W++jauLxqJraFq8PgUobePfL5gIexbFgVgTPj/g=" } ], "version": "1", diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py index 13ee1160c361fa7693aa6c07dbf5dbc2b602c9ab..fdb7210e00bf7d072a7a24efb95842c9565b75f9 100644 --- a/swh/lister/nixguix/tests/test_lister.py +++ b/swh/lister/nixguix/tests/test_lister.py @@ -240,6 +240,19 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock): "Content-Type": "application/x-gzip", }, ) + requests_mock.head( + "https://codeload.github.com/unknown-horizons/unknown-horizons/tar.gz/2019.1", + headers={ + "Content-Disposition": "attachment; filename=unknown-horizons-2019.1.tar.gz", + }, + ) + requests_mock.head( + "https://codeload.github.com/fifengine/fifengine/tar.gz/0.4.2", + headers={ + "Content-Disposition": "attachment; name=fieldName; " + "filename=fifengine-0.4.2.tar.gz; other=stuff", + }, + ) expected_visit_types = defaultdict(int) # origin upstream is added as origin