Skip to content
Snippets Groups Projects
Commit 65e51e29 authored by Antoine Lambert's avatar Antoine Lambert
Browse files

nixguix: Update heuristic checking if URL targets a tarball file

In addition to query parameters also check if any part of URL path
contains a tarball filename.

It fixes the detection of some tarball URLs provided in Guix manifest.

Related to swh/meta#3781.
parent ed8de05e
No related branches found
No related tags found
1 merge request!519nixguix: Update heuristic checking if URL targets a tarball file
Pipeline #6723 failed
......@@ -144,10 +144,11 @@ POSSIBLE_TARBALL_MIMETYPES = tuple(MIMETYPE_TO_ARCHIVE_FORMAT.keys())
PATTERN_VERSION = re.compile(r"(v*[0-9]+[.])([0-9]+[.]*)+")
def url_endswith(
def url_contains_tarball_filename(
urlparsed, extensions: List[str], raise_when_no_extension: bool = True
) -> bool:
"""Determine whether urlparsed ends with one of the extensions passed as parameter.
"""Determine whether urlparsed contains a tarball filename ending with one of the
extensions passed as parameter, path parts and query parameters are checked.
This also account for the edge case of a filename with only a version as name (so no
extension in the end.)
......@@ -158,11 +159,15 @@ def url_endswith(
"""
paths = [Path(p) for (_, p) in [("_", urlparsed.path)] + parse_qsl(urlparsed.query)]
if raise_when_no_extension and not any(path.suffix != "" for path in paths):
raise ArtifactWithoutExtension
match = any(path.suffix.endswith(tuple(extensions)) for path in paths)
match = any(
path_part.endswith(tuple(extensions))
for path in paths
for path_part in path.parts
)
if match:
return match
if raise_when_no_extension and not any(path.suffix != "" for path in paths):
raise ArtifactWithoutExtension
# Some false negative can happen (e.g. https://<netloc>/path/0.1.5)), so make sure
# to catch those
name = Path(urlparsed.path).name
......@@ -214,7 +219,7 @@ def is_tarball(
urlparsed = urlparse(url)
if urlparsed.scheme not in ("http", "https", "ftp"):
raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'")
return url_endswith(urlparsed, TARBALL_EXTENSIONS)
return url_contains_tarball_filename(urlparsed, TARBALL_EXTENSIONS)
# Check all urls and as soon as an url allows the nature detection, this stops.
exceptions_to_raise = []
......@@ -283,7 +288,7 @@ def is_tarball(
break
return (
url_endswith(
url_contains_tarball_filename(
urlparse(filename),
TARBALL_EXTENSIONS,
raise_when_no_extension=False,
......@@ -585,7 +590,7 @@ class NixGuixLister(StatelessLister[PageResult]):
# Let's check and filter it out if it is to be ignored (if possible).
# Some origin urls may not have extension at this point (e.g
# http://git.marmaro.de/?p=mmh;a=snp;h=<id>;sf=tgz), let them through.
if url_endswith(
if url_contains_tarball_filename(
urlparse(origin),
self.extensions_to_ignore,
raise_when_no_extension=False,
......
......@@ -24,7 +24,7 @@ from swh.lister.nixguix.lister import (
ArtifactWithoutExtension,
NixGuixLister,
is_tarball,
url_endswith,
url_contains_tarball_filename,
)
from swh.lister.pattern import ListerStats
......@@ -65,7 +65,7 @@ def test_url_endswith(name, expected_result):
"""It should detect whether url or query params of the urls ends with extensions"""
urlparsed = urlparse(f"https://example.org/{name}")
assert (
url_endswith(
url_contains_tarball_filename(
urlparsed,
TARBALL_EXTENSIONS + DEFAULT_EXTENSIONS_TO_IGNORE,
raise_when_no_extension=False,
......@@ -81,7 +81,7 @@ def test_url_endswith_raise(name):
"""It should raise when the tested url has no extension"""
urlparsed = urlparse(f"https://example.org/{name}")
with pytest.raises(ArtifactWithoutExtension):
url_endswith(urlparsed, ["unimportant"])
url_contains_tarball_filename(urlparsed, ["unimportant"])
@pytest.mark.parametrize(
......@@ -98,12 +98,15 @@ def test_is_tarball_simple(tarballs):
@pytest.mark.parametrize(
"query_param",
["file", "f", "url", "name", "anykeyreally"],
"url",
[
"https://example.org/download/one.tar.gz/other/path/parts",
"https://example.org/download.php?foo=bar&file=one.tar.gz",
],
)
def test_is_tarball_not_so_simple(query_param):
"""More involved check on tarball should discriminate between tarball and file"""
url = f"https://example.org/download.php?foo=bar&{query_param}=one.tar.gz"
def test_is_tarball_not_so_simple(url):
"""Detect tarball URL when filename is not in the last path parts or
in a query parameter"""
is_tar, origin = is_tarball([url])
assert is_tar is True
assert origin == url
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment