Skip to content
Snippets Groups Projects
Verified Commit 026fea21 authored by Antoine R. Dumont's avatar Antoine R. Dumont
Browse files

nixguix: Deal with edge case url with version instead of extension

Prior to this, some URLs were detected as files because their version names were wrongly
detected as extensions, and hence did not match any tarball extension.

Related to T3781
parent 8355fee2
No related branches found
No related tags found
No related merge requests found
......@@ -22,6 +22,7 @@ from enum import Enum
import logging
from pathlib import Path
import random
import re
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
from urllib.parse import parse_qsl, urlparse
......@@ -136,20 +137,36 @@ VCS_SUPPORTED = ("git", "svn", "hg")
POSSIBLE_TARBALL_MIMETYPES = tuple(MIMETYPE_TO_ARCHIVE_FORMAT.keys())
PATTERN_VERSION = re.compile(r"(v*[0-9]+[.])([0-9]+[.]*)+")
def url_endswith(
urlparsed, extensions: List[str], raise_when_no_extension: bool = True
) -> bool:
"""Determine whether urlparsed ends with one of the extensions.
"""Determine whether urlparsed ends with one of the extensions passed as parameter.
This also accounts for the edge case of a filename with only a version as name (so no
extension at the end).
Raises:
ArtifactWithoutExtension in case no extension is available and raise_when_no_extension
is True (the default)
ArtifactWithoutExtension in case no extension is available and
raise_when_no_extension is True (the default)
"""
paths = [Path(p) for (_, p) in [("_", urlparsed.path)] + parse_qsl(urlparsed.query)]
if raise_when_no_extension and not any(path.suffix != "" for path in paths):
raise ArtifactWithoutExtension
return any(path.suffix.endswith(tuple(extensions)) for path in paths)
match = any(path.suffix.endswith(tuple(extensions)) for path in paths)
if match:
return match
# Some false negatives can happen (e.g. https://<netloc>/path/0.1.5), so make sure
# to catch those
name = Path(urlparsed.path).name
if not PATTERN_VERSION.match(name):
return match
if raise_when_no_extension:
raise ArtifactWithoutExtension
return False
def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, str]:
......
......@@ -265,6 +265,13 @@
"https://github.com/Doom-Utils/deutex/releases/download/v5.2.2/deutex-5.2.2.tar.zst"
],
"integrity": "sha256-EO0OelM+yXy20DVI1CWPvsiIUqRbXqTPVDQ3atQXS18="
},
{
"type": "url",
"urls": [
"https://codeload.github.com/fifengine/fifechan/tar.gz/0.1.5"
],
"integrity": "sha256-Kb5f9LN54vxPiO99i8FyNCEw3T53owYfZMinXv5OunM="
}
],
"version": "1",
......
......@@ -52,7 +52,13 @@ def page_response(datadir, instance: str = "success") -> List[Dict]:
[(f"one.{ext}", True) for ext in TARBALL_EXTENSIONS]
+ [(f"one.{ext}?foo=bar", True) for ext in TARBALL_EXTENSIONS]
+ [(f"one?p0=1&foo=bar.{ext}", True) for ext in DEFAULT_EXTENSIONS_TO_IGNORE]
+ [("two?file=something.el", False), ("foo?two=two&three=three", False)],
+ [
("two?file=something.el", False),
("foo?two=two&three=three", False),
("v1.2.3", False), # with raise_when_no_extension is False
("2048-game-20151026.1233", False),
("v2048-game-20151026.1233", False),
],
)
def test_url_endswith(name, expected_result):
"""It should detect whether url or query params of the urls ends with extensions"""
......@@ -67,9 +73,12 @@ def test_url_endswith(name, expected_result):
)
def test_url_endswith_raise():
@pytest.mark.parametrize(
"name", ["foo?two=two&three=three", "tar.gz/0.1.5", "tar.gz/v10.3.1"]
)
def test_url_endswith_raise(name):
"""It should raise when the tested url has no extension"""
urlparsed = urlparse("https://example.org/foo?two=two&three=three")
urlparsed = urlparse(f"https://example.org/{name}")
with pytest.raises(ArtifactWithoutExtension):
url_endswith(urlparsed, ["unimportant"])
......@@ -225,6 +234,12 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
"Location": "https://static.crates.io/crates/syntect/syntect-4.6.0.crate"
},
)
requests_mock.head(
"https://codeload.github.com/fifengine/fifechan/tar.gz/0.1.5",
headers={
"Content-Type": "application/x-gzip",
},
)
expected_visit_types = defaultdict(int)
# origin upstream is added as origin
......@@ -248,7 +263,7 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
expected_visit_types["content"] += 1
elif url.startswith("svn"): # mistyped artifact rendered as vcs nonetheless
expected_visit_types["svn"] += 1
elif "crates.io" in url:
elif "crates.io" in url or "codeload.github.com" in url:
expected_visit_types["directory"] += 1
else: # tarball artifacts
expected_visit_types["directory"] += 1
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment