Skip to content
Snippets Groups Projects
Verified Commit a94b75f3 authored by Antoine R. Dumont's avatar Antoine R. Dumont
Browse files

nixguix: Deal with mistyped origins

Some origins are listed as urls while they are not. They are possibly vcs. So this
commit tries to detect and and deal with those if possible. If not possible, they are
skipped.

Related to T3781
Related to P1470
parent 1b4fe51f
No related branches found
No related tags found
No related merge requests found
......@@ -41,6 +41,13 @@ class ArtifactNatureUndetected(ValueError):
pass
class ArtifactNatureMistyped(ValueError):
"""Raised when a remote artifact's neither a tarball nor a file. It's probably a
misconfiguration in the manifest that badly typed a vcs repository."""
pass
@dataclass
class OriginUpstream:
"""Upstream origin (e.g. NixOS/nixpkgs, Guix/Guix)."""
......@@ -73,10 +80,10 @@ class VCS:
origin: str
"""Origin url of the vcs"""
ref: Optional[str]
"""Reference either a svn commit id, a git commit, ..."""
type: str
"""Type of (d)vcs, e.g. svn, git, hg, ..."""
ref: Optional[str] = None
"""Reference either a svn commit id, a git commit, ..."""
class ArtifactType(Enum):
......@@ -109,6 +116,8 @@ def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, st
Raises:
ArtifactNatureUndetected when the artifact's nature cannot be detected out
of its url
ArtifactNatureMistyped when the artifact is not a tarball nor a file. It's up to
the caller to do what's right with it.
Returns: A tuple (bool, url). The boolean represents whether the url is an archive
or not. The second parameter is the actual url once the head request is issued
......@@ -123,27 +132,37 @@ def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, st
IndexError in case no extension is available
"""
return Path(urlparse(url).path).suffixes[-1].lstrip(".") in TARBALL_EXTENSIONS
urlparsed = urlparse(url)
if urlparsed.scheme not in ("http", "https", "ftp"):
raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'")
return Path(urlparsed.path).suffixes[-1].lstrip(".") in TARBALL_EXTENSIONS
index = random.randrange(len(urls))
url = urls[index]
try:
is_tar = _is_tarball(url)
return is_tar, urls[0]
except IndexError:
if request is None:
raise ArtifactNatureUndetected(
"Cannot determine artifact type from url %s", url
f"Cannot determine artifact type from url <{url}>"
)
logger.warning(
"Cannot detect extension for '%s'. Fallback to http head query",
"Cannot detect extension for <%s>. Fallback to http head query",
url,
)
response = request.head(url)
try:
response = request.head(url)
except requests.exceptions.InvalidSchema:
raise ArtifactNatureUndetected(
f"Cannot determine artifact type from url <{url}>"
)
if not response.ok or response.status_code == 404:
raise ArtifactNatureUndetected(
"Cannot determine artifact type from url %s", url
f"Cannot determine artifact type from url <{url}>"
)
location = response.headers.get("Location")
if location: # It's not always present
......@@ -154,7 +173,7 @@ def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, st
return _is_tarball(location), location
except IndexError:
logger.warning(
"Still cannot detect extension through location '%s'...",
"Still cannot detect extension through location <%s>...",
url,
)
......@@ -166,7 +185,7 @@ def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, st
return content_type in POSSIBLE_TARBALL_MIMETYPES, urls[0]
raise ArtifactNatureUndetected(
"Cannot determine artifact type from url %s", url
f"Cannot determine artifact type from url <{url}>"
)
......@@ -287,23 +306,48 @@ class NixGuixLister(StatelessLister[PageResult]):
urls = artifact.get("urls")
if not urls:
# Nothing to fetch
logger.warning("Skipping url '%s': empty artifact", artifact)
logger.warning("Skipping url <%s>: empty artifact", artifact)
continue
assert urls is not None
# Deal with misplaced origins
# FIXME: T3294: Fix missing scheme in urls
origin, *fallback_urls = urls
integrity = artifact.get("integrity")
if integrity is None:
logger.warning("Skipping url '%s': missing integrity field", origin)
logger.warning("Skipping url <%s>: missing integrity field", origin)
continue
try:
is_tar, origin = is_tarball(urls, self.session)
except ArtifactNatureMistyped:
logger.warning(
"Mistyped url <%s>: trying to deal with it properly", origin
)
urlparsed = urlparse(origin)
artifact_type = urlparsed.scheme
if artifact_type in VCS_SUPPORTED:
artifact_url = (
self.github_session.get_canonical_url(origin)
if self.github_session
else origin
)
if not artifact_url:
continue
yield ArtifactType.VCS, VCS(
origin=artifact_url, type=artifact_type
)
else:
logger.warning(
"Skipping url <%s>: undetected remote artifact type", origin
)
continue
except ArtifactNatureUndetected:
logger.warning(
"Skipping url '%s': undetected remote artifact type", origin
"Skipping url <%s>: undetected remote artifact type", origin
)
continue
......@@ -325,7 +369,7 @@ class NixGuixLister(StatelessLister[PageResult]):
)
else:
logger.warning(
"Skipping artifact '%s': unsupported type %s",
"Skipping artifact <%s>: unsupported type %s",
artifact,
artifact_type,
)
......
......@@ -12,6 +12,18 @@
{
"type": "url",
"urls": [ "https://example.org/another-file-no-integrity-so-skipped.txt" ]
},
{
"type": "url",
"urls": [
"ftp://ftp.ourproject.org/file-with-no-extension"
],
"integrity": "sha256-bss09x9yOnuW+Q5BHHjf8nNcCNxCKMdl9/2/jKSFcrQ="
},
{
"type": "url",
"urls": [ "unknown://example.org/wrong-scheme-so-skipped.txt" ],
"integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI="
}
],
"version":"1",
......
......@@ -22,6 +22,13 @@
],
"integrity": "sha256-1w3NdfRzp9XIFDLD2SYJJr+Nnf9c1UF5YWlJfRxSLt0="
},
{
"type": "url",
"urls": [
"ftp://ftp.ourproject.org/pub/ytalk/ytalk-3.3.0.tar.gz"
],
"integrity": "sha256-bss09x9yOnuW+Q5BHHjf8nNcCNxCKMdl9/2/jKSFcrQ="
},
{
"type": "url",
"urls": [
......@@ -45,6 +52,11 @@
"type": "svn",
"svn_url": "https://code.call-cc.org/svn/chicken-eggs/release/5/iset/tags/2.2",
"svn_revision": 39057
},
{
"type": "url",
"urls": ["svn://svn.code.sf.net/p/acme-crossass/code-0/trunk"],
"integrity": "sha256-VifIQ+UEVMKJ+cNS+Xxusazinr5Cgu1lmGuhqj/5Mpk="
}
],
"version": "1",
......
......@@ -15,6 +15,7 @@ import requests
from swh.lister import TARBALL_EXTENSIONS
from swh.lister.nixguix.lister import (
POSSIBLE_TARBALL_MIMETYPES,
ArtifactNatureMistyped,
ArtifactNatureUndetected,
NixGuixLister,
is_tarball,
......@@ -31,19 +32,20 @@ def page_response(datadir, instance: str) -> List[Dict]:
@pytest.mark.parametrize(
"urls",
"tarballs",
[[f"one.{ext}", f"two.{ext}"] for ext in TARBALL_EXTENSIONS]
+ [[f"one.{ext}?foo=bar"] for ext in TARBALL_EXTENSIONS],
)
def test_is_tarball_simple(urls):
def test_is_tarball_simple(tarballs):
"""Simple check on tarball should discriminate betwenn tarball and file"""
urls = [f"https://example.org/{tarball}" for tarball in tarballs]
is_tar, origin = is_tarball(urls)
assert is_tar is True
assert origin == urls[0]
@pytest.mark.parametrize(
"urls",
"files",
[
["abc.lisp"],
["one.abc", "two.bcd"],
......@@ -52,8 +54,9 @@ def test_is_tarball_simple(urls):
["config.nix", "flakes.nix"],
],
)
def test_is_tarball_simple_not_tarball(urls):
def test_is_tarball_simple_not_tarball(files):
"""Simple check on tarball should discriminate betwenn tarball and file"""
urls = [f"http://example.org/{file}" for file in files]
is_tar, origin = is_tarball(urls)
assert is_tar is False
assert origin == urls[0]
......@@ -65,7 +68,7 @@ def test_is_tarball_complex_with_no_result(requests_mock):
url = "https://example.org/crates/package/download"
urls = [url]
with pytest.raises(ArtifactNatureUndetected):
is_tarball(url) # no request parameter, this cannot fallback, raises
is_tarball(urls) # no request parameter, this cannot fallback, raises
with pytest.raises(ArtifactNatureUndetected):
requests_mock.head(
......@@ -87,6 +90,16 @@ def test_is_tarball_complex_with_no_result(requests_mock):
)
is_tarball(urls, requests)
with pytest.raises(ArtifactNatureMistyped):
is_tarball(["foo://example.org/unsupported-scheme"])
with pytest.raises(ArtifactNatureMistyped):
fallback_url = "foo://example.org/unsupported-scheme"
requests_mock.head(
url, headers={"location": fallback_url} # still no extension, cannot detect
)
is_tarball(urls, requests)
@pytest.mark.parametrize(
"fallback_url, expected_result",
......@@ -162,6 +175,8 @@ def test_lister_nixguix(datadir, swh_scheduler, requests_mock):
url = artifact["urls"][0]
if url.endswith(".c") or url.endswith(".txt"):
expected_visit_types["content"] += 1
elif url.startswith("svn"): # mistyped artifact rendered as vcs nonetheless
expected_visit_types["svn"] += 1
else:
expected_visit_types["directory"] += 1
......@@ -214,6 +229,12 @@ def test_lister_nixguix_mostly_noop(datadir, swh_scheduler, requests_mock):
"https://crates.io/api/v1/0.1.5/no-extension-and-head-404-so-skipped",
status_code=404,
)
# Will raise for that origin, this will get ignored as we cannot determine anything
# from its name
requests_mock.head(
"ftp://ftp.ourproject.org/file-with-no-extension",
exc=requests.exceptions.InvalidSchema,
)
listed_result = lister.run()
# only the origin upstream is listed, every other entries are unsupported or incomplete
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment