Skip to content
Snippets Groups Projects
Verified Commit ca4ab7f2 authored by Antoine R. Dumont's avatar Antoine R. Dumont
Browse files

nixguix: Allow lister to ignore specific extensions

Those extensions can be extended through configuration. They default to some binary
formats already encountered during docker runs.

Related to T3781
parent d96a39d5
No related branches found
No related tags found
No related merge requests found
......@@ -37,6 +37,21 @@ from swh.scheduler.model import ListedOrigin
logger = logging.getLogger(__name__)
# By default, ignore binary files and archives containing binaries
DEFAULT_EXTENSIONS_TO_IGNORE = [
"AppImage",  # portable Linux application bundle
"bin",  # raw binary blob
"exe",  # Windows executable
"iso",  # disk image
"linux64",  # Linux binary installer (e.g. TeensyduinoInstall.linux64)
"msi",  # Windows installer
"png",  # image, not a source artifact
"dic",  # hunspell dictionary
"deb",  # Debian package
"rpm",  # RPM package
]
class ArtifactNatureUndetected(ValueError):
"""Raised when a remote artifact's nature (tarball, file) cannot be detected."""
......@@ -55,11 +70,7 @@ class ArtifactNatureMistyped(ValueError):
class ArtifactWithoutExtension(ValueError):
"""Raised when an artifact nature cannot be determined by its name.
This exception is solely for internal use of the :meth:`is_tarball` method.
"""
"""Raised when an artifact nature cannot be determined by its name."""
pass
......@@ -125,6 +136,22 @@ VCS_SUPPORTED = ("git", "svn", "hg")
POSSIBLE_TARBALL_MIMETYPES = tuple(MIMETYPE_TO_ARCHIVE_FORMAT.keys())
def url_endswith(
    urlparsed, extensions: List[str], raise_when_no_extension: bool = True
) -> bool:
    """Determine whether the parsed url ends with one of the extensions.

    Both the url path and its query parameter values are checked, since some
    origins carry the filename in a query parameter (e.g. ``?file=foo.tar.gz``).

    Args:
        urlparsed: Parsed url (result of :func:`urllib.parse.urlparse`)
        extensions: Extensions to match against (suffix match, no leading dot
            required)
        raise_when_no_extension: Whether to raise when neither the path nor any
            query parameter value carries an extension

    Raises:
        ArtifactWithoutExtension: in case no extension is available and
            raise_when_no_extension is True (the default)

    Returns:
        True if any path-like component of the url ends with one of the
        extensions, False otherwise.

    """
    # The "_" sentinel key lets the url path share the (key, value) shape of
    # the parsed query parameters; only the values are turned into Paths.
    paths = [
        Path(p) for (_, p) in [("_", urlparsed.path)] + parse_qsl(urlparsed.query)
    ]
    if raise_when_no_extension and not any(path.suffix != "" for path in paths):
        raise ArtifactWithoutExtension
    # Hoist the loop-invariant conversion: str.endswith accepts a tuple, and
    # the original rebuilt it once per path inside the generator.
    suffixes = tuple(extensions)
    return any(path.suffix.endswith(suffixes) for path in paths)
def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, str]:
"""Determine whether a list of files actually are tarballs or simple files.
......@@ -157,13 +184,7 @@ def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, st
urlparsed = urlparse(url)
if urlparsed.scheme not in ("http", "https", "ftp"):
raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'")
paths = [
Path(p) for (_, p) in [("_", urlparsed.path)] + parse_qsl(urlparsed.query)
]
if not any(path.suffix != "" for path in paths):
raise ArtifactWithoutExtension
return any(path.suffix.endswith(tuple(TARBALL_EXTENSIONS)) for path in paths)
return url_endswith(urlparsed, TARBALL_EXTENSIONS)
index = random.randrange(len(urls))
url = urls[index]
......@@ -247,6 +268,10 @@ class NixGuixLister(StatelessLister[PageResult]):
it fallbacks to query (HEAD) the url to retrieve the origin out of the `Location`
response header, and then checks the extension again.
Optionally, when the `extensions_to_ignore` parameter is provided, it extends the
default extensions to ignore (`DEFAULT_EXTENSIONS_TO_IGNORE`) with those passed.
This can be used to drop further binary files detected in the wild.
"""
LISTER_NAME = "nixguix"
......@@ -260,6 +285,7 @@ class NixGuixLister(StatelessLister[PageResult]):
credentials: Optional[CredentialsType] = None,
# canonicalize urls, can be turned off during docker runs
canonicalize: bool = True,
extensions_to_ignore: List[str] = [],
**kwargs: Any,
):
super().__init__(
......@@ -271,6 +297,7 @@ class NixGuixLister(StatelessLister[PageResult]):
# either full fqdn NixOS/nixpkgs or guix repository urls
# maybe add an assert on those specific urls?
self.origin_upstream = origin_upstream
self.extensions_to_ignore = DEFAULT_EXTENSIONS_TO_IGNORE + extensions_to_ignore
self.session = requests.Session()
# for testing purposes, we may want to skip this step (e.g. docker run and rate
......@@ -435,13 +462,34 @@ class NixGuixLister(StatelessLister[PageResult]):
# 'critical' information about how to recompute the hash (e.g. fs
# layout, executable bit, ...)
logger.warning(
"Skipping artifact <%s>: 'file' artifact of type <%s> is "
"Skipping artifact <%s>: 'file' artifact of type <%s> is"
" missing information to properly check its integrity",
artifact,
artifact_type,
)
continue
# At this point plenty of heuristics happened and we should have found
# the right origin and its nature.
# Let's check and filter it out if it is to be ignored (if possible).
# Some origin urls may not have an extension at this point (e.g.
# http://git.marmaro.de/?p=mmh;a=snp;h=<id>;sf=tgz), let them through.
if url_endswith(
urlparse(origin),
self.extensions_to_ignore,
raise_when_no_extension=False,
):
logger.warning(
"Skipping artifact <%s>: 'file' artifact of type <%s> is"
" ignored due to lister configuration. It should ignore"
" origins with extension [%s]",
origin,
artifact_type,
",".join(self.extensions_to_ignore),
)
continue
logger.debug("%s: %s", "dir" if is_tar else "cnt", origin)
yield ArtifactType.ARTIFACT, Artifact(
origin=origin,
......
......@@ -57,6 +57,123 @@
"type": "url",
"urls": [ "https://code.9front.org/hg/plan9front" ],
"integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI="
},
{
"outputHash": "sha256-IgPqUEDpaIuGoaGoH2GCEzh3KxF3pkJC3VjTYXwSiQE=",
"outputHashAlgo": "sha256",
"outputHashMode": "flat",
"type": "url",
"urls": [
"https://github.com/KSP-CKAN/CKAN/releases/download/v1.30.4/ckan.exe"
],
"integrity": "sha256-IgPqUEDpaIuGoaGoH2GCEzh3KxF3pkJC3VjTYXwSiQE=",
"inferredFetcher": "unclassified"
},
{
"outputHash": "sha256-ezJN/t0iNk0haMLPioEQSNXU4ugVeJe44GNVGd+cOF4=",
"outputHashAlgo": "sha256",
"outputHashMode": "flat",
"type": "url",
"urls": [
"https://github.com/johannesjo/super-productivity/releases/download/v7.5.1/superProductivity-7.5.1.AppImage"
],
"integrity": "sha256-ezJN/t0iNk0haMLPioEQSNXU4ugVeJe44GNVGd+cOF4=",
"inferredFetcher": "unclassified"
},
{
"outputHash": "19ir6x4c01825hpx2wbbcxkk70ymwbw4j03v8b2xc13ayylwzx0r",
"outputHashAlgo": "sha256",
"outputHashMode": "flat",
"type": "url",
"urls": [
"http://gorilla.dp100.com/downloads/gorilla1537_64.bin"
],
"integrity": "sha256-GfTPqfdqBNbFQnsASfji1YMzZ2drcdEvLAIFwEg3OaY=",
"inferredFetcher": "unclassified"
},
{
"outputHash": "1zj53xybygps66m3v5kzi61vqy987zp6bfgk0qin9pja68qq75vx",
"outputHashAlgo": "sha256",
"outputHashMode": "flat",
"type": "url",
"urls": [
"https://fedorapeople.org/groups/virt/virtio-win/direct-downloads/archive-virtio/virtio-win-0.1.196-1/virtio-win.iso"
],
"integrity": "sha256-fZeDMTJK3mQjBvO5Ze4/KHm8g4l/lj2qMfo+v3wfRf4=",
"inferredFetcher": "unclassified"
},
{
"outputHash": "02qgsj4h4zrjxkcclx7clsqbqd699kg0dq1xxa9hbj3vfnddjv1f",
"outputHashAlgo": "sha256",
"outputHashMode": "flat",
"type": "url",
"urls": [
"https://www.pjrc.com/teensy/td_153/TeensyduinoInstall.linux64"
],
"integrity": "sha256-LmzZmnV7yAWT6j3gBt5MyTS8sKbsdMrY7DJ/AonUDws=",
"inferredFetcher": "unclassified"
},
{
"outputHash": "sha256-24uF87kQWQ9hrb+gAFqZXWE+KZocxz0AVT1w3IEBDjY=",
"outputHashAlgo": "sha256",
"outputHashMode": "flat",
"type": "url",
"urls": [
"https://dl.winehq.org/wine/wine-mono/6.4.0/wine-mono-6.4.0-x86.msi"
],
"integrity": "sha256-24uF87kQWQ9hrb+gAFqZXWE+KZocxz0AVT1w3IEBDjY=",
"inferredFetcher": "unclassified"
},
{
"outputHash": "00y96w9shbbrdbf6xcjlahqd08154kkrxmqraik7qshiwcqpw7p4",
"outputHashAlgo": "sha256",
"outputHashMode": "flat",
"type": "url",
"urls": [
"https://raw.githubusercontent.com/webtorrent/webtorrent-desktop/v0.21.0/static/linux/share/icons/hicolor/48x48/apps/webtorrent-desktop.png"
],
"integrity": "sha256-5B5+MeMRanxmVBnXnuckJSDQMFRUsm7canktqBM3yQM=",
"inferredFetcher": "unclassified"
},
{
"outputHash": "0lw193jr7ldvln5x5z9p21rz1by46h0say9whfcw2kxs9vprd5b3",
"outputHashAlgo": "sha256",
"outputHashMode": "flat",
"type": "url",
"urls": [
"http://xuxen.eus/static/hunspell/eu_ES.dic"
],
"integrity": "sha256-Y5WW7066T8GZgzx5pQE0xK/wcxA3/dKLpbvRk+VIgVM=",
"inferredFetcher": "unclassified"
},
{
"outputHash": "0wbhvypdr96a5ddg6kj41dn9sbl49n7pfi2vs762ij82hm2gvwcm",
"outputHashAlgo": "sha256",
"outputHashMode": "flat",
"type": "url",
"urls": [
"https://www.openprinting.org/download/printdriver/components/lsb3.2/main/RPMS/noarch/openprinting-ppds-postscript-lexmark-20160218-1lsb3.2.noarch.rpm"
],
"integrity": "sha256-lfH9RIUCySjM0VtEd49NhC6dbAtETvNaK8qk3K7fcHE=",
"inferredFetcher": "unclassified"
},
{
"outputHash": "01gy84gr0gw5ap7hpy72azaf6hlzac7vxkn5cgad5sfbyzxgjgc9",
"outputHashAlgo": "sha256",
"outputHashMode": "flat",
"type": "url",
"urls": [
"https://wire-app.wire.com/linux/debian/pool/main/Wire-3.26.2941_amd64.deb"
],
"integrity": "sha256-iT35+vfL6dLUY8XOvg9Tn0Lj1Ffi+AvPVYU/kB9B/gU=",
"inferredFetcher": "unclassified"
},
{
"type": "url",
"urls": [
"https://elpa.gnu.org/packages/zones.foobar"
],
"integrity": "sha256-YRZc7dI3DjUzoSIp4fIshUyhMXIQ/fPKaKnjeYVa4WI="
}
],
"version":"1",
......
......@@ -8,6 +8,7 @@ import json
import logging
from pathlib import Path
from typing import Dict, List
from urllib.parse import urlparse
import pytest
import requests
......@@ -15,11 +16,14 @@ from requests.exceptions import ConnectionError, InvalidSchema, SSLError
from swh.lister import TARBALL_EXTENSIONS
from swh.lister.nixguix.lister import (
DEFAULT_EXTENSIONS_TO_IGNORE,
POSSIBLE_TARBALL_MIMETYPES,
ArtifactNatureMistyped,
ArtifactNatureUndetected,
ArtifactWithoutExtension,
NixGuixLister,
is_tarball,
url_endswith,
)
from swh.lister.pattern import ListerStats
......@@ -43,6 +47,33 @@ def page_response(datadir, instance: str = "success") -> List[Dict]:
return json.loads(datapath.read_text()) if datapath.exists else []
@pytest.mark.parametrize(
    "name,expected_result",
    [(f"one.{extension}", True) for extension in TARBALL_EXTENSIONS]
    + [(f"one.{extension}?foo=bar", True) for extension in TARBALL_EXTENSIONS]
    + [
        (f"one?p0=1&foo=bar.{extension}", True)
        for extension in DEFAULT_EXTENSIONS_TO_IGNORE
    ]
    + [("two?file=something.el", False), ("foo?two=two&three=three", False)],
)
def test_url_endswith(name, expected_result):
    """url_endswith should match extensions both in the path and in query params"""
    parsed = urlparse(f"https://example.org/{name}")
    result = url_endswith(
        parsed,
        TARBALL_EXTENSIONS + DEFAULT_EXTENSIONS_TO_IGNORE,
        raise_when_no_extension=False,
    )
    assert result is expected_result
def test_url_endswith_raise():
    """A url carrying no extension anywhere must raise by default"""
    parsed = urlparse("https://example.org/foo?two=two&three=three")
    with pytest.raises(ArtifactWithoutExtension):
        url_endswith(parsed, ["unimportant"])
@pytest.mark.parametrize(
"tarballs",
[[f"one.{ext}", f"two.{ext}"] for ext in TARBALL_EXTENSIONS]
......@@ -254,10 +285,15 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
def test_lister_nixguix_mostly_noop(datadir, swh_scheduler, requests_mock):
"""NixGuixLister should ignore unsupported or incomplete origins"""
"""NixGuixLister should ignore unsupported or incomplete or to ignore origins"""
url = SOURCES["nixpkgs"]["manifest"]
origin_upstream = SOURCES["nixpkgs"]["repo"]
lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream)
lister = NixGuixLister(
swh_scheduler,
url=url,
origin_upstream=origin_upstream,
extensions_to_ignore=["foobar"],
)
response = page_response(datadir, "failure")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment