From ca4ab7f277dc51efc62dbb5c6866dc424d117d6a Mon Sep 17 00:00:00 2001
From: "Antoine R. Dumont (@ardumont)" <ardumont@softwareheritage.org>
Date: Tue, 25 Oct 2022 10:39:01 +0200
Subject: [PATCH] nixguix: Allow lister to ignore specific extensions

Those extensions can be extended through configuration. They default to some binary
format already encountered during docker runs.

Related to T3781
---
 swh/lister/nixguix/lister.py                  |  74 +++++++++--
 .../nixguix/tests/data/sources-failure.json   | 117 ++++++++++++++++++
 swh/lister/nixguix/tests/test_lister.py       |  40 +++++-
 3 files changed, 216 insertions(+), 15 deletions(-)

diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py
index 1dbd4def..b5b6e899 100644
--- a/swh/lister/nixguix/lister.py
+++ b/swh/lister/nixguix/lister.py
@@ -37,6 +37,21 @@ from swh.scheduler.model import ListedOrigin
 logger = logging.getLogger(__name__)
 
 
+# By default, ignore binary files and archives containing binaries
+DEFAULT_EXTENSIONS_TO_IGNORE = [
+    "AppImage",
+    "bin",
+    "exe",
+    "iso",
+    "linux64",
+    "msi",
+    "png",
+    "dic",
+    "deb",
+    "rpm",
+]
+
+
 class ArtifactNatureUndetected(ValueError):
     """Raised when a remote artifact's nature (tarball, file) cannot be detected."""
 
@@ -55,11 +70,7 @@ class ArtifactNatureMistyped(ValueError):
 
 
 class ArtifactWithoutExtension(ValueError):
-    """Raised when an artifact nature cannot be determined by its name.
-
-    This exception is solely for internal use of the :meth:`is_tarball` method.
-
-    """
+    """Raised when an artifact nature cannot be determined by its name."""
 
     pass
 
@@ -125,6 +136,22 @@ VCS_SUPPORTED = ("git", "svn", "hg")
 POSSIBLE_TARBALL_MIMETYPES = tuple(MIMETYPE_TO_ARCHIVE_FORMAT.keys())
 
 
+def url_endswith(
+    urlparsed, extensions: List[str], raise_when_no_extension: bool = True
+) -> bool:
+    """Determine whether urlparsed ends with one of the extensions.
+
+    Raises:
+        ArtifactWithoutExtension in case no extension is available and raise_when_no_extension
+          is True (the default)
+
+    """
+    paths = [Path(p) for (_, p) in [("_", urlparsed.path)] + parse_qsl(urlparsed.query)]
+    if raise_when_no_extension and not any(path.suffix != "" for path in paths):
+        raise ArtifactWithoutExtension
+    return any(path.suffix.endswith(tuple(extensions)) for path in paths)
+
+
 def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, str]:
     """Determine whether a list of files actually are tarballs or simple files.
 
@@ -157,13 +184,7 @@ def is_tarball(urls: List[str], request: Optional[Any] = None) -> Tuple[bool, st
         urlparsed = urlparse(url)
         if urlparsed.scheme not in ("http", "https", "ftp"):
             raise ArtifactNatureMistyped(f"Mistyped artifact '{url}'")
-
-        paths = [
-            Path(p) for (_, p) in [("_", urlparsed.path)] + parse_qsl(urlparsed.query)
-        ]
-        if not any(path.suffix != "" for path in paths):
-            raise ArtifactWithoutExtension
-        return any(path.suffix.endswith(tuple(TARBALL_EXTENSIONS)) for path in paths)
+        return url_endswith(urlparsed, TARBALL_EXTENSIONS)
 
     index = random.randrange(len(urls))
     url = urls[index]
@@ -247,6 +268,10 @@ class NixGuixLister(StatelessLister[PageResult]):
     it fallbacks to query (HEAD) the url to retrieve the origin out of the `Location`
     response header, and then checks the extension again.
 
+    Optionally, when the `extension_to_ignore` parameter is provided, it extends the
+    default extensions to ignore (`DEFAULT_EXTENSIONS_TO_IGNORE`) with those passed.
+    This can be used to drop further binary files detected in the wild.
+
     """
 
     LISTER_NAME = "nixguix"
@@ -260,6 +285,7 @@ class NixGuixLister(StatelessLister[PageResult]):
         credentials: Optional[CredentialsType] = None,
         # canonicalize urls, can be turned off during docker runs
         canonicalize: bool = True,
+        extensions_to_ignore: List[str] = [],
         **kwargs: Any,
     ):
         super().__init__(
@@ -271,6 +297,7 @@ class NixGuixLister(StatelessLister[PageResult]):
         # either full fqdn NixOS/nixpkgs or guix repository urls
         # maybe add an assert on those specific urls?
         self.origin_upstream = origin_upstream
+        self.extensions_to_ignore = DEFAULT_EXTENSIONS_TO_IGNORE + extensions_to_ignore
 
         self.session = requests.Session()
         # for testing purposes, we may want to skip this step (e.g. docker run and rate
@@ -435,13 +462,34 @@ class NixGuixLister(StatelessLister[PageResult]):
                     # 'critical' information about how to recompute the hash (e.g. fs
                     # layout, executable bit, ...)
                     logger.warning(
-                        "Skipping artifact <%s>: 'file' artifact of type <%s> is "
+                        "Skipping artifact <%s>: 'file' artifact of type <%s> is"
                         " missing information to properly check its integrity",
                         artifact,
                         artifact_type,
                     )
                     continue
 
+                # At this point plenty of heuristics happened and we should have found
+                # the right origin and its nature.
+
+                # Let's check and filter it out if it is to be ignored (if possible).
+                # Some origin urls may not have extension at this point (e.g
+                # http://git.marmaro.de/?p=mmh;a=snp;h=<id>;sf=tgz), let them through.
+                if url_endswith(
+                    urlparse(origin),
+                    self.extensions_to_ignore,
+                    raise_when_no_extension=False,
+                ):
+                    logger.warning(
+                        "Skipping artifact <%s>: 'file' artifact of type <%s> is"
+                        " ignored due to lister configuration. It should ignore"
+                        " origins with extension [%s]",
+                        origin,
+                        artifact_type,
+                        ",".join(self.extensions_to_ignore),
+                    )
+                    continue
+
                 logger.debug("%s: %s", "dir" if is_tar else "cnt", origin)
                 yield ArtifactType.ARTIFACT, Artifact(
                     origin=origin,
diff --git a/swh/lister/nixguix/tests/data/sources-failure.json b/swh/lister/nixguix/tests/data/sources-failure.json
index e0844af2..237a0186 100644
--- a/swh/lister/nixguix/tests/data/sources-failure.json
+++ b/swh/lister/nixguix/tests/data/sources-failure.json
@@ -57,6 +57,123 @@
       "type": "url",
       "urls": [ "https://code.9front.org/hg/plan9front" ],
       "integrity": "sha256-wAEswtkl3ulAw3zq4perrGS6Wlww5XXnQYsEAoYT9fI="
+    },
+    {
+      "outputHash": "sha256-IgPqUEDpaIuGoaGoH2GCEzh3KxF3pkJC3VjTYXwSiQE=",
+      "outputHashAlgo": "sha256",
+      "outputHashMode": "flat",
+      "type": "url",
+      "urls": [
+        "https://github.com/KSP-CKAN/CKAN/releases/download/v1.30.4/ckan.exe"
+      ],
+      "integrity": "sha256-IgPqUEDpaIuGoaGoH2GCEzh3KxF3pkJC3VjTYXwSiQE=",
+      "inferredFetcher": "unclassified"
+    },
+    {
+      "outputHash": "sha256-ezJN/t0iNk0haMLPioEQSNXU4ugVeJe44GNVGd+cOF4=",
+      "outputHashAlgo": "sha256",
+      "outputHashMode": "flat",
+      "type": "url",
+      "urls": [
+        "https://github.com/johannesjo/super-productivity/releases/download/v7.5.1/superProductivity-7.5.1.AppImage"
+      ],
+      "integrity": "sha256-ezJN/t0iNk0haMLPioEQSNXU4ugVeJe44GNVGd+cOF4=",
+      "inferredFetcher": "unclassified"
+    },
+    {
+      "outputHash": "19ir6x4c01825hpx2wbbcxkk70ymwbw4j03v8b2xc13ayylwzx0r",
+      "outputHashAlgo": "sha256",
+      "outputHashMode": "flat",
+      "type": "url",
+      "urls": [
+        "http://gorilla.dp100.com/downloads/gorilla1537_64.bin"
+      ],
+      "integrity": "sha256-GfTPqfdqBNbFQnsASfji1YMzZ2drcdEvLAIFwEg3OaY=",
+      "inferredFetcher": "unclassified"
+    },
+    {
+      "outputHash": "1zj53xybygps66m3v5kzi61vqy987zp6bfgk0qin9pja68qq75vx",
+      "outputHashAlgo": "sha256",
+      "outputHashMode": "flat",
+      "type": "url",
+      "urls": [
+        "https://fedorapeople.org/groups/virt/virtio-win/direct-downloads/archive-virtio/virtio-win-0.1.196-1/virtio-win.iso"
+      ],
+      "integrity": "sha256-fZeDMTJK3mQjBvO5Ze4/KHm8g4l/lj2qMfo+v3wfRf4=",
+      "inferredFetcher": "unclassified"
+    },
+    {
+      "outputHash": "02qgsj4h4zrjxkcclx7clsqbqd699kg0dq1xxa9hbj3vfnddjv1f",
+      "outputHashAlgo": "sha256",
+      "outputHashMode": "flat",
+      "type": "url",
+      "urls": [
+        "https://www.pjrc.com/teensy/td_153/TeensyduinoInstall.linux64"
+      ],
+      "integrity": "sha256-LmzZmnV7yAWT6j3gBt5MyTS8sKbsdMrY7DJ/AonUDws=",
+      "inferredFetcher": "unclassified"
+    },
+    {
+      "outputHash": "sha256-24uF87kQWQ9hrb+gAFqZXWE+KZocxz0AVT1w3IEBDjY=",
+      "outputHashAlgo": "sha256",
+      "outputHashMode": "flat",
+      "type": "url",
+      "urls": [
+        "https://dl.winehq.org/wine/wine-mono/6.4.0/wine-mono-6.4.0-x86.msi"
+      ],
+      "integrity": "sha256-24uF87kQWQ9hrb+gAFqZXWE+KZocxz0AVT1w3IEBDjY=",
+      "inferredFetcher": "unclassified"
+    },
+    {
+      "outputHash": "00y96w9shbbrdbf6xcjlahqd08154kkrxmqraik7qshiwcqpw7p4",
+      "outputHashAlgo": "sha256",
+      "outputHashMode": "flat",
+      "type": "url",
+      "urls": [
+        "https://raw.githubusercontent.com/webtorrent/webtorrent-desktop/v0.21.0/static/linux/share/icons/hicolor/48x48/apps/webtorrent-desktop.png"
+      ],
+      "integrity": "sha256-5B5+MeMRanxmVBnXnuckJSDQMFRUsm7canktqBM3yQM=",
+      "inferredFetcher": "unclassified"
+    },
+    {
+      "outputHash": "0lw193jr7ldvln5x5z9p21rz1by46h0say9whfcw2kxs9vprd5b3",
+      "outputHashAlgo": "sha256",
+      "outputHashMode": "flat",
+      "type": "url",
+      "urls": [
+        "http://xuxen.eus/static/hunspell/eu_ES.dic"
+      ],
+      "integrity": "sha256-Y5WW7066T8GZgzx5pQE0xK/wcxA3/dKLpbvRk+VIgVM=",
+      "inferredFetcher": "unclassified"
+    },
+    {
+      "outputHash": "0wbhvypdr96a5ddg6kj41dn9sbl49n7pfi2vs762ij82hm2gvwcm",
+      "outputHashAlgo": "sha256",
+      "outputHashMode": "flat",
+      "type": "url",
+      "urls": [
+        "https://www.openprinting.org/download/printdriver/components/lsb3.2/main/RPMS/noarch/openprinting-ppds-postscript-lexmark-20160218-1lsb3.2.noarch.rpm"
+      ],
+      "integrity": "sha256-lfH9RIUCySjM0VtEd49NhC6dbAtETvNaK8qk3K7fcHE=",
+      "inferredFetcher": "unclassified"
+    },
+    {
+      "outputHash": "01gy84gr0gw5ap7hpy72azaf6hlzac7vxkn5cgad5sfbyzxgjgc9",
+      "outputHashAlgo": "sha256",
+      "outputHashMode": "flat",
+      "type": "url",
+      "urls": [
+        "https://wire-app.wire.com/linux/debian/pool/main/Wire-3.26.2941_amd64.deb"
+      ],
+      "integrity": "sha256-iT35+vfL6dLUY8XOvg9Tn0Lj1Ffi+AvPVYU/kB9B/gU=",
+      "inferredFetcher": "unclassified"
+    },
+    {
+      "type": "url",
+      "urls": [
+        "https://elpa.gnu.org/packages/zones.foobar"
+      ],
+      "integrity": "sha256-YRZc7dI3DjUzoSIp4fIshUyhMXIQ/fPKaKnjeYVa4WI="
     }
   ],
   "version":"1",
diff --git a/swh/lister/nixguix/tests/test_lister.py b/swh/lister/nixguix/tests/test_lister.py
index d19b9c55..78bde96b 100644
--- a/swh/lister/nixguix/tests/test_lister.py
+++ b/swh/lister/nixguix/tests/test_lister.py
@@ -8,6 +8,7 @@ import json
 import logging
 from pathlib import Path
 from typing import Dict, List
+from urllib.parse import urlparse
 
 import pytest
 import requests
@@ -15,11 +16,14 @@ from requests.exceptions import ConnectionError, InvalidSchema, SSLError
 
 from swh.lister import TARBALL_EXTENSIONS
 from swh.lister.nixguix.lister import (
+    DEFAULT_EXTENSIONS_TO_IGNORE,
     POSSIBLE_TARBALL_MIMETYPES,
     ArtifactNatureMistyped,
     ArtifactNatureUndetected,
+    ArtifactWithoutExtension,
     NixGuixLister,
     is_tarball,
+    url_endswith,
 )
 from swh.lister.pattern import ListerStats
 
@@ -43,6 +47,33 @@ def page_response(datadir, instance: str = "success") -> List[Dict]:
     return json.loads(datapath.read_text()) if datapath.exists else []
 
 
+@pytest.mark.parametrize(
+    "name,expected_result",
+    [(f"one.{ext}", True) for ext in TARBALL_EXTENSIONS]
+    + [(f"one.{ext}?foo=bar", True) for ext in TARBALL_EXTENSIONS]
+    + [(f"one?p0=1&foo=bar.{ext}", True) for ext in DEFAULT_EXTENSIONS_TO_IGNORE]
+    + [("two?file=something.el", False), ("foo?two=two&three=three", False)],
+)
+def test_url_endswith(name, expected_result):
+    """It should detect whether url or query params of the urls ends with extensions"""
+    urlparsed = urlparse(f"https://example.org/{name}")
+    assert (
+        url_endswith(
+            urlparsed,
+            TARBALL_EXTENSIONS + DEFAULT_EXTENSIONS_TO_IGNORE,
+            raise_when_no_extension=False,
+        )
+        is expected_result
+    )
+
+
+def test_url_endswith_raise():
+    """It should raise when the tested url has no extension"""
+    urlparsed = urlparse("https://example.org/foo?two=two&three=three")
+    with pytest.raises(ArtifactWithoutExtension):
+        url_endswith(urlparsed, ["unimportant"])
+
+
 @pytest.mark.parametrize(
     "tarballs",
     [[f"one.{ext}", f"two.{ext}"] for ext in TARBALL_EXTENSIONS]
@@ -254,10 +285,15 @@ def test_lister_nixguix_ok(datadir, swh_scheduler, requests_mock):
 
 
 def test_lister_nixguix_mostly_noop(datadir, swh_scheduler, requests_mock):
-    """NixGuixLister should ignore unsupported or incomplete origins"""
+    """NixGuixLister should ignore unsupported or incomplete or to ignore origins"""
     url = SOURCES["nixpkgs"]["manifest"]
     origin_upstream = SOURCES["nixpkgs"]["repo"]
-    lister = NixGuixLister(swh_scheduler, url=url, origin_upstream=origin_upstream)
+    lister = NixGuixLister(
+        swh_scheduler,
+        url=url,
+        origin_upstream=origin_upstream,
+        extensions_to_ignore=["foobar"],
+    )
 
     response = page_response(datadir, "failure")
 
-- 
GitLab