def test_maven_ignore_invalid_url(
    swh_scheduler,
    requests_mock,
    datadir,
):
    """Run a full (non-incremental) listing where the pom served at ``URL_POM_1``
    carries an scm URL that is not a valid origin URL, and check that only the
    remaining git origin and the maven origin end up in the scheduler."""

    invalid_pom_path = Path(datadir, "sprova4j-0.1.0.invalidurl.pom")

    lister = MavenLister(
        scheduler=swh_scheduler,
        url=MVN_URL,
        instance="maven.org",
        index_url=INDEX_URL,
        incremental=False,
    )

    # Serve the pom with the invalid scm URL in place of the regular one.
    requests_mock.get(URL_POM_1, content=invalid_pom_path.read_bytes())

    stats = lister.run()

    assert stats.pages == 5

    listed_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    listed_urls = [listed.url for listed in listed_origins]

    # One git origin (the one with the invalid URL is dropped) and one maven
    # origin with 2 releases (one per jar); no URL may be recorded twice.
    assert set(listed_urls) == {ORIGIN_GIT_INCR, ORIGIN_SRC}
    assert len(listed_urls) == len(set(listed_urls))

    maven_origins = [
        listed for listed in listed_origins if listed.visit_type == "maven"
    ]
    for maven_origin in maven_origins:
        # The origin must be at least as recent as each of its source artifacts.
        for artifact in LIST_SRC_DATA:
            artifact_date = iso8601.parse_date(artifact["time"])
            assert artifact_date <= maven_origin.last_update
        assert maven_origin.extra_loader_arguments["artifacts"] == list(LIST_SRC_DATA)

    state = lister.get_state_from_scheduler()
    assert state is not None
    # Both state markers are expected to remain at -1 after this full listing.
    assert state.last_seen_doc == -1
    assert state.last_seen_pom == -1
f"file://{datadir}/fake_opam_repo" +def test_opam_binary(datadir, swh_scheduler, tmp_path, mocker): + from swh.lister.opam.lister import opam_init + + instance_url = "http://example.org/fake_opam_repo" + + def mock_opam_init(opam_root, instance, url, env): + assert url == instance_url + return opam_init(opam_root, instance, f"{datadir}/fake_opam_repo", env) + + # Patch opam_init to use the local directory + mocker.patch("swh.lister.opam.lister.opam_init", side_effect=mock_opam_init) lister = OpamLister( swh_scheduler, @@ -141,8 +150,17 @@ def test_opam_binary(datadir, swh_scheduler, tmp_path): assert expected_urls == result_urls -def test_opam_multi_instance(datadir, swh_scheduler, tmp_path): - instance_url = f"file://{datadir}/fake_opam_repo" +def test_opam_multi_instance(datadir, swh_scheduler, tmp_path, mocker): + from swh.lister.opam.lister import opam_init + + instance_url = "http://example.org/fake_opam_repo" + + def mock_opam_init(opam_root, instance, url, env): + assert url == instance_url + return opam_init(opam_root, instance, f"{datadir}/fake_opam_repo", env) + + # Patch opam_init to use the local directory + mocker.patch("swh.lister.opam.lister.opam_init", side_effect=mock_opam_init) lister = OpamLister( swh_scheduler, diff --git a/swh/lister/packagist/tests/data/payrix_payrix-php.json b/swh/lister/packagist/tests/data/payrix_payrix-php.json new file mode 100644 index 0000000000000000000000000000000000000000..43a6c77dd6ec3964490353c4ffe923e4758e824b --- /dev/null +++ b/swh/lister/packagist/tests/data/payrix_payrix-php.json @@ -0,0 +1,151 @@ +{ + "packages": { + "payrix/payrix-php": { + "dev-master": { + "name": "payrix/payrix-php", + "description": "PayrixPHP PHP SDK package", + "keywords": [], + "homepage": "https://portal.payrix.com", + "version": "dev-master", + "version_normalized": "9999999-dev", + "license": [ + "Apache-2.0" + ], + "authors": [], + "source": { + "url": "git@gitlab.com:payrix/public/payrix-php.git", + "type": "git", + "reference": 
"cf02195d3c32424396932e087824bf581966e703" + }, + "dist": { + "url": "https://gitlab.com/api/v4/projects/payrix%2Fpublic%2Fpayrix-php/repository/archive.zip?sha=cf02195d3c32424396932e087824bf581966e703", + "type": "zip", + "shasum": "", + "reference": "cf02195d3c32424396932e087824bf581966e703" + }, + "type": "library", + "time": "2021-05-25T14:12:28+00:00", + "autoload": { + "psr-4": { + "PayrixPHP\\": "lib/" + } + }, + "default-branch": true, + "require": { + "php": ">=5.4.0", + "ext-curl": "*", + "ext-openssl": "*" + }, + "uid": 4416889 + }, + "v2.0.0": { + "name": "payrix/payrix-php", + "description": "PayrixPHP PHP SDK package", + "keywords": [], + "homepage": "https://portal.payrix.com", + "version": "v2.0.0", + "version_normalized": "2.0.0.0", + "license": [ + "Apache-2.0" + ], + "authors": [], + "source": { + "url": "https://gitlab.com/payrix/public/payrix-php.git", + "type": "git", + "reference": "4b40ad457a5cdbddb384b4d8f2c62d8d8c04ce68" + }, + "dist": { + "url": "https://gitlab.com/api/v4/projects/payrix%2Fpublic%2Fpayrix-php/repository/archive.zip?sha=4b40ad457a5cdbddb384b4d8f2c62d8d8c04ce68", + "type": "zip", + "shasum": "", + "reference": "4b40ad457a5cdbddb384b4d8f2c62d8d8c04ce68" + }, + "type": "library", + "time": "2020-09-03T11:26:52+00:00", + "autoload": { + "psr-4": { + "PayrixPHP\\": "lib/" + } + }, + "require": { + "php": ">=5.4.0", + "ext-curl": "*", + "ext-openssl": "*" + }, + "uid": 4416947 + }, + "v2.0.1": { + "name": "payrix/payrix-php", + "description": "PayrixPHP PHP SDK package", + "keywords": [], + "homepage": "https://portal.payrix.com", + "version": "v2.0.1", + "version_normalized": "2.0.1.0", + "license": [ + "Apache-2.0" + ], + "authors": [], + "source": { + "url": "https://gitlab.com/payrix/public/payrix-php.git", + "type": "git", + "reference": "9693f2dff0a589e16c88a9bf838069ab89166103" + }, + "dist": { + "url": 
"https://gitlab.com/api/v4/projects/payrix%2Fpublic%2Fpayrix-php/repository/archive.zip?sha=9693f2dff0a589e16c88a9bf838069ab89166103", + "type": "zip", + "shasum": "", + "reference": "9693f2dff0a589e16c88a9bf838069ab89166103" + }, + "type": "library", + "time": "2021-05-10T02:32:57+00:00", + "autoload": { + "psr-4": { + "PayrixPHP\\": "lib/" + } + }, + "require": { + "php": ">=5.4.0", + "ext-curl": "*", + "ext-openssl": "*" + }, + "uid": 5183918 + }, + "v2.0.2": { + "name": "payrix/payrix-php", + "description": "PayrixPHP PHP SDK package", + "keywords": [], + "homepage": "https://portal.payrix.com", + "version": "v2.0.2", + "version_normalized": "2.0.2.0", + "license": [ + "Apache-2.0" + ], + "authors": [], + "source": { + "url": "https://gitlab.com/payrix/public/payrix-php.git", + "type": "git", + "reference": "cf02195d3c32424396932e087824bf581966e703" + }, + "dist": { + "url": "https://gitlab.com/api/v4/projects/payrix%2Fpublic%2Fpayrix-php/repository/archive.zip?sha=cf02195d3c32424396932e087824bf581966e703", + "type": "zip", + "shasum": "", + "reference": "cf02195d3c32424396932e087824bf581966e703" + }, + "type": "library", + "time": "2021-05-25T10:12:28+00:00", + "autoload": { + "psr-4": { + "PayrixPHP\\": "lib/" + } + }, + "require": { + "php": ">=5.4.0", + "ext-curl": "*", + "ext-openssl": "*" + }, + "uid": 5232658 + } + } + } +} diff --git a/swh/lister/packagist/tests/data/with_invalid_url.json b/swh/lister/packagist/tests/data/with_invalid_url.json new file mode 100644 index 0000000000000000000000000000000000000000..4b281ea33e59a0317fc88f7366c5ca8df3845815 --- /dev/null +++ b/swh/lister/packagist/tests/data/with_invalid_url.json @@ -0,0 +1,24 @@ +{ + "packages": { + "ycms/module-main": { + "dev-master": { + "name": "with/invalid_url", + "description": "", + "keywords": [], + "homepage": "", + "version": "dev-master", + "version_normalized": "9999999-dev", + "license": [], + "authors": [], + "source": { + "type": "git", + "url": 
"git@example.org/invalid/url.git", + "reference": "0000000000000000000000000000000000000000" + }, + "time": "2015-08-23T04:42:33+00:00", + "default-branch": true, + "uid": 4064797 + } + } + } +} diff --git a/swh/lister/packagist/tests/test_lister.py b/swh/lister/packagist/tests/test_lister.py index e2782eee564b5e5e7a937a28dac9a8f1cdcdfaf6..4f512a2399756674fd0b36749c4aa582723fc65b 100644 --- a/swh/lister/packagist/tests/test_lister.py +++ b/swh/lister/packagist/tests/test_lister.py @@ -14,7 +14,9 @@ _packages_list = { "ljjackson/linnworks", "lky/wx_article", "spryker-eco/computop-api", - "idevlab/essential", + "idevlab/essential", # Git SSH URL + "payrix/payrix-php", + "with/invalid_url", # invalid URL ] } @@ -49,7 +51,7 @@ def test_packagist_lister(swh_scheduler, requests_mock, datadir, requests_mock_d stats = lister.run() assert stats.pages == 1 - assert stats.origins == len(_packages_list["packageNames"]) + assert stats.origins == len(_packages_list["packageNames"]) - 2 assert lister.updated expected_origins = { @@ -69,9 +71,9 @@ def test_packagist_lister(swh_scheduler, requests_mock, datadir, requests_mock_d datetime.datetime.fromisoformat("2020-06-22T15:50:29+00:00"), ), ( - "git@gitlab.com:idevlab/Essential.git", # not GitHub + "https://gitlab.com/payrix/public/payrix-php.git", # not GitHub "git", - datetime.datetime.fromisoformat("2022-10-12T10:34:29+00:00"), + datetime.datetime.fromisoformat("2021-05-25T14:12:28+00:00"), ), } diff --git a/swh/lister/pattern.py b/swh/lister/pattern.py index 5b3a33d619673ebe9927907d87f76b164c144dc1..8a1b497a4bad94b528faed09b8a824b81caa2cf4 100644 --- a/swh/lister/pattern.py +++ b/swh/lister/pattern.py @@ -20,7 +20,7 @@ from swh.scheduler import get_scheduler, model from swh.scheduler.interface import SchedulerInterface from . 
def is_valid_origin_url(url: Optional[str]) -> bool:
    """Returns whether the given string is a valid origin URL.
    This excludes Git SSH URLs and pseudo-URLs (eg. ``ssh://git@example.org:foo``
    and ``git@example.org:foo``), as they are not supported by the Git loader
    and usually require authentication.

    Args:
        url: the candidate origin URL; ``None`` and the empty string are
            accepted and reported as invalid.

    Returns:
        ``True`` if the URL parses with a network location and a scheme other
        than ``ssh``, ``False`` otherwise. This function never raises on
        malformed input.

    All HTTP URLs are allowed:

    >>> is_valid_origin_url("http://example.org/repo.git")
    True
    >>> is_valid_origin_url("http://example.org/repo")
    True
    >>> is_valid_origin_url("https://example.org/repo")
    True
    >>> is_valid_origin_url("https://foo:bar@example.org/repo")
    True

    Scheme-less URLs are rejected:

    >>> is_valid_origin_url("example.org/repo")
    False
    >>> is_valid_origin_url("example.org:repo")
    False

    Git SSH URLs and pseudo-URLs are rejected:

    >>> is_valid_origin_url("git@example.org:repo")
    False
    >>> is_valid_origin_url("ssh://git@example.org:repo")
    False

    URLs the parser refuses outright are rejected instead of raising:

    >>> is_valid_origin_url("http://[invalid")
    False
    """
    if not url:
        # Empty or None
        return False

    try:
        parsed = urllib.parse.urlparse(url)
    except ValueError:
        # urlparse raises on a malformed authority (eg. unbalanced "[" in the
        # host); such a string cannot be a usable origin URL, and a validator
        # must not abort the listing that calls it.
        return False

    if not parsed.netloc:
        # Is parsed as a relative URL
        return False

    # Git SSH URLs are the only scheme rejected once a netloc is present.
    return parsed.scheme != "ssh"