Skip to content
Snippets Groups Projects
Verified Commit f7128cfc authored by Antoine Lambert's avatar Antoine Lambert
Browse files

gitweb: Check git URLs can be cloned before creating loading tasks

Git URLs scapped from gitweb pages or derived from project URLs are
now checked for clonability before sending them to the scheduler.

It allows to discard invalid clone URLs as it exists cases where only
one URL can be cloned from the list provided by a gitweb project.

This slightly slow downs the listing process but ensures better result.

Fixes #4713.
parent f2f9c7d1
No related branches found
Tags v0.0.4
No related merge requests found
Pipeline #13823 passed
# Copyright (C) 2023-2024 The Software Heritage developers
# Copyright (C) 2023-2025 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
......@@ -9,6 +10,7 @@ from urllib.parse import parse_qs, urljoin, urlparse
from bs4 import BeautifulSoup
from dateparser import parse
from dulwich.porcelain import ls_remote
from requests.exceptions import HTTPError
from swh.lister.pattern import CredentialsType, StatelessLister
......@@ -20,6 +22,17 @@ logger = logging.getLogger(__name__)
Repositories = List[Dict[str, Any]]
def valid_git_repo_url(git_repo_url: str) -> bool:
try:
ls_remote(git_repo_url)
except Exception:
logger.debug("Git URL %s cannot be cloned", git_repo_url)
return False
else:
logger.debug("Git URL %s can be cloned", git_repo_url)
return True
class GitwebLister(StatelessLister[Repositories]):
"""Lister class for Gitweb repositories.
......@@ -132,40 +145,32 @@ class GitwebLister(StatelessLister[Repositories]):
urls = []
for row in bs.select("tr.metadata_url"):
url = row.select("td")[-1].text.strip()
for scheme in ("http", "https", "git"):
# remove any string prefix before origin
pos = url.find(f"{scheme}://")
if pos != -1:
url = url[pos:]
break
if "," in url:
urls_ = [s.strip() for s in url.split(",") if s]
urls.extend(urls_)
else:
urls.append(url)
for url in row.select("td")[-1].text.strip().split(","):
for scheme in ("http", "https", "git"):
# remove any string prefix before origin
pos = url.find(f"{scheme}://")
if pos != -1:
urls.append(url[pos:].strip())
if not urls:
repo = try_to_determine_git_repository(repository_url, self.base_git_url)
if not repo:
logger.debug("No git urls found on %s", repository_url)
return repo
else:
urls = [repo]
# look for the http/https url, if any, and use it as origin_url
for url in urls:
# look for the first valid clone URL and check those with http* scheme first
for url in reversed(sorted(urls)):
if valid_git_repo_url(url):
return url
parsed_url = urlparse(url)
if parsed_url.scheme == "https":
origin_url = url
break
elif parsed_url.scheme == "http" and self.instance_scheme == "https":
if parsed_url.scheme == "http" and self.instance_scheme == "https":
# workaround for non-working listed http origins
origin_url = url.replace("http://", "https://")
break
else:
# otherwise, choose the first one
origin_url = urls[0]
return origin_url
if valid_git_repo_url(origin_url):
return origin_url
return None
def try_to_determine_git_repository(
......
# Copyright (C) 2023-2024 The Software Heritage developers
# Copyright (C) 2023-2025 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
......@@ -19,6 +20,11 @@ MAIN_INSTANCE = "git.distorted.org.uk"
MAIN_INSTANCE_URL = f"https://{MAIN_INSTANCE}/~mdw"
@pytest.fixture(autouse=True)
def valid_git_repo_url_mocker(mocker):
mocker.patch("swh.lister.gitweb.lister.valid_git_repo_url").return_value = True
def test_lister_gitweb_instantiate(swh_scheduler):
"""Build a lister with either an url or an instance is supported."""
url = MAIN_INSTANCE_URL
......@@ -75,8 +81,7 @@ def test_lister_gitweb_run(requests_mock_datadir, swh_scheduler):
# test listed repositories
for listed_origin in scheduler_origins:
assert listed_origin.visit_type == "git"
assert listed_origin.url.startswith(url)
assert listed_origin.url.startswith("https://")
assert listed_origin.url.startswith((url, url.replace("https://", "http://")))
assert listed_origin.last_update is not None
assert "," not in listed_origin.url
......@@ -88,6 +93,36 @@ def test_lister_gitweb_run(requests_mock_datadir, swh_scheduler):
assert __version__ in user_agent
def test_lister_gitweb_run_https_urls_down(
requests_mock_datadir, swh_scheduler, mocker
):
"""URLs with git scheme should be picked when those with the
https scheme are down."""
mocker.patch("swh.lister.gitweb.lister.valid_git_repo_url").side_effect = (
lambda url: url.startswith("git://")
)
url = MAIN_INSTANCE_URL
lister_gitweb = GitwebLister(swh_scheduler, url=url)
stats = lister_gitweb.run()
expected_nb_origins = 7 # main page will get filtered out
assert stats == ListerStats(pages=1, origins=expected_nb_origins)
scheduler_origins = swh_scheduler.get_listed_origins(
lister_gitweb.lister_obj.id
).results
# test listed repositories
for listed_origin in scheduler_origins:
assert listed_origin.visit_type == "git"
assert listed_origin.url.startswith("git://")
assert listed_origin.last_update is not None
assert "," not in listed_origin.url
def test_lister_gitweb_get_pages_with_pages_and_retry(
requests_mock_datadir, requests_mock, datadir, mocker, swh_scheduler
):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment