# Reviewer summary (commentary placed ahead of the first `diff --git` header).
#
# Files touched:
#   * swh/lister/gitweb/lister.py
#   * swh/lister/gitweb/tests/test_lister.py
#
# Both files: the copyright range changes from 2023-2024 to 2023-2025 and a
# "See the AUTHORS file ..." header line is added.
#
# lister.py (hunk inside class GitwebLister): reworks how `origin_url` is
# picked while looping over `urls`.
#   - When `self.base_git_url` is truthy, the first URL that starts with
#     `self.base_git_url` is selected and the loop stops.
#   - Otherwise the previous selection is kept: the first URL whose scheme is
#     "https" is selected; an "http" URL is selected, rewritten to "https://",
#     when `self.instance_scheme == "https"`.
#   - The unchanged trailing `else:` branch assigns `origin_url = urls[0]`.
#
# NOTE(review): the line structure of this patch has been collapsed, so the
# indentation of the Python code is not visible here. The trailing `else:`
# ("otherwise, choose the first one") presumably belongs to the `for` loop
# -- confirm against the real file.
# NOTE(review): if that reading is right, then with `base_git_url` set and no
# clone URL starting with it, selection falls straight through to `urls[0]`
# without applying the http/https preference -- confirm this is intended.
# NOTE(review): `url.replace("http://", "https://")` is called without a count
# argument, so every occurrence in the URL is rewritten, not only the scheme
# prefix.
#
# tests/test_lister.py: adds `test_lister_gitweb_run_with_base_git_url`.
#   - Builds a GitwebLister with `url=MAIN_INSTANCE_URL` and
#     `base_git_url=f"git://{MAIN_INSTANCE}/"`, then calls `run()`.
#   - Expects `ListerStats(pages=1, origins=7)`.
#   - For every listed origin: `visit_type == "git"`, the URL starts with
#     "git://", `last_update` is not None, and the URL contains no comma.
#
diff --git a/swh/lister/gitweb/lister.py b/swh/lister/gitweb/lister.py index 7ac71b08e36e361ea5768dc36aae7acb6a76724a..8eb40a02f6bfe005c3e0bc73353c37328f0ed16c 100644 --- a/swh/lister/gitweb/lister.py +++ b/swh/lister/gitweb/lister.py @@ -1,4 +1,5 @@ -# Copyright (C) 2023-2024 The Software Heritage developers +# Copyright (C) 2023-2025 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -152,16 +153,22 @@ class GitwebLister(StatelessLister[Repositories]): logger.debug("No git urls found on %s", repository_url) return repo - # look for the http/https url, if any, and use it as origin_url for url in urls: - parsed_url = urlparse(url) - if parsed_url.scheme == "https": - origin_url = url - break - elif parsed_url.scheme == "http" and self.instance_scheme == "https": - # workaround for non-working listed http origins - origin_url = url.replace("http://", "https://") - break + # if base_git_url is provided, return the clone URL starting with it + if self.base_git_url: + if url.startswith(self.base_git_url): + origin_url = url + break + else: + # look for the http/https url, if any, and use it as origin_url + parsed_url = urlparse(url) + if parsed_url.scheme == "https": + origin_url = url + break + elif parsed_url.scheme == "http" and self.instance_scheme == "https": + # workaround for non-working listed http origins + origin_url = url.replace("http://", "https://") + break else: # otherwise, choose the first one origin_url = urls[0] diff --git a/swh/lister/gitweb/tests/test_lister.py b/swh/lister/gitweb/tests/test_lister.py index 63534215c8f4f6c5ddb3d3c876b89960b752a7f3..3f55a4928ccfb5b1203ea2efde5d469ba2ae2682 100644 --- a/swh/lister/gitweb/tests/test_lister.py +++ b/swh/lister/gitweb/tests/test_lister.py @@ -1,4 +1,5 @@ -# Copyright (C) 2023-2024 The Software Heritage developers +# Copyright (C) 
2023-2025 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -88,6 +89,32 @@ def test_lister_gitweb_run(requests_mock_datadir, swh_scheduler): assert __version__ in user_agent +def test_lister_gitweb_run_with_base_git_url(requests_mock_datadir, swh_scheduler): + """Clone URLs starting with base_git_url should be picked.""" + + url = MAIN_INSTANCE_URL + lister_gitweb = GitwebLister( + swh_scheduler, url=url, base_git_url=f"git://{MAIN_INSTANCE}/" + ) + + stats = lister_gitweb.run() + + expected_nb_origins = 7 # main page will get filtered out + assert stats == ListerStats(pages=1, origins=expected_nb_origins) + + # test page parsing + scheduler_origins = swh_scheduler.get_listed_origins( + lister_gitweb.lister_obj.id + ).results + + # test listed repositories + for listed_origin in scheduler_origins: + assert listed_origin.visit_type == "git" + assert listed_origin.url.startswith("git://") + assert listed_origin.last_update is not None + assert "," not in listed_origin.url + + def test_lister_gitweb_get_pages_with_pages_and_retry( requests_mock_datadir, requests_mock, datadir, mocker, swh_scheduler ):