diff --git a/swh/lister/gitweb/lister.py b/swh/lister/gitweb/lister.py index 62f4f80f7c3360b7b8e56ed4e21537e46cf37f22..0bd2d66e92aefce7ec04d875756b7b3034ce9d34 100644 --- a/swh/lister/gitweb/lister.py +++ b/swh/lister/gitweb/lister.py @@ -6,7 +6,7 @@ from datetime import timezone import logging import re from typing import Any, Dict, Iterator, List, Optional -from urllib.parse import urljoin, urlparse +from urllib.parse import parse_qs, urljoin, urlparse from bs4 import BeautifulSoup from dateparser import parse @@ -86,7 +86,7 @@ class GitwebLister(StatelessLister[Repositories]): if not link: continue - repo_url = link["href"] + repo_url = urljoin(self.url, link["href"]).strip("/") if repo_url.endswith("?o=descr"): continue @@ -136,30 +136,19 @@ class GitwebLister(StatelessLister[Repositories]): ) return None - # check if we are on the summary tab, if not, go to this tab - tab = bs.find("table", {"class": "tabs"}) - if tab: - summary_a = tab.find("a", string="summary") - if summary_a: - summary_url = urljoin(repository_url, summary_a["href"]).strip("/") - - if summary_url != repository_url: - logger.debug( - "%s : Active tab is not the summary, trying to load the summary page", - repository_url, - ) - return self._get_origin_from_repository_url(summary_url) - else: - logger.debug("No summary tab found on %s", repository_url) + urls = [] + for row in bs.find_all("tr", {"class": "metadata_url"}): + url = row.contents[-1].string.strip() - urls = [ - row.contents[-1].string - for row in bs.find_all("tr", {"class": "metadata_url"}) - ] + if "," in url: + urls_ = [s.strip() for s in url.split(",") if s] + urls.extend(urls_) + else: + urls.append(url) if not urls: logger.debug("No git urls found on %s", repository_url) - return None + return try_to_determine_git_repository(repository_url) # look for the http/https url, if any, and use it as origin_url for url in urls: @@ -170,3 +159,21 @@ class GitwebLister(StatelessLister[Repositories]): # otherwise, choose the first one 
def try_to_determine_git_repository(repository_url: str) -> Optional[str]:
    """Guess a ``git://`` clone url from a gitweb repository page url.

    Some gitweb instances do not advertise the git urls. This heuristic works
    on instances demonstrating this behavior: the repository path is read from
    the ``p`` query parameter and appended to the host of ``repository_url``.

    Args:
        repository_url: url of a gitweb repository page, e.g.
            ``https://host?p=project.git;a=summary``

    Returns:
        A ``git://<host>/<repo>`` url, or None when the url carries no usable
        ``p`` query parameter.

    """
    parsed_url = urlparse(repository_url)
    params = parse_qs(parsed_url.query).get("p")
    if not params:
        return None

    repo = params[0]
    # When parse_qs only splits on "&", the gitweb action stays glued to the
    # repository path. Remove it as a real suffix: str.rstrip() treats its
    # argument as a set of characters and would also eat trailing characters
    # of the repository name itself (e.g. "docs/summary" -> "docs/").
    summary_suffix = ";a=summary"
    if repo.endswith(summary_suffix):
        repo = repo[: -len(summary_suffix)]

    if not repo:
        # Nothing left to build an origin url from
        return None

    return f"git://{parsed_url.netloc}/{repo}"
@pytest.mark.parametrize(
    "url,expected_repo",
    [
        # repository path given alone in the "p" query parameter
        (
            "https://git.shadowcat.co.uk?p=urisagit/gitosis-admin.git",
            "git://git.shadowcat.co.uk/urisagit/gitosis-admin.git",
        ),
        # repository path followed by the gitweb "summary" action
        (
            "https://git.shadowcat.co.uk?p=File-Slurp.git;a=summary",
            "git://git.shadowcat.co.uk/File-Slurp.git",
        ),
        # no "p" query parameter: nothing can be determined
        ("https://domain.org/foobar", None),
    ],
)
def test_try_to_determine_git_repository(url, expected_repo):
    """Check the git url guessed from a gitweb repository page url."""
    actual_repo = try_to_determine_git_repository(url)
    assert actual_repo == expected_repo