diff --git a/swh/lister/gitweb/lister.py b/swh/lister/gitweb/lister.py index 3c902e4f7d019771476f6b71cc3bf60df6cbd807..48c6e07d5990762ccae73875b4f3361beb878c1c 100644 --- a/swh/lister/gitweb/lister.py +++ b/swh/lister/gitweb/lister.py @@ -85,7 +85,7 @@ class GitwebLister(StatelessLister[Repositories]): if not link: continue - repo_url = link["href"] + repo_url = urljoin(self.url, link["href"]).strip("/") if repo_url.endswith("?o=descr"): continue @@ -147,11 +147,15 @@ class GitwebLister(StatelessLister[Repositories]): return self._get_origin_from_repository_url(summary_url) else: logger.debug("No summary tab found on %s", repository_url) + urls = [] + for row in bs.find_all("tr", {"class": "metadata_url"}): + url = row.contents[-1].string.strip() - urls = [ - row.contents[-1].string - for row in bs.find_all("tr", {"class": "metadata_url"}) - ] + if "," in url: + urls_ = [s.strip() for s in url.split(",") if s] + urls.extend(urls_) + else: + urls.append(url) if not urls: logger.debug("No git urls found on %s", repository_url) diff --git a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_doc_ips b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_doc_ips index 796be69335ea073290c04c9d841c6edc67e2b6c2..83a9065966675ff1be3f6dd7b80ec2a9b4df5aca 100644 --- a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_doc_ips +++ b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_doc_ips @@ -43,8 +43,7 @@ summary | <a href="https://git.distorted.org.uk/~mdw/doc/ips/shortlog">shortlog< <tr id="metadata_desc"><td>description</td><td>Introduction to Provable Security slides and notes</td></tr> <tr id="metadata_owner"><td>owner</td><td>Mark Wooding</td></tr> <tr id="metadata_lchange"><td>last change</td><td><span class="datetime">Wed, 1 Nov 2006 14:32:34 +0000</span> (14:32 +0000)</td></tr> -<tr class="metadata_url"><td>URL</td><td>https://git.distorted.org.uk/~mdw/doc/ips</td></tr> -<tr class="metadata_url"><td></td><td>git://git.distorted.org.uk/~mdw/doc/ips</td></tr> +<tr class="metadata_url"><td>URL</td><td>https://git.distorted.org.uk/~mdw/doc/ips , git://git.distorted.org.uk/~mdw/doc/ips</td></tr> </table> <div class="header"> <a class="title" href="https://git.distorted.org.uk/~mdw/doc/ips/shortlog">shortlog</a> diff --git a/swh/lister/gitweb/tests/test_lister.py b/swh/lister/gitweb/tests/test_lister.py index e52546d7e4614998986251bfbd791777e6b42d4d..a3135861ec26c885670f00925c3e082bc2edb35e 100644 --- a/swh/lister/gitweb/tests/test_lister.py +++ b/swh/lister/gitweb/tests/test_lister.py @@ -71,6 +71,7 @@ def test_lister_gitweb_run(requests_mock_datadir, swh_scheduler): assert listed_origin.visit_type == "git" assert listed_origin.url.startswith(url) assert listed_origin.last_update is not None + assert "," not in listed_origin.url # test user agent content for request in requests_mock_datadir.request_history: