From 24734023d4cc531930c5862920f30797d9ed7116 Mon Sep 17 00:00:00 2001 From: "Antoine R. Dumont (@ardumont)" <ardumont@softwareheritage.org> Date: Mon, 10 Jul 2023 12:30:31 +0200 Subject: [PATCH] gitweb: Deal with edge cases - Some metadata URL fields can contain multiple URLs separated by commas. - Some instances do not list the full URLs of their repositories. Refs. swh/devel/swh-lister#1800 --- swh/lister/gitweb/lister.py | 14 +++++++++----- .../data/https_git.distorted.org.uk/~mdw_doc_ips | 3 +-- swh/lister/gitweb/tests/test_lister.py | 1 + 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/swh/lister/gitweb/lister.py b/swh/lister/gitweb/lister.py index 3c902e4f..48c6e07d 100644 --- a/swh/lister/gitweb/lister.py +++ b/swh/lister/gitweb/lister.py @@ -85,7 +85,7 @@ class GitwebLister(StatelessLister[Repositories]): if not link: continue - repo_url = link["href"] + repo_url = urljoin(self.url, link["href"]).strip("/") if repo_url.endswith("?o=descr"): continue @@ -147,11 +147,15 @@ class GitwebLister(StatelessLister[Repositories]): return self._get_origin_from_repository_url(summary_url) else: logger.debug("No summary tab found on %s", repository_url) + urls = [] + for row in bs.find_all("tr", {"class": "metadata_url"}): + url = row.contents[-1].string.strip() - urls = [ - row.contents[-1].string - for row in bs.find_all("tr", {"class": "metadata_url"}) - ] + if "," in url: + urls_ = [s.strip() for s in url.split(",") if s] + urls.extend(urls_) + else: + urls.append(url) if not urls: logger.debug("No git urls found on %s", repository_url) diff --git a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_doc_ips b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_doc_ips index 796be693..83a90659 100644 --- a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_doc_ips +++ b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_doc_ips @@ -43,8 +43,7 @@ summary | <a 
href="https://git.distorted.org.uk/~mdw/doc/ips/shortlog">shortlog< <tr id="metadata_desc"><td>description</td><td>Introduction to Provable Security slides and notes</td></tr> <tr id="metadata_owner"><td>owner</td><td>Mark Wooding</td></tr> <tr id="metadata_lchange"><td>last change</td><td><span class="datetime">Wed, 1 Nov 2006 14:32:34 +0000</span> (14:32 +0000)</td></tr> -<tr class="metadata_url"><td>URL</td><td>https://git.distorted.org.uk/~mdw/doc/ips</td></tr> -<tr class="metadata_url"><td></td><td>git://git.distorted.org.uk/~mdw/doc/ips</td></tr> +<tr class="metadata_url"><td>URL</td><td>https://git.distorted.org.uk/~mdw/doc/ips , git://git.distorted.org.uk/~mdw/doc/ips</td></tr> </table> <div class="header"> <a class="title" href="https://git.distorted.org.uk/~mdw/doc/ips/shortlog">shortlog</a> diff --git a/swh/lister/gitweb/tests/test_lister.py b/swh/lister/gitweb/tests/test_lister.py index e52546d7..a3135861 100644 --- a/swh/lister/gitweb/tests/test_lister.py +++ b/swh/lister/gitweb/tests/test_lister.py @@ -71,6 +71,7 @@ def test_lister_gitweb_run(requests_mock_datadir, swh_scheduler): assert listed_origin.visit_type == "git" assert listed_origin.url.startswith(url) assert listed_origin.last_update is not None + assert "," not in listed_origin.url # test user agent content for request in requests_mock_datadir.request_history: -- GitLab