From 24734023d4cc531930c5862920f30797d9ed7116 Mon Sep 17 00:00:00 2001
From: "Antoine R. Dumont (@ardumont)" <ardumont@softwareheritage.org>
Date: Mon, 10 Jul 2023 12:30:31 +0200
Subject: [PATCH] gitweb: Deal with edge cases

- A metadata URL field can contain multiple URLs separated by commas.
- Some instances do not list the full URLs of their repositories.

Refs. swh/devel/swh-lister#1800
---
 swh/lister/gitweb/lister.py                        | 14 +++++++++-----
 .../data/https_git.distorted.org.uk/~mdw_doc_ips   |  3 +--
 swh/lister/gitweb/tests/test_lister.py             |  1 +
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/swh/lister/gitweb/lister.py b/swh/lister/gitweb/lister.py
index 3c902e4f..48c6e07d 100644
--- a/swh/lister/gitweb/lister.py
+++ b/swh/lister/gitweb/lister.py
@@ -85,7 +85,7 @@ class GitwebLister(StatelessLister[Repositories]):
             if not link:
                 continue
 
-            repo_url = link["href"]
+            repo_url = urljoin(self.url, link["href"]).strip("/")
 
             if repo_url.endswith("?o=descr"):
                 continue
@@ -147,11 +147,15 @@ class GitwebLister(StatelessLister[Repositories]):
                     return self._get_origin_from_repository_url(summary_url)
             else:
                 logger.debug("No summary tab found on %s", repository_url)
+        urls = []
+        for row in bs.find_all("tr", {"class": "metadata_url"}):
+            url = row.contents[-1].string.strip()
 
-        urls = [
-            row.contents[-1].string
-            for row in bs.find_all("tr", {"class": "metadata_url"})
-        ]
+            if "," in url:
+                urls_ = [s.strip() for s in url.split(",") if s]
+                urls.extend(urls_)
+            else:
+                urls.append(url)
 
         if not urls:
             logger.debug("No git urls found on %s", repository_url)
diff --git a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_doc_ips b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_doc_ips
index 796be693..83a90659 100644
--- a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_doc_ips
+++ b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_doc_ips
@@ -43,8 +43,7 @@ summary | <a href="https://git.distorted.org.uk/~mdw/doc/ips/shortlog">shortlog<
 <tr id="metadata_desc"><td>description</td><td>Introduction to Provable Security slides and notes</td></tr>
 <tr id="metadata_owner"><td>owner</td><td>Mark Wooding</td></tr>
 <tr id="metadata_lchange"><td>last change</td><td><span class="datetime">Wed, 1 Nov 2006 14:32:34 +0000</span> (14:32 +0000)</td></tr>
-<tr class="metadata_url"><td>URL</td><td>https://git.distorted.org.uk/~mdw/doc/ips</td></tr>
-<tr class="metadata_url"><td></td><td>git://git.distorted.org.uk/~mdw/doc/ips</td></tr>
+<tr class="metadata_url"><td>URL</td><td>https://git.distorted.org.uk/~mdw/doc/ips , git://git.distorted.org.uk/~mdw/doc/ips</td></tr>
 </table>
 <div class="header">
 <a class="title" href="https://git.distorted.org.uk/~mdw/doc/ips/shortlog">shortlog</a>
diff --git a/swh/lister/gitweb/tests/test_lister.py b/swh/lister/gitweb/tests/test_lister.py
index e52546d7..a3135861 100644
--- a/swh/lister/gitweb/tests/test_lister.py
+++ b/swh/lister/gitweb/tests/test_lister.py
@@ -71,6 +71,7 @@ def test_lister_gitweb_run(requests_mock_datadir, swh_scheduler):
         assert listed_origin.visit_type == "git"
         assert listed_origin.url.startswith(url)
         assert listed_origin.last_update is not None
+        assert "," not in listed_origin.url
 
     # test user agent content
     for request in requests_mock_datadir.request_history:
-- 
GitLab