diff --git a/swh/lister/gitweb/lister.py b/swh/lister/gitweb/lister.py index 1710ba79556882c0655e1e8dafd517bfbaa2c18e..79fd7bf6e09c4121b330d5437158a7b2cacf4b58 100644 --- a/swh/lister/gitweb/lister.py +++ b/swh/lister/gitweb/lister.py @@ -130,6 +130,12 @@ class GitwebLister(StatelessLister[Repositories]): urls = [] for row in bs.find_all("tr", {"class": "metadata_url"}): url = row.contents[-1].string.strip() + for scheme in ("http", "https", "git"): + # remove any string prefix before origin + pos = url.find(f"{scheme}://") + if pos != -1: + url = url[pos:] + break if "," in url: urls_ = [s.strip() for s in url.split(",") if s] diff --git a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_firewall b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_firewall index 6113b2ac90ed33205fc1591d10e0446206bd6e7e..dbc00a033c5bc53cc1fcaa325e4eabcce76814ba 100644 --- a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_firewall +++ b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_firewall @@ -43,8 +43,8 @@ summary | <a href="https://git.distorted.org.uk/~mdw/firewall/shortlog">shortlog <tr id="metadata_desc"><td>description</td><td>Firewall scripts for distorted.org.uk.</td></tr> <tr id="metadata_owner"><td>owner</td><td>Mark Wooding</td></tr> <tr id="metadata_lchange"><td>last change</td><td><span class="datetime">Thu, 16 Mar 2023 18:09:32 +0000</span> (18:09 +0000)</td></tr> -<tr class="metadata_url"><td>URL</td><td>https://git.distorted.org.uk/~mdw/firewall</td></tr> -<tr class="metadata_url"><td></td><td>git://git.distorted.org.uk/~mdw/firewall</td></tr> +<tr class="metadata_url"><td>URL</td><td>fallback: https://git.distorted.org.uk/~mdw/firewall</td></tr> +<tr class="metadata_url"><td></td><td>fast: git://git.distorted.org.uk/~mdw/firewall</td></tr> </table> <div class="header"> <a class="title" href="https://git.distorted.org.uk/~mdw/firewall/shortlog">shortlog</a> @@ -164,4 +164,4 @@ window.onload = function () { }; </script> </body> -</html> \ No newline at end of file +</html>