From 59a979642f102ca399b3b0d53e3af44a33fcf55a Mon Sep 17 00:00:00 2001 From: Antoine Lambert <anlambert@softwareheritage.org> Date: Tue, 26 Sep 2023 14:18:36 +0200 Subject: [PATCH] gitweb: Ensure to strip any prefix before git clone URL Some gitweb instances can have some string prefixes before the displayed git clone URLs so ensure to strip them to properly extract URLs. Related to swh/infra/sysadm-environment#5051. --- swh/lister/gitweb/lister.py | 6 ++++++ .../tests/data/https_git.distorted.org.uk/~mdw_firewall | 6 +++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/swh/lister/gitweb/lister.py b/swh/lister/gitweb/lister.py index 1710ba79..79fd7bf6 100644 --- a/swh/lister/gitweb/lister.py +++ b/swh/lister/gitweb/lister.py @@ -130,6 +130,12 @@ class GitwebLister(StatelessLister[Repositories]): urls = [] for row in bs.find_all("tr", {"class": "metadata_url"}): url = row.contents[-1].string.strip() + for scheme in ("http", "https", "git"): + # remove any string prefix before origin + pos = url.find(f"{scheme}://") + if pos != -1: + url = url[pos:] + break if "," in url: urls_ = [s.strip() for s in url.split(",") if s] diff --git a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_firewall b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_firewall index 6113b2ac..dbc00a03 100644 --- a/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_firewall +++ b/swh/lister/gitweb/tests/data/https_git.distorted.org.uk/~mdw_firewall @@ -43,8 +43,8 @@ summary | <a href="https://git.distorted.org.uk/~mdw/firewall/shortlog">shortlog <tr id="metadata_desc"><td>description</td><td>Firewall scripts for distorted.org.uk.</td></tr> <tr id="metadata_owner"><td>owner</td><td>Mark Wooding</td></tr> <tr id="metadata_lchange"><td>last change</td><td><span class="datetime">Thu, 16 Mar 2023 18:09:32 +0000</span> (18:09 +0000)</td></tr> -<tr class="metadata_url"><td>URL</td><td>https://git.distorted.org.uk/~mdw/firewall</td></tr> -<tr class="metadata_url"><td></td><td>git://git.distorted.org.uk/~mdw/firewall</td></tr> +<tr class="metadata_url"><td>URL</td><td>fallback: https://git.distorted.org.uk/~mdw/firewall</td></tr> +<tr class="metadata_url"><td></td><td>fast: git://git.distorted.org.uk/~mdw/firewall</td></tr> </table> <div class="header"> <a class="title" href="https://git.distorted.org.uk/~mdw/firewall/shortlog">shortlog</a> @@ -164,4 +164,4 @@ window.onload = function () { }; </script> </body> -</html> \ No newline at end of file +</html> -- GitLab