diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py index 4a9aeab940653fd5b20247e7e4abdaa00ea07e8b..a74e6bcd428fc603a8e10508eda055a4347e2c18 100644 --- a/swh/lister/cgit/lister.py +++ b/swh/lister/cgit/lister.py @@ -94,9 +94,11 @@ class CGitLister(StatelessLister[Repositories]): page_results = [] - for tr in bs_idx.find("div", {"class": "content"}).find_all( - "tr", {"class": ""} - ): + for tr in bs_idx.find( + "table", {"class": re.compile("(list|project_list)")} + ).find_all("tr"): + if not tr.find("a"): + continue repository_link = tr.find("a")["href"] repo_url = None git_url = None @@ -189,6 +191,13 @@ class CGitLister(StatelessLister[Repositories]): # <link rel='vcs-git' href='https://...' title='...'/> urls = [x["href"] for x in bs.find_all("a", {"rel": "vcs-git"})] + if not urls: + # Try the Gitweb heuristic + urls = [ + row.contents[-1].string + for row in bs.find_all("tr", {"class": "metadata_url"}) + ] + if not urls: logger.debug("No git urls found on %s", repository_url) return None