Skip to content
Snippets Groups Projects
Verified Commit 24734023 authored by Antoine R. Dumont
Browse files

gitweb: Deal with edge cases

- Some metadata URL fields can contain multiple URLs separated by commas.
- Some instances do not list the full URL of their repositories.

Refs. swh/devel/swh-lister#1800
parent 00585f72
No related branches found
No related tags found
No related merge requests found
Pipeline #3498 passed
...@@ -85,7 +85,7 @@ class GitwebLister(StatelessLister[Repositories]): ...@@ -85,7 +85,7 @@ class GitwebLister(StatelessLister[Repositories]):
if not link: if not link:
continue continue
repo_url = link["href"] repo_url = urljoin(self.url, link["href"]).strip("/")
if repo_url.endswith("?o=descr"): if repo_url.endswith("?o=descr"):
continue continue
...@@ -147,11 +147,15 @@ class GitwebLister(StatelessLister[Repositories]): ...@@ -147,11 +147,15 @@ class GitwebLister(StatelessLister[Repositories]):
return self._get_origin_from_repository_url(summary_url) return self._get_origin_from_repository_url(summary_url)
else: else:
logger.debug("No summary tab found on %s", repository_url) logger.debug("No summary tab found on %s", repository_url)
urls = []
for row in bs.find_all("tr", {"class": "metadata_url"}):
url = row.contents[-1].string.strip()
urls = [ if "," in url:
row.contents[-1].string urls_ = [s.strip() for s in url.split(",") if s]
for row in bs.find_all("tr", {"class": "metadata_url"}) urls.extend(urls_)
] else:
urls.append(url)
if not urls: if not urls:
logger.debug("No git urls found on %s", repository_url) logger.debug("No git urls found on %s", repository_url)
......
...@@ -43,8 +43,7 @@ summary | <a href="https://git.distorted.org.uk/~mdw/doc/ips/shortlog">shortlog< ...@@ -43,8 +43,7 @@ summary | <a href="https://git.distorted.org.uk/~mdw/doc/ips/shortlog">shortlog<
<tr id="metadata_desc"><td>description</td><td>Introduction to Provable Security slides and notes</td></tr> <tr id="metadata_desc"><td>description</td><td>Introduction to Provable Security slides and notes</td></tr>
<tr id="metadata_owner"><td>owner</td><td>Mark Wooding</td></tr> <tr id="metadata_owner"><td>owner</td><td>Mark Wooding</td></tr>
<tr id="metadata_lchange"><td>last change</td><td><span class="datetime">Wed, 1 Nov 2006 14:32:34 +0000</span> (14:32 +0000)</td></tr> <tr id="metadata_lchange"><td>last change</td><td><span class="datetime">Wed, 1 Nov 2006 14:32:34 +0000</span> (14:32 +0000)</td></tr>
<tr class="metadata_url"><td>URL</td><td>https://git.distorted.org.uk/~mdw/doc/ips</td></tr> <tr class="metadata_url"><td>URL</td><td>https://git.distorted.org.uk/~mdw/doc/ips , git://git.distorted.org.uk/~mdw/doc/ips</td></tr>
<tr class="metadata_url"><td></td><td>git://git.distorted.org.uk/~mdw/doc/ips</td></tr>
</table> </table>
<div class="header"> <div class="header">
<a class="title" href="https://git.distorted.org.uk/~mdw/doc/ips/shortlog">shortlog</a> <a class="title" href="https://git.distorted.org.uk/~mdw/doc/ips/shortlog">shortlog</a>
......
...@@ -71,6 +71,7 @@ def test_lister_gitweb_run(requests_mock_datadir, swh_scheduler): ...@@ -71,6 +71,7 @@ def test_lister_gitweb_run(requests_mock_datadir, swh_scheduler):
assert listed_origin.visit_type == "git" assert listed_origin.visit_type == "git"
assert listed_origin.url.startswith(url) assert listed_origin.url.startswith(url)
assert listed_origin.last_update is not None assert listed_origin.last_update is not None
assert "," not in listed_origin.url
# test user agent content # test user agent content
for request in requests_mock_datadir.request_history: for request in requests_mock_datadir.request_history:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment