Skip to content
Snippets Groups Projects
Verified Commit 81555320 authored by Antoine R. Dumont's avatar Antoine R. Dumont
Browse files

gitweb: Deal with edge cases

Some instances:
- have a summary page which lists a metadata_url field with multiple comma-separated urls
- list an incomplete url of the repository, so we need to join it with the main page url
- have a summary page which does not list any git repository url, so we try to determine one

Refs. swh/devel/swh-lister#1800
parent ab684134
No related branches found
No related tags found
No related merge requests found
Pipeline #3499 passed
......@@ -6,7 +6,7 @@ from datetime import timezone
import logging
import re
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urljoin, urlparse
from urllib.parse import parse_qs, urljoin, urlparse
from bs4 import BeautifulSoup
from dateparser import parse
......@@ -86,7 +86,7 @@ class GitwebLister(StatelessLister[Repositories]):
if not link:
continue
repo_url = link["href"]
repo_url = urljoin(self.url, link["href"]).strip("/")
if repo_url.endswith("?o=descr"):
continue
......@@ -151,15 +151,19 @@ class GitwebLister(StatelessLister[Repositories]):
return self._get_origin_from_repository_url(summary_url)
else:
logger.debug("No summary tab found on %s", repository_url)
urls = []
for row in bs.find_all("tr", {"class": "metadata_url"}):
url = row.contents[-1].string.strip()
urls = [
row.contents[-1].string
for row in bs.find_all("tr", {"class": "metadata_url"})
]
if "," in url:
urls_ = [s.strip() for s in url.split(",") if s]
urls.extend(urls_)
else:
urls.append(url)
if not urls:
logger.debug("No git urls found on %s", repository_url)
return None
return try_to_determine_git_repository(repository_url)
# look for the http/https url, if any, and use it as origin_url
for url in urls:
......@@ -170,3 +174,21 @@ class GitwebLister(StatelessLister[Repositories]):
# otherwise, choose the first one
origin_url = urls[0]
return origin_url
def try_to_determine_git_repository(repository_url: str) -> Optional[str]:
    """Guess the git clone url of a repository from the url of its web page.

    Some instances do not advertise any git url on the repository summary
    page. In that case the repository path is read from the ``p`` query
    parameter of ``repository_url`` and turned into ``git://<netloc>/<repo>``,
    which mostly works.

    Args:
        repository_url: url of the repository page, for example
            ``https://example.org?p=project.git;a=summary``

    Returns:
        The guessed ``git://`` url, or None when ``repository_url`` carries no
        usable ``p`` query parameter.

    """
    summary_suffix = ";a=summary"
    parsed_url = urlparse(repository_url)
    params = parse_qs(parsed_url.query).get("p")
    if not params:
        return None

    repo = params[0]
    # parse_qs may leave the ";a=summary" action glued to the value of "p".
    if repo.endswith(summary_suffix):
        # Remove the exact suffix. str.rstrip() would treat its argument as a
        # set of characters and also eat trailing characters of the repository
        # name itself (e.g. "mars;a=summary" would become "").
        repo = repo[: -len(summary_suffix)]

    if not repo:
        # Nothing left to build a repository url from.
        return None

    return f"git://{parsed_url.netloc}/{repo}"
......@@ -43,8 +43,7 @@ summary | <a href="https://git.distorted.org.uk/~mdw/doc/ips/shortlog">shortlog<
<tr id="metadata_desc"><td>description</td><td>Introduction to Provable Security slides and notes</td></tr>
<tr id="metadata_owner"><td>owner</td><td>Mark Wooding</td></tr>
<tr id="metadata_lchange"><td>last change</td><td><span class="datetime">Wed, 1 Nov 2006 14:32:34 +0000</span> (14:32 +0000)</td></tr>
<tr class="metadata_url"><td>URL</td><td>https://git.distorted.org.uk/~mdw/doc/ips</td></tr>
<tr class="metadata_url"><td></td><td>git://git.distorted.org.uk/~mdw/doc/ips</td></tr>
<tr class="metadata_url"><td>URL</td><td>https://git.distorted.org.uk/~mdw/doc/ips , git://git.distorted.org.uk/~mdw/doc/ips</td></tr>
</table>
<div class="header">
<a class="title" href="https://git.distorted.org.uk/~mdw/doc/ips/shortlog">shortlog</a>
......
......@@ -8,7 +8,7 @@ from typing import List
import pytest
from swh.lister import __version__
from swh.lister.gitweb.lister import GitwebLister
from swh.lister.gitweb.lister import GitwebLister, try_to_determine_git_repository
from swh.lister.pattern import ListerStats
MAIN_INSTANCE = "git.distorted.org.uk"
......@@ -71,6 +71,7 @@ def test_lister_gitweb_run(requests_mock_datadir, swh_scheduler):
assert listed_origin.visit_type == "git"
assert listed_origin.url.startswith(url)
assert listed_origin.last_update is not None
assert "," not in listed_origin.url
# test user agent content
for request in requests_mock_datadir.request_history:
......@@ -117,3 +118,21 @@ def test_lister_gitweb_get_origin_from_repo_failing(
# so they are filtered out, only the 7 we know are thus listed
expected_nb_origins = 7
assert stats == ListerStats(pages=1, origins=expected_nb_origins)
@pytest.mark.parametrize(
    "url,expected_repo",
    [
        # repository path given by the "p" query parameter alone
        (
            "https://git.shadowcat.co.uk?p=urisagit/gitosis-admin.git",
            "git://git.shadowcat.co.uk/urisagit/gitosis-admin.git",
        ),
        # repository path followed by the ";a=summary" action
        (
            "https://git.shadowcat.co.uk?p=File-Slurp.git;a=summary",
            "git://git.shadowcat.co.uk/File-Slurp.git",
        ),
        # no "p" query parameter: nothing can be determined
        ("https://domain.org/foobar", None),
    ],
)
def test_try_to_determine_git_repository(url, expected_repo):
    """Check the git url guessed from a repository page url."""
    actual_repo = try_to_determine_git_repository(url)
    assert actual_repo == expected_repo
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment