Skip to content
Snippets Groups Projects
Commit 7b932f46 authored by Antoine Lambert's avatar Antoine Lambert
Browse files

gitweb: Add optional base_git_url parameter to lister

Similar to cgit, there are cases where git clone URLs for projects hosted
on a gitweb instance cannot be found when scraping project pages or cannot
be easily derived from the gitweb instance root URL.

So add an optional base_git_url parameter that makes it possible to compute
correct clone URLs by appending project names to it.
parent 59a97964
No related branches found
Tags v6.1.0
No related merge requests found
......@@ -36,6 +36,7 @@ class GitwebLister(StatelessLister[Repositories]):
scheduler: SchedulerInterface,
url: Optional[str] = None,
instance: Optional[str] = None,
base_git_url: Optional[str] = None,
credentials: Optional[CredentialsType] = None,
max_origins_per_page: Optional[int] = None,
max_pages: Optional[int] = None,
......@@ -44,11 +45,14 @@ class GitwebLister(StatelessLister[Repositories]):
"""Lister class for Gitweb repositories.
Args:
url: (Optional) Root URL of the Gitweb instance, i.e. url of the index of
url: Root URL of the Gitweb instance, i.e. url of the index of
published git repositories on this instance. Defaults to
:file:`https://{instance}` if unset.
instance: Name of gitweb instance. Defaults to url's network location
if unset.
base_git_url: Base URL to clone a git project hosted on the Gitweb instance,
should only be used if the clone URLs cannot be found when scraping project
page or cannot be easily derived from the root URL of the instance
"""
super().__init__(
......@@ -63,6 +67,7 @@ class GitwebLister(StatelessLister[Repositories]):
self.session.headers.update({"Accept": "application/html"})
self.instance_scheme = urlparse(url).scheme
self.base_git_url = base_git_url
def _get_and_parse(self, url: str) -> BeautifulSoup:
"""Get the given url and parse the retrieved HTML using BeautifulSoup"""
......@@ -144,7 +149,7 @@ class GitwebLister(StatelessLister[Repositories]):
urls.append(url)
if not urls:
repo = try_to_determine_git_repository(repository_url)
repo = try_to_determine_git_repository(repository_url, self.base_git_url)
if not repo:
logger.debug("No git urls found on %s", repository_url)
return repo
......@@ -165,7 +170,9 @@ class GitwebLister(StatelessLister[Repositories]):
return origin_url
def try_to_determine_git_repository(repository_url: str) -> Optional[str]:
def try_to_determine_git_repository(
repository_url: str, base_git_url: Optional[str] = None
) -> Optional[str]:
"""Some gitweb instances does not advertise the git urls.
This heuristic works on instances demonstrating this behavior.
......@@ -175,7 +182,10 @@ def try_to_determine_git_repository(repository_url: str) -> Optional[str]:
parsed_url = urlparse(repository_url)
repo = parse_qs(parsed_url.query, separator=";").get("p")
if repo:
result = f"git://{parsed_url.netloc}/{repo[0]}"
if base_git_url:
result = f"{base_git_url.rstrip('/')}/{repo[0]}"
else:
result = f"git://{parsed_url.netloc}/{repo[0]}"
return result
......
......@@ -128,25 +128,42 @@ def test_lister_gitweb_get_origin_from_repo_failing(
@pytest.mark.parametrize(
"url,expected_repo",
"url,base_git_url,expected_repo",
[
(
"https://git.shadowcat.co.uk?p=urisagit/gitosis-admin.git",
None,
"git://git.shadowcat.co.uk/urisagit/gitosis-admin.git",
),
(
"https://git.shadowcat.co.uk?p=File-Slurp.git;a=summary",
None,
"git://git.shadowcat.co.uk/File-Slurp.git",
),
(
"https://git.example.org?p=baaaa;a=summary",
None,
"git://git.example.org/baaaa",
),
("https://domain.org/foobar", None),
(
"https://domain.org/foobar",
None,
None,
),
(
"https://gitweb.example.org?p=project.git;a=summary",
"https://example.org",
"https://example.org/project.git",
),
(
"https://example.org?p=project.git;a=summary",
"https://example.org/git/",
"https://example.org/git/project.git",
),
],
)
def test_try_to_determine_git_repository(url, expected_repo):
assert try_to_determine_git_repository(url) == expected_repo
def test_try_to_determine_git_repository(url, base_git_url, expected_repo):
assert try_to_determine_git_repository(url, base_git_url) == expected_repo
def test_parse_last_update():
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment