Skip to content
Snippets Groups Projects
Verified Commit 9db73596 authored by Antoine R. Dumont's avatar Antoine R. Dumont
Browse files

gitweb: Deal with edge cases

Some instances:
- have a summary page which lists a metadata_url field with multiple comma-separated urls
- list an incomplete url of the repository, so we need to join it with the main page url
- have a summary page which does not list any git repository url, so we try to determine one

Refs. swh/devel/swh-lister#1800
parent ab684134
No related branches found
No related tags found
No related merge requests found
Pipeline #3508 passed
...@@ -6,7 +6,7 @@ from datetime import timezone ...@@ -6,7 +6,7 @@ from datetime import timezone
import logging import logging
import re import re
from typing import Any, Dict, Iterator, List, Optional from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urljoin, urlparse from urllib.parse import parse_qs, urljoin, urlparse
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from dateparser import parse from dateparser import parse
...@@ -86,7 +86,7 @@ class GitwebLister(StatelessLister[Repositories]): ...@@ -86,7 +86,7 @@ class GitwebLister(StatelessLister[Repositories]):
if not link: if not link:
continue continue
repo_url = link["href"] repo_url = urljoin(self.url, link["href"]).strip("/")
if repo_url.endswith("?o=descr"): if repo_url.endswith("?o=descr"):
continue continue
...@@ -136,30 +136,19 @@ class GitwebLister(StatelessLister[Repositories]): ...@@ -136,30 +136,19 @@ class GitwebLister(StatelessLister[Repositories]):
) )
return None return None
# check if we are on the summary tab, if not, go to this tab urls = []
tab = bs.find("table", {"class": "tabs"}) for row in bs.find_all("tr", {"class": "metadata_url"}):
if tab: url = row.contents[-1].string.strip()
summary_a = tab.find("a", string="summary")
if summary_a:
summary_url = urljoin(repository_url, summary_a["href"]).strip("/")
if summary_url != repository_url:
logger.debug(
"%s : Active tab is not the summary, trying to load the summary page",
repository_url,
)
return self._get_origin_from_repository_url(summary_url)
else:
logger.debug("No summary tab found on %s", repository_url)
urls = [ if "," in url:
row.contents[-1].string urls_ = [s.strip() for s in url.split(",") if s]
for row in bs.find_all("tr", {"class": "metadata_url"}) urls.extend(urls_)
] else:
urls.append(url)
if not urls: if not urls:
logger.debug("No git urls found on %s", repository_url) logger.debug("No git urls found on %s", repository_url)
return None return try_to_determine_git_repository(repository_url)
# look for the http/https url, if any, and use it as origin_url # look for the http/https url, if any, and use it as origin_url
for url in urls: for url in urls:
...@@ -170,3 +159,21 @@ class GitwebLister(StatelessLister[Repositories]): ...@@ -170,3 +159,21 @@ class GitwebLister(StatelessLister[Repositories]):
# otherwise, choose the first one # otherwise, choose the first one
origin_url = urls[0] origin_url = urls[0]
return origin_url return origin_url
def try_to_determine_git_repository(repository_url: str) -> Optional[str]:
    """Guess the git origin url out of a gitweb repository page url.

    Some gitweb instances do not advertise any git url on their pages. This
    heuristic works on instances demonstrating this behavior: it reads the
    repository path from the ``p`` query parameter of the page url and maps it
    onto a ``git://`` url served by the same host.

    Args:
        repository_url: url of the gitweb repository page, for example
            ``https://host?p=project.git;a=summary``

    Returns:
        The guessed ``git://<host>/<repository>`` url, or None when no
        repository path can be read from the ``p`` query parameter.

    """
    summary_suffix = ";a=summary"

    parsed_url = urlparse(repository_url)
    params = parse_qs(parsed_url.query).get("p")
    if not params:
        return None

    repo = params[0]
    # Recent Python versions only split query strings on "&" by default, so
    # the ";a=summary" action may still be glued to the repository path.
    if repo.endswith(summary_suffix):
        # Slice the exact suffix off. str.rstrip() takes a *set of characters*
        # and would also eat the end of repository names such as "mary" or
        # "team/summary".
        repo = repo[: -len(summary_suffix)]

    if not repo:
        # Nothing left to build a repository url from
        return None

    return f"git://{parsed_url.netloc}/{repo}"
...@@ -43,8 +43,7 @@ summary | <a href="https://git.distorted.org.uk/~mdw/doc/ips/shortlog">shortlog< ...@@ -43,8 +43,7 @@ summary | <a href="https://git.distorted.org.uk/~mdw/doc/ips/shortlog">shortlog<
<tr id="metadata_desc"><td>description</td><td>Introduction to Provable Security slides and notes</td></tr> <tr id="metadata_desc"><td>description</td><td>Introduction to Provable Security slides and notes</td></tr>
<tr id="metadata_owner"><td>owner</td><td>Mark Wooding</td></tr> <tr id="metadata_owner"><td>owner</td><td>Mark Wooding</td></tr>
<tr id="metadata_lchange"><td>last change</td><td><span class="datetime">Wed, 1 Nov 2006 14:32:34 +0000</span> (14:32 +0000)</td></tr> <tr id="metadata_lchange"><td>last change</td><td><span class="datetime">Wed, 1 Nov 2006 14:32:34 +0000</span> (14:32 +0000)</td></tr>
<tr class="metadata_url"><td>URL</td><td>https://git.distorted.org.uk/~mdw/doc/ips</td></tr> <tr class="metadata_url"><td>URL</td><td>https://git.distorted.org.uk/~mdw/doc/ips , git://git.distorted.org.uk/~mdw/doc/ips</td></tr>
<tr class="metadata_url"><td></td><td>git://git.distorted.org.uk/~mdw/doc/ips</td></tr>
</table> </table>
<div class="header"> <div class="header">
<a class="title" href="https://git.distorted.org.uk/~mdw/doc/ips/shortlog">shortlog</a> <a class="title" href="https://git.distorted.org.uk/~mdw/doc/ips/shortlog">shortlog</a>
......
...@@ -8,7 +8,7 @@ from typing import List ...@@ -8,7 +8,7 @@ from typing import List
import pytest import pytest
from swh.lister import __version__ from swh.lister import __version__
from swh.lister.gitweb.lister import GitwebLister from swh.lister.gitweb.lister import GitwebLister, try_to_determine_git_repository
from swh.lister.pattern import ListerStats from swh.lister.pattern import ListerStats
MAIN_INSTANCE = "git.distorted.org.uk" MAIN_INSTANCE = "git.distorted.org.uk"
...@@ -71,6 +71,7 @@ def test_lister_gitweb_run(requests_mock_datadir, swh_scheduler): ...@@ -71,6 +71,7 @@ def test_lister_gitweb_run(requests_mock_datadir, swh_scheduler):
assert listed_origin.visit_type == "git" assert listed_origin.visit_type == "git"
assert listed_origin.url.startswith(url) assert listed_origin.url.startswith(url)
assert listed_origin.last_update is not None assert listed_origin.last_update is not None
assert "," not in listed_origin.url
# test user agent content # test user agent content
for request in requests_mock_datadir.request_history: for request in requests_mock_datadir.request_history:
...@@ -117,3 +118,21 @@ def test_lister_gitweb_get_origin_from_repo_failing( ...@@ -117,3 +118,21 @@ def test_lister_gitweb_get_origin_from_repo_failing(
# so they are filtered out, only the 7 we know are thus listed # so they are filtered out, only the 7 we know are thus listed
expected_nb_origins = 7 expected_nb_origins = 7
assert stats == ListerStats(pages=1, origins=expected_nb_origins) assert stats == ListerStats(pages=1, origins=expected_nb_origins)
@pytest.mark.parametrize(
    ("url", "expected_repo"),
    [
        # repository path only in the "p" query parameter
        (
            "https://git.shadowcat.co.uk?p=urisagit/gitosis-admin.git",
            "git://git.shadowcat.co.uk/urisagit/gitosis-admin.git",
        ),
        # repository path followed by the summary action
        (
            "https://git.shadowcat.co.uk?p=File-Slurp.git;a=summary",
            "git://git.shadowcat.co.uk/File-Slurp.git",
        ),
        # no "p" query parameter at all: nothing can be determined
        ("https://domain.org/foobar", None),
    ],
)
def test_try_to_determine_git_repository(url, expected_repo):
    """Check the git url guessed out of a gitweb repository page url."""
    actual_repo = try_to_determine_git_repository(url)
    assert actual_repo == expected_repo
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment