Skip to content
Snippets Groups Projects
Verified Commit 3ab85628 authored by Antoine R. Dumont's avatar Antoine R. Dumont
Browse files

Add gitiles lister

Gitiles instances deliberately return malformed JSON output (the JSON is prefixed with
``)]}'\n``) [2]. The lister handles this so that it can still parse the response:
it drops the prefix and then parses the JSON.

If at some point they drop this prefix and return JSON directly, the lister will be able
to deal with it too. There are 2 tests, one with the 'standard' Gitiles format and another
with standard JSON, to account for both cases.

Refs. swh/meta#5045

[2] https://github.com/google/gitiles/issues/263
parent 573958ce
No related branches found
No related tags found
1 merge request!487Add gitiles lister
Pipeline #3585 passed
Showing with 307 additions and 0 deletions
......@@ -68,6 +68,7 @@ setup(
lister.fedora=swh.lister.fedora:register
lister.gitea=swh.lister.gitea:register
lister.github=swh.lister.github:register
lister.gitiles=swh.lister.gitiles:register
lister.gitlab=swh.lister.gitlab:register
lister.gitweb=swh.lister.gitweb:register
lister.gnu=swh.lister.gnu:register
......
# Copyright (C) 2023 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def register():
    """Return the registration data for the gitiles lister.

    The result maps "lister" to the lister class and "task_modules" to the
    list of modules holding its tasks (the ``tasks`` submodule of this
    package).
    """
    # Imported lazily, inside the function, exactly as the original did.
    from .lister import GitilesLister

    tasks_module = f"{__name__}.tasks"
    return {"lister": GitilesLister, "task_modules": [tasks_module]}
# Copyright (C) 2023 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from json import loads
import logging
from typing import Iterator, Optional
from swh.lister.pattern import CredentialsType, StatelessLister
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
logger = logging.getLogger(__name__)
# Page type of this lister: each "page" is a single repository clone URL,
# i.e. a plain string.
Origin = str
class GitilesLister(StatelessLister[Origin]):
    """List the git repositories published by a Gitiles instance.

    The repositories are discovered by fetching and parsing the JSON index
    served at `<url>?format=json`.

    """

    LISTER_NAME = "gitiles"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: Optional[str] = None,
        instance: Optional[str] = None,
        credentials: Optional[CredentialsType] = None,
        max_origins_per_page: Optional[int] = None,
        max_pages: Optional[int] = None,
        enable_origins: bool = True,
    ):
        """Build a lister for one Gitiles instance.

        Args:
            url: (Optional) Root URL of the Gitiles instance, i.e. url of the index of
                published git repositories on this instance. Defaults to
                :file:`https://{instance}` if unset.
            instance: Name of gitiles instance. Defaults to url's network location
                if unset.

        The remaining arguments are forwarded unchanged to the parent lister.

        """
        super().__init__(
            scheduler=scheduler,
            url=url,
            instance=instance,
            credentials=credentials,
            max_origins_per_page=max_origins_per_page,
            max_pages=max_pages,
            enable_origins=enable_origins,
        )

        # Ask the server for a JSON answer on every request of this session.
        self.session.headers["Accept"] = "application/json"

    def get_pages(self) -> Iterator[Origin]:
        """Yield the clone URL of every repository listed by the Gitiles index."""
        index_url = f"{self.url}?format=json"
        body = self.http_request(index_url).text

        # Gitiles currently prepends a fixed marker to its JSON output, see
        # https://github.com/google/gitiles/issues/263. Strip it when present so
        # that both prefixed and plain JSON responses can be parsed.
        json_prefix = ")]}'\n"
        if body.startswith(json_prefix):
            body = body[len(json_prefix) :]

        repositories = loads(body)
        yield from (repository["clone_url"] for repository in repositories.values())

    def get_origins_from_page(self, origin: Origin) -> Iterator[ListedOrigin]:
        """Yield the single "git" ListedOrigin matching one repository URL."""
        lister_id = self.lister_obj.id
        assert lister_id is not None

        yield ListedOrigin(lister_id=lister_id, url=origin, visit_type="git")
# Copyright (C) 2023 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from typing import Dict
from celery import shared_task
from .lister import GitilesLister
@shared_task(name=f"{__name__}.GitilesListerTask")
def list_gitiles(**lister_args) -> Dict[str, str]:
    """Celery task running a Gitiles lister built from the configuration file.

    The keyword arguments are passed as-is to ``GitilesLister.from_configfile``;
    the statistics returned by the run are sent back as a dict.
    """
    gitiles_lister = GitilesLister.from_configfile(**lister_args)
    stats = gitiles_lister.run()
    return stats.dict()
)]}'
{
"accessories/manifest": {
"name": "accessories/manifest",
"clone_url": "https://android.googlesource.com/accessories/manifest"
},
"device/google/gs201": {
"name": "device/google/gs201",
"clone_url": "https://android.googlesource.com/device/google/gs201",
"description": "Bug: 244231765"
},
"device/google/sunfish-kernel": {
"name": "device/google/sunfish-kernel",
"clone_url": "https://android.googlesource.com/device/google/sunfish-kernel",
"description": "Bug: 160260413"
},
"device/lge/mako-kernel": {
"name": "device/lge/mako-kernel",
"clone_url": "https://android.googlesource.com/device/lge/mako-kernel"
},
"kernel/msm-extra/camera-devicetree": {
"name": "kernel/msm-extra/camera-devicetree",
"clone_url": "https://android.googlesource.com/kernel/msm-extra/camera-devicetree",
"description": "Bug: 167236823"
},
"platform/external/google-fonts/rubik": {
"name": "platform/external/google-fonts/rubik",
"clone_url": "https://android.googlesource.com/platform/external/google-fonts/rubik",
"description": "Bug: 122303069"
},
"trusty/vendor/google/aosp": {
"name": "trusty/vendor/google/aosp",
"clone_url": "https://android.googlesource.com/trusty/vendor/google/aosp"
}
}
These files are a partial dump of https://android.googlesource.com/?format=json.
{
"apps/analytics-etl": {
"name": "apps/analytics-etl",
"clone_url": "https://gerrit.googlesource.com/apps/analytics-etl",
"description": "Spark ETL to extra analytics data from Gerrit Projects using the Analytics plugin"
},
"apps/kibana-dashboard": {
"name": "apps/kibana-dashboard",
"clone_url": "https://gerrit.googlesource.com/apps/kibana-dashboard"
},
"apps/reviewit": {
"name": "apps/reviewit",
"clone_url": "https://gerrit.googlesource.com/apps/reviewit",
"description": "The \u0027Review It!?\u0027 app is an Android application for Gerrit that allows sorting of incoming changes and review of small/trivial changes.\n\nThis is not an official Google product."
}
}
These files are a partial dump of https://gerrit.googlesource.com/?format=json.
# Copyright (C) 2023 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
from typing import List
import pytest
from swh.lister import __version__
from swh.lister.gitiles.lister import GitilesLister
from swh.lister.pattern import ListerStats
# Gitiles instance used by most tests in this module, and the root URL built
# from it.
MAIN_INSTANCE = "android.googlesource.com"
MAIN_INSTANCE_URL = f"https://{MAIN_INSTANCE}"
def test_lister_gitiles_instantiate(swh_scheduler):
    """Build a lister with either an url or an instance is supported."""
    url = MAIN_INSTANCE_URL

    # Lister built from an explicit url.
    lister = GitilesLister(swh_scheduler, url=url)
    assert lister is not None
    assert lister.url == url

    # Lister built from the instance name only. Keep a reference to it so the
    # assertions below really check this second lister (the url is expected to
    # default to https://{instance}) rather than re-checking the first one.
    lister_from_instance = GitilesLister(swh_scheduler, instance=MAIN_INSTANCE)
    assert lister_from_instance is not None
    assert lister_from_instance.url == url
def test_lister_gitiles_fail_to_instantiate(swh_scheduler):
    """Instantiating a lister with neither an url nor an instance must raise."""
    expected_message = "'url' or 'instance'"

    # Neither `url` nor `instance` is provided: a ValueError is expected.
    with pytest.raises(ValueError, match=expected_message):
        GitilesLister(swh_scheduler)
def test_lister_gitiles_get_pages(requests_mock_datadir, swh_scheduler):
    """Check the number of pages scraped during a listing, and their urls."""
    lister_gitiles = GitilesLister(swh_scheduler, instance=MAIN_INSTANCE)

    repos: List[str] = list(lister_gitiles.get_pages())

    # The main instance fixture lists 7 repositories.
    assert len(repos) == 7

    # Every listed repository must live under the instance root url.
    for repo_url in repos:
        assert repo_url.startswith(MAIN_INSTANCE_URL)
@pytest.mark.parametrize(
    "url,expected_nb_origins",
    [(MAIN_INSTANCE_URL, 7), ("https://gerrit.googlesource.com", 3)],
)
def test_lister_gitiles_run(
    requests_mock_datadir, swh_scheduler, url, expected_nb_origins
):
    """Nominal run of the gitiles lister against each fixture instance."""
    lister_gitiles = GitilesLister(swh_scheduler, url=url)

    # One page per origin is expected for this lister.
    expected_stats = ListerStats(
        pages=expected_nb_origins, origins=expected_nb_origins
    )
    assert lister_gitiles.run() == expected_stats

    # Origins recorded in the scheduler for this lister.
    listed = swh_scheduler.get_listed_origins(lister_gitiles.lister_obj.id)
    scheduler_origins = listed.results
    assert len(scheduler_origins) == expected_nb_origins

    assert url.startswith("https://")

    # Each recorded origin is a git repository hosted under the listed url.
    for origin in scheduler_origins:
        assert origin.visit_type == "git"
        assert origin.url.startswith(url)
        assert origin.url.startswith("https://")
        assert origin.last_update is None

    # Every request sent must carry the lister's user agent.
    for sent_request in requests_mock_datadir.request_history:
        headers = sent_request.headers
        assert "User-Agent" in headers
        user_agent = headers["User-Agent"]
        assert "Software Heritage gitiles lister" in user_agent
        assert __version__ in user_agent
def test_lister_gitiles_get_pages_with_pages_and_retry(
    requests_mock_datadir, requests_mock, datadir, mocker, swh_scheduler
):
    """A rate-limited (429) index page is requested again until it succeeds."""
    url = MAIN_INSTANCE_URL

    fixture_path = os.path.join(datadir, f"https_{MAIN_INSTANCE}/,format=json")
    with open(fixture_path, "rb") as page:
        index_content = page.read()

    # Two rate-limited answers, followed by the actual index page.
    responses = [{"content": None, "status_code": 429} for _ in range(2)]
    responses.append({"content": index_content, "status_code": 200})
    requests_mock.get(url, responses)

    lister_gitiles = GitilesLister(swh_scheduler, url=url)

    # Neutralize the sleep between retries to keep the test fast.
    mocker.patch.object(lister_gitiles.http_request.retry, "sleep")

    pages: List[str] = list(lister_gitiles.get_pages())

    assert len(pages) == 7
# Copyright (C) 2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.lister.pattern import ListerStats
def test_gitweb_lister_task(
    swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
):
    """The gitiles celery task builds the lister from its kwargs and runs it.

    The lister class is mocked, so this only checks the wiring of the task:
    ``from_configfile`` receives the task kwargs and ``run`` is called once.
    """
    # NOTE(review): the function name still says "gitweb" (copy-paste leftover);
    # consider renaming it to test_gitiles_lister_task if nothing selects this
    # test by name.

    # setup the mocked GitilesLister (patched where the gitiles task looks it up)
    lister = mocker.patch("swh.lister.gitiles.tasks.GitilesLister")
    lister.from_configfile.return_value = lister
    lister.run.return_value = ListerStats(pages=10, origins=500)

    kwargs = dict(url="https://android.googlesource.com", max_pages=1)

    # Send the gitiles task, not the gitweb one, so this module actually
    # exercises the task registered by the gitiles lister.
    res = swh_scheduler_celery_app.send_task(
        "swh.lister.gitiles.tasks.GitilesListerTask",
        kwargs=kwargs,
    )
    assert res
    res.wait()
    assert res.successful()

    lister.from_configfile.assert_called_once_with(**kwargs)
    lister.run.assert_called_once_with()
......@@ -45,6 +45,9 @@ lister_args = {
"gitweb": {
"url": "https://git.distorted.org.uk/~mdw/",
},
"gitiles": {
"instance": "gerrit.googlesource.com",
},
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment