From 3ab856288cd94759cb9b11ffda792042b7e30e25 Mon Sep 17 00:00:00 2001 From: "Antoine R. Dumont (@ardumont)" <ardumont@softwareheritage.org> Date: Wed, 12 Jul 2023 17:46:20 +0200 Subject: [PATCH] Add gitiles lister Gitiles instances deliberately return malformed json output (json prefixed with ``)]}'\n``) [2]. The lister deals with it to properly parse the json response nonetheless. It drops the prefix and then parses the json. If at some point they drop this prefix to return json directly, the lister will be able to deal with it too. There are 2 tests, one with the 'standard' gitiles format and another with standard json, to account for both cases. Refs. swh/meta#5045 [2] https://github.com/google/gitiles/issues/263 --- setup.py | 1 + swh/lister/gitiles/__init__.py | 12 ++ swh/lister/gitiles/lister.py | 82 +++++++++++++ swh/lister/gitiles/tasks.py | 16 +++ swh/lister/gitiles/tests/__init__.py | 0 .../,format=json | 35 ++++++ .../https_android.googlesource.com/README | 2 + .../,format=json | 16 +++ .../data/https_gerrit.googlesource.com/README | 2 + swh/lister/gitiles/tests/test_lister.py | 110 ++++++++++++++++++ swh/lister/gitiles/tests/test_tasks.py | 28 +++++ swh/lister/tests/test_cli.py | 3 + 12 files changed, 307 insertions(+) create mode 100644 swh/lister/gitiles/__init__.py create mode 100644 swh/lister/gitiles/lister.py create mode 100644 swh/lister/gitiles/tasks.py create mode 100644 swh/lister/gitiles/tests/__init__.py create mode 100644 swh/lister/gitiles/tests/data/https_android.googlesource.com/,format=json create mode 100644 swh/lister/gitiles/tests/data/https_android.googlesource.com/README create mode 100644 swh/lister/gitiles/tests/data/https_gerrit.googlesource.com/,format=json create mode 100644 swh/lister/gitiles/tests/data/https_gerrit.googlesource.com/README create mode 100644 swh/lister/gitiles/tests/test_lister.py create mode 100644 swh/lister/gitiles/tests/test_tasks.py diff --git a/setup.py b/setup.py index 0ad6aa5f..456cd0e7 100755 --- 
a/setup.py +++ b/setup.py @@ -68,6 +68,7 @@ setup( lister.fedora=swh.lister.fedora:register lister.gitea=swh.lister.gitea:register lister.github=swh.lister.github:register + lister.gitiles=swh.lister.gitiles:register lister.gitlab=swh.lister.gitlab:register lister.gitweb=swh.lister.gitweb:register lister.gnu=swh.lister.gnu:register diff --git a/swh/lister/gitiles/__init__.py b/swh/lister/gitiles/__init__.py new file mode 100644 index 00000000..8f71bf37 --- /dev/null +++ b/swh/lister/gitiles/__init__.py @@ -0,0 +1,12 @@ +# Copyright (C) 2023 The Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +def register(): + from .lister import GitilesLister + + return { + "lister": GitilesLister, + "task_modules": [f"{__name__}.tasks"], + } diff --git a/swh/lister/gitiles/lister.py b/swh/lister/gitiles/lister.py new file mode 100644 index 00000000..b92e9e8d --- /dev/null +++ b/swh/lister/gitiles/lister.py @@ -0,0 +1,82 @@ +# Copyright (C) 2023 The Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from json import loads +import logging +from typing import Iterator, Optional + +from swh.lister.pattern import CredentialsType, StatelessLister +from swh.scheduler.interface import SchedulerInterface +from swh.scheduler.model import ListedOrigin + +logger = logging.getLogger(__name__) + +Origin = str + + +class GitilesLister(StatelessLister[Origin]): + """Lister class for Gitiles repositories. + + This lister will retrieve the list of published git repositories by + parsing the json page found at the url `<url>?format=json`. 
+ + """ + + LISTER_NAME = "gitiles" + + def __init__( + self, + scheduler: SchedulerInterface, + url: Optional[str] = None, + instance: Optional[str] = None, + credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, + ): + """Initialize a lister for one Gitiles instance. + + Args: + url: (Optional) Root URL of the Gitiles instance, i.e. url of the index of + published git repositories on this instance. Defaults to + :file:`https://{instance}` if unset. + instance: Name of gitiles instance. Defaults to url's network location + if unset. + + """ + super().__init__( + scheduler=scheduler, + url=url, + instance=instance, + credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, + ) + + self.session.headers.update({"Accept": "application/json"}) + + def get_pages(self) -> Iterator[Origin]: + """Yield the clone url of each git project listed by the Gitiles instance.""" + response = self.http_request(f"{self.url}?format=json") + text = response.text + # gitiles prefixes its json output with ")]}'" and a newline; drop it + # See 
https://github.com/google/gitiles/issues/263 + if text.startswith(")]}'\n"): + text = text[5:] + + data = loads(text) + + for repo in data.values(): + yield repo["clone_url"] + + def get_origins_from_page(self, origin: Origin) -> Iterator[ListedOrigin]: + """Convert a single repository clone url into its git ListedOrigin.""" + assert self.lister_obj.id is not None + + yield ListedOrigin( + lister_id=self.lister_obj.id, + url=origin, + visit_type="git", + ) diff --git a/swh/lister/gitiles/tasks.py b/swh/lister/gitiles/tasks.py new file mode 100644 index 00000000..07f292bc --- /dev/null +++ b/swh/lister/gitiles/tasks.py @@ -0,0 +1,16 @@ +# Copyright (C) 2023 The Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from typing import Dict + +from celery import shared_task + +from .lister import GitilesLister + + +@shared_task(name=f"{__name__}.GitilesListerTask") +def list_gitiles(**lister_args) -> Dict[str, str]: + """Lister task for Gitiles instances.""" + lister = GitilesLister.from_configfile(**lister_args) + return lister.run().dict() diff --git a/swh/lister/gitiles/tests/__init__.py b/swh/lister/gitiles/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/swh/lister/gitiles/tests/data/https_android.googlesource.com/,format=json b/swh/lister/gitiles/tests/data/https_android.googlesource.com/,format=json new file mode 100644 index 00000000..8328c7c8 --- /dev/null +++ b/swh/lister/gitiles/tests/data/https_android.googlesource.com/,format=json @@ -0,0 +1,35 @@ +)]}' +{ + "accessories/manifest": { + "name": "accessories/manifest", + "clone_url": "https://android.googlesource.com/accessories/manifest" + }, + "device/google/gs201": { + "name": "device/google/gs201", + "clone_url": "https://android.googlesource.com/device/google/gs201", + "description": "Bug: 244231765" + }, + "device/google/sunfish-kernel": { + "name": 
"device/google/sunfish-kernel", + "clone_url": "https://android.googlesource.com/device/google/sunfish-kernel", + "description": "Bug: 160260413" + }, + "device/lge/mako-kernel": { + "name": "device/lge/mako-kernel", + "clone_url": "https://android.googlesource.com/device/lge/mako-kernel" + }, + "kernel/msm-extra/camera-devicetree": { + "name": "kernel/msm-extra/camera-devicetree", + "clone_url": "https://android.googlesource.com/kernel/msm-extra/camera-devicetree", + "description": "Bug: 167236823" + }, + "platform/external/google-fonts/rubik": { + "name": "platform/external/google-fonts/rubik", + "clone_url": "https://android.googlesource.com/platform/external/google-fonts/rubik", + "description": "Bug: 122303069" + }, + "trusty/vendor/google/aosp": { + "name": "trusty/vendor/google/aosp", + "clone_url": "https://android.googlesource.com/trusty/vendor/google/aosp" + } +} diff --git a/swh/lister/gitiles/tests/data/https_android.googlesource.com/README b/swh/lister/gitiles/tests/data/https_android.googlesource.com/README new file mode 100644 index 00000000..f2074bab --- /dev/null +++ b/swh/lister/gitiles/tests/data/https_android.googlesource.com/README @@ -0,0 +1,2 @@ +These files are a partial dump of https://android.googlesource.com/?format=json. 
+ diff --git a/swh/lister/gitiles/tests/data/https_gerrit.googlesource.com/,format=json b/swh/lister/gitiles/tests/data/https_gerrit.googlesource.com/,format=json new file mode 100644 index 00000000..37c0378f --- /dev/null +++ b/swh/lister/gitiles/tests/data/https_gerrit.googlesource.com/,format=json @@ -0,0 +1,16 @@ +{ + "apps/analytics-etl": { + "name": "apps/analytics-etl", + "clone_url": "https://gerrit.googlesource.com/apps/analytics-etl", + "description": "Spark ETL to extra analytics data from Gerrit Projects using the Analytics plugin" + }, + "apps/kibana-dashboard": { + "name": "apps/kibana-dashboard", + "clone_url": "https://gerrit.googlesource.com/apps/kibana-dashboard" + }, + "apps/reviewit": { + "name": "apps/reviewit", + "clone_url": "https://gerrit.googlesource.com/apps/reviewit", + "description": "The \u0027Review It!?\u0027 app is an Android application for Gerrit that allows sorting of incoming changes and review of small/trivial changes.\n\nThis is not an official Google product." + } +} diff --git a/swh/lister/gitiles/tests/data/https_gerrit.googlesource.com/README b/swh/lister/gitiles/tests/data/https_gerrit.googlesource.com/README new file mode 100644 index 00000000..1a016a2d --- /dev/null +++ b/swh/lister/gitiles/tests/data/https_gerrit.googlesource.com/README @@ -0,0 +1,2 @@ +These files are a partial dump of https://gerrit.googlesource.com/?format=json. 
+ diff --git a/swh/lister/gitiles/tests/test_lister.py b/swh/lister/gitiles/tests/test_lister.py new file mode 100644 index 00000000..2b121176 --- /dev/null +++ b/swh/lister/gitiles/tests/test_lister.py @@ -0,0 +1,110 @@ +# Copyright (C) 2023 The Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import os +from typing import List + +import pytest + +from swh.lister import __version__ +from swh.lister.gitiles.lister import GitilesLister +from swh.lister.pattern import ListerStats + +MAIN_INSTANCE = "android.googlesource.com" +MAIN_INSTANCE_URL = f"https://{MAIN_INSTANCE}" + + +def test_lister_gitiles_instantiate(swh_scheduler): + """Building a lister with either a url or an instance is supported.""" + url = MAIN_INSTANCE_URL + lister = GitilesLister(swh_scheduler, url=url) + assert lister is not None + assert lister.url == url + + lister = GitilesLister(swh_scheduler, instance=MAIN_INSTANCE) + assert lister is not None + assert lister.url == url + + +def test_lister_gitiles_fail_to_instantiate(swh_scheduler): + """Building a lister with neither a url nor an instance should raise.""" + # ... 
It will raise without any of those + with pytest.raises(ValueError, match="'url' or 'instance'"): + GitilesLister(swh_scheduler) + + +def test_lister_gitiles_get_pages(requests_mock_datadir, swh_scheduler): + """Computing the number of pages scraped during a listing.""" + url = MAIN_INSTANCE_URL + lister_gitiles = GitilesLister(swh_scheduler, instance=MAIN_INSTANCE) + + expected_nb_origins = 7 + + repos: List[str] = list(lister_gitiles.get_pages()) + assert len(repos) == expected_nb_origins + + for listed_url in repos: + assert listed_url.startswith(url) + + +@pytest.mark.parametrize( + "url,expected_nb_origins", + [(MAIN_INSTANCE_URL, 7), ("https://gerrit.googlesource.com", 3)], +) +def test_lister_gitiles_run( + requests_mock_datadir, swh_scheduler, url, expected_nb_origins +): + """Gitiles lister nominal listing case.""" + lister_gitiles = GitilesLister(swh_scheduler, url=url) + + stats = lister_gitiles.run() + + assert stats == ListerStats(pages=expected_nb_origins, origins=expected_nb_origins) + + # test page parsing + scheduler_origins = swh_scheduler.get_listed_origins( + lister_gitiles.lister_obj.id + ).results + assert len(scheduler_origins) == expected_nb_origins + + assert url.startswith("https://") + + # test listed repositories + for listed_origin in scheduler_origins: + assert listed_origin.visit_type == "git" + assert listed_origin.url.startswith(url) + assert listed_origin.url.startswith("https://") + assert listed_origin.last_update is None + + # test user agent content + for request in requests_mock_datadir.request_history: + assert "User-Agent" in request.headers + user_agent = request.headers["User-Agent"] + assert "Software Heritage gitiles lister" in user_agent + assert __version__ in user_agent + + +def test_lister_gitiles_get_pages_with_pages_and_retry( + requests_mock_datadir, requests_mock, datadir, mocker, swh_scheduler +): + """Rate-limited pages are retried after some time so listing can proceed.""" + url = MAIN_INSTANCE_URL + 
with open( + os.path.join(datadir, f"https_{MAIN_INSTANCE}/,format=json"), "rb" + ) as page: + requests_mock.get( + url, + [ + {"content": None, "status_code": 429}, + {"content": None, "status_code": 429}, + {"content": page.read(), "status_code": 200}, + ], + ) + + lister_gitiles = GitilesLister(swh_scheduler, url=url) + + mocker.patch.object(lister_gitiles.http_request.retry, "sleep") + + pages: List[str] = list(lister_gitiles.get_pages()) + assert len(pages) == 7 diff --git a/swh/lister/gitiles/tests/test_tasks.py b/swh/lister/gitiles/tests/test_tasks.py new file mode 100644 index 00000000..1b89e77b --- /dev/null +++ b/swh/lister/gitiles/tests/test_tasks.py @@ -0,0 +1,28 @@ +# Copyright (C) 2023 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.lister.pattern import ListerStats + + +def test_gitiles_lister_task( + swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker +): + # setup the mocked GitilesLister + lister = mocker.patch("swh.lister.gitiles.tasks.GitilesLister") + lister.from_configfile.return_value = lister + lister.run.return_value = ListerStats(pages=10, origins=500) + + kwargs = dict(url="https://android.googlesource.com", max_pages=1) + + res = swh_scheduler_celery_app.send_task( + "swh.lister.gitiles.tasks.GitilesListerTask", + kwargs=kwargs, + ) + assert res + res.wait() + assert res.successful() + + lister.from_configfile.assert_called_once_with(**kwargs) + lister.run.assert_called_once_with() diff --git a/swh/lister/tests/test_cli.py b/swh/lister/tests/test_cli.py index 29a9a01b..8ce3c607 100644 --- a/swh/lister/tests/test_cli.py +++ b/swh/lister/tests/test_cli.py @@ -45,6 +45,9 @@ lister_args = { "gitweb": { "url": "https://git.distorted.org.uk/~mdw/", }, + "gitiles": { + "instance": "gerrit.googlesource.com", + }, } -- GitLab