diff --git a/setup.py b/setup.py
index 0ad6aa5fddca7fc6f74acf0b505a72e7b92f3ee3..456cd0e7ffa2f9767257bc43602a8f37e3213a78 100755
--- a/setup.py
+++ b/setup.py
@@ -68,6 +68,7 @@ setup(
         lister.fedora=swh.lister.fedora:register
         lister.gitea=swh.lister.gitea:register
         lister.github=swh.lister.github:register
+        lister.gitiles=swh.lister.gitiles:register
         lister.gitlab=swh.lister.gitlab:register
         lister.gitweb=swh.lister.gitweb:register
         lister.gnu=swh.lister.gnu:register
diff --git a/swh/lister/gitiles/__init__.py b/swh/lister/gitiles/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f71bf370711a0ee74d498e4ae2d5400299dbaf8
--- /dev/null
+++ b/swh/lister/gitiles/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (C) 2023 The Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+def register():
+    """Describe the gitiles lister plugin: its class and its celery task modules."""
+    from .lister import GitilesLister
+
+    return {
+        "lister": GitilesLister,
+        "task_modules": [f"{__name__}.tasks"],
+    }
diff --git a/swh/lister/gitiles/lister.py b/swh/lister/gitiles/lister.py
new file mode 100644
index 0000000000000000000000000000000000000000..b92e9e8d91228890e87d66b004d732944d89043b
--- /dev/null
+++ b/swh/lister/gitiles/lister.py
@@ -0,0 +1,101 @@
+# Copyright (C) 2023 The Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from json import loads
+import logging
+from typing import Iterator, Optional
+
+from swh.lister.pattern import CredentialsType, StatelessLister
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+logger = logging.getLogger(__name__)
+
+# A "page" of this lister is the clone url of one single repository
+Origin = str
+
+# Guard against cross-site script inclusion which gitiles puts in front of its
+# json output, see https://github.com/google/gitiles/issues/263
+XSSI_PREFIX = ")]}'"
+
+
+class GitilesLister(StatelessLister[Origin]):
+    """Lister class for Gitiles repositories.
+
+    This lister will retrieve the list of published git repositories by
+    parsing the json page found at the url `<url>?format=json`.
+
+    """
+
+    LISTER_NAME = "gitiles"
+
+    def __init__(
+        self,
+        scheduler: SchedulerInterface,
+        url: Optional[str] = None,
+        instance: Optional[str] = None,
+        credentials: Optional[CredentialsType] = None,
+        max_origins_per_page: Optional[int] = None,
+        max_pages: Optional[int] = None,
+        enable_origins: bool = True,
+    ):
+        """Build a lister for one Gitiles instance.
+
+        Args:
+            url: (Optional) Root URL of the Gitiles instance, i.e. url of the index of
+                published git repositories on this instance. Defaults to
+                :file:`https://{instance}` if unset.
+            instance: Name of gitiles instance. Defaults to url's network location
+                if unset.
+
+        All the other arguments are handed over unchanged to the parent class.
+
+        """
+        super().__init__(
+            scheduler=scheduler,
+            url=url,
+            instance=instance,
+            credentials=credentials,
+            max_origins_per_page=max_origins_per_page,
+            max_pages=max_pages,
+            enable_origins=enable_origins,
+        )
+
+        # ask explicitly for json, on top of the `format=json` query parameter
+        self.session.headers.update({"Accept": "application/json"})
+
+    def get_pages(self) -> Iterator[Origin]:
+        """Generate git 'project' URLs found on the current Gitiles server.
+
+        Projects whose json description lacks a clone url are logged and skipped.
+
+        Yields:
+            The clone url of each published repository, one url per page.
+
+        """
+        response = self.http_request(f"{self.url}?format=json")
+        text = response.text
+        # Only the marker itself is dropped: the line break following it, whatever
+        # its flavor, is leading whitespace the json parser ignores.
+        if text.startswith(XSSI_PREFIX):
+            text = text[len(XSSI_PREFIX) :]
+
+        data = loads(text)
+
+        for name, repo in data.items():
+            clone_url = repo.get("clone_url")
+            if not clone_url:
+                logger.warning("Skipping gitiles project %s without clone url", name)
+                continue
+            yield clone_url
+
+    def get_origins_from_page(self, origin: Origin) -> Iterator[ListedOrigin]:
+        """Convert the clone url of a repository into its git ListedOrigin."""
+        assert self.lister_obj.id is not None
+
+        yield ListedOrigin(
+            lister_id=self.lister_obj.id,
+            url=origin,
+            visit_type="git",
+        )
diff --git a/swh/lister/gitiles/tasks.py b/swh/lister/gitiles/tasks.py
new file mode 100644
index 0000000000000000000000000000000000000000..07f292bcfaad4fa13182b5c43454fd30aa865f4c
--- /dev/null
+++ b/swh/lister/gitiles/tasks.py
@@ -0,0 +1,21 @@
+# Copyright (C) 2023 The Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import Dict
+
+from celery import shared_task
+
+from .lister import GitilesLister
+
+
+@shared_task(name=f"{__name__}.GitilesListerTask")
+def list_gitiles(**lister_args) -> Dict[str, str]:
+    """Lister task for Gitiles instances.
+
+    The lister is built out of the configuration file and ``lister_args``, then
+    run; the statistics of that listing are returned as a dict.
+
+    """
+    lister = GitilesLister.from_configfile(**lister_args)
+    return lister.run().dict()
diff --git a/swh/lister/gitiles/tests/__init__.py b/swh/lister/gitiles/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/swh/lister/gitiles/tests/data/https_android.googlesource.com/,format=json b/swh/lister/gitiles/tests/data/https_android.googlesource.com/,format=json
new file mode 100644
index 0000000000000000000000000000000000000000..8328c7c803b77de735e4130f68b4d4541f78888f
--- /dev/null
+++ b/swh/lister/gitiles/tests/data/https_android.googlesource.com/,format=json
@@ -0,0 +1,35 @@
+)]}'
+{
+  "accessories/manifest": {
+    "name": "accessories/manifest",
+    "clone_url": "https://android.googlesource.com/accessories/manifest"
+  },
+  "device/google/gs201": {
+    "name": "device/google/gs201",
+    "clone_url": "https://android.googlesource.com/device/google/gs201",
+    "description": "Bug: 244231765"
+  },
+  "device/google/sunfish-kernel": {
+    "name": "device/google/sunfish-kernel",
+    "clone_url": "https://android.googlesource.com/device/google/sunfish-kernel",
+    "description": "Bug: 160260413"
+  },
+  "device/lge/mako-kernel": {
+    "name": "device/lge/mako-kernel",
+    "clone_url": "https://android.googlesource.com/device/lge/mako-kernel"
+  },
+  "kernel/msm-extra/camera-devicetree": {
+    "name": "kernel/msm-extra/camera-devicetree",
+    "clone_url": "https://android.googlesource.com/kernel/msm-extra/camera-devicetree",
+    "description": "Bug: 167236823"
+  },
+  "platform/external/google-fonts/rubik": {
+    "name": "platform/external/google-fonts/rubik",
+    "clone_url": "https://android.googlesource.com/platform/external/google-fonts/rubik",
+    "description": "Bug: 122303069"
+  },
+  "trusty/vendor/google/aosp": {
+    "name": "trusty/vendor/google/aosp",
+    "clone_url": "https://android.googlesource.com/trusty/vendor/google/aosp"
+  }
+}
diff --git a/swh/lister/gitiles/tests/data/https_android.googlesource.com/README b/swh/lister/gitiles/tests/data/https_android.googlesource.com/README
new file mode 100644
index 0000000000000000000000000000000000000000..f2074babfae8a5bfb3f44a4b64e76a72d372de53
--- /dev/null
+++ b/swh/lister/gitiles/tests/data/https_android.googlesource.com/README
@@ -0,0 +1,2 @@
+These files are a partial dump of https://android.googlesource.com/?format=json.
+ diff --git a/swh/lister/gitiles/tests/data/https_gerrit.googlesource.com/,format=json b/swh/lister/gitiles/tests/data/https_gerrit.googlesource.com/,format=json new file mode 100644 index 0000000000000000000000000000000000000000..37c0378f83a18ca23ebc91287c8e96b6c2c85345 --- /dev/null +++ b/swh/lister/gitiles/tests/data/https_gerrit.googlesource.com/,format=json @@ -0,0 +1,16 @@ +{ + "apps/analytics-etl": { + "name": "apps/analytics-etl", + "clone_url": "https://gerrit.googlesource.com/apps/analytics-etl", + "description": "Spark ETL to extra analytics data from Gerrit Projects using the Analytics plugin" + }, + "apps/kibana-dashboard": { + "name": "apps/kibana-dashboard", + "clone_url": "https://gerrit.googlesource.com/apps/kibana-dashboard" + }, + "apps/reviewit": { + "name": "apps/reviewit", + "clone_url": "https://gerrit.googlesource.com/apps/reviewit", + "description": "The \u0027Review It!?\u0027 app is an Android application for Gerrit that allows sorting of incoming changes and review of small/trivial changes.\n\nThis is not an official Google product." + } +} diff --git a/swh/lister/gitiles/tests/data/https_gerrit.googlesource.com/README b/swh/lister/gitiles/tests/data/https_gerrit.googlesource.com/README new file mode 100644 index 0000000000000000000000000000000000000000..1a016a2dfde5676d2ac7e970786a46d431a3a22d --- /dev/null +++ b/swh/lister/gitiles/tests/data/https_gerrit.googlesource.com/README @@ -0,0 +1,2 @@ +These files are a partial dump of https://gerrit.googlesource.com/?format=json. 
+
diff --git a/swh/lister/gitiles/tests/test_lister.py b/swh/lister/gitiles/tests/test_lister.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b1211766787f54e5c324595338cbc0ed42537c7
--- /dev/null
+++ b/swh/lister/gitiles/tests/test_lister.py
@@ -0,0 +1,113 @@
+# Copyright (C) 2023 The Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+from typing import List
+
+import pytest
+
+from swh.lister import __version__
+from swh.lister.gitiles.lister import GitilesLister
+from swh.lister.pattern import ListerStats
+
+MAIN_INSTANCE = "android.googlesource.com"
+MAIN_INSTANCE_URL = f"https://{MAIN_INSTANCE}"
+
+
+def test_lister_gitiles_instantiate(swh_scheduler):
+    """Building a lister out of either a url or an instance is supported."""
+    url = MAIN_INSTANCE_URL
+    lister = GitilesLister(swh_scheduler, url=url)
+    assert lister is not None
+    assert lister.url == url
+
+    lister = GitilesLister(swh_scheduler, instance=MAIN_INSTANCE)
+    assert lister is not None
+    assert lister.url == url
+
+
+def test_lister_gitiles_fail_to_instantiate(swh_scheduler):
+    """Building a lister with neither a url nor an instance should raise."""
+    # ... nothing tells which Gitiles server to list
+    with pytest.raises(ValueError, match="'url' or 'instance'"):
+        GitilesLister(swh_scheduler)
+
+
+def test_lister_gitiles_get_pages(requests_mock_datadir, swh_scheduler):
+    """Computing the number of pages scraped during a listing."""
+    url = MAIN_INSTANCE_URL
+    lister_gitiles = GitilesLister(swh_scheduler, instance=MAIN_INSTANCE)
+
+    expected_nb_origins = 7
+
+    repos: List[str] = list(lister_gitiles.get_pages())
+    assert len(repos) == expected_nb_origins
+
+    for listed_url in repos:
+        assert listed_url.startswith(url)
+
+
+@pytest.mark.parametrize(
+    "url,expected_nb_origins",
+    [(MAIN_INSTANCE_URL, 7), ("https://gerrit.googlesource.com", 3)],
+)
+def test_lister_gitiles_run(
+    requests_mock_datadir, swh_scheduler, url, expected_nb_origins
+):
+    """Gitiles lister nominal listing case."""
+    lister_gitiles = GitilesLister(swh_scheduler, url=url)
+
+    stats = lister_gitiles.run()
+
+    # each repository is a page of its own, hence as many pages as origins
+    assert stats == ListerStats(pages=expected_nb_origins, origins=expected_nb_origins)
+
+    # test page parsing
+    scheduler_origins = swh_scheduler.get_listed_origins(
+        lister_gitiles.lister_obj.id
+    ).results
+    assert len(scheduler_origins) == expected_nb_origins
+
+    assert url.startswith("https://")
+
+    # test listed repositories
+    for listed_origin in scheduler_origins:
+        assert listed_origin.visit_type == "git"
+        assert listed_origin.url.startswith(url)
+        assert listed_origin.url.startswith("https://")
+        assert listed_origin.last_update is None
+
+    # test user agent content
+    for request in requests_mock_datadir.request_history:
+        assert "User-Agent" in request.headers
+        user_agent = request.headers["User-Agent"]
+        assert "Software Heritage gitiles lister" in user_agent
+        assert __version__ in user_agent
+
+
+def test_lister_gitiles_get_pages_with_pages_and_retry(
+    requests_mock_datadir, requests_mock, datadir, mocker, swh_scheduler
+):
+    """Rate-limited pages are retried after some time so ingestion can proceed."""
+    url = MAIN_INSTANCE_URL
+    with open(
+        os.path.join(datadir, f"https_{MAIN_INSTANCE}/,format=json"), "rb"
+    ) as page:
+        # rate limited twice before the listing is eventually served
+        requests_mock.get(
+            url,
+            [
+                {"content": None, "status_code": 429},
+                {"content": None, "status_code": 429},
+                {"content": page.read(), "status_code": 200},
+            ],
+        )
+
+    lister_gitiles = GitilesLister(swh_scheduler, url=url)
+
+    # no need to actually wait between the attempts
+    mocker.patch.object(lister_gitiles.http_request.retry, "sleep")
+
+    pages: List[str] = list(lister_gitiles.get_pages())
+    assert len(pages) == 7
diff --git a/swh/lister/gitiles/tests/test_tasks.py b/swh/lister/gitiles/tests/test_tasks.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b89e77b9ac36d1c91bfa81aa1dc2ba794a8b4df
--- /dev/null
+++ b/swh/lister/gitiles/tests/test_tasks.py
@@ -0,0 +1,29 @@
+# Copyright (C) 2023 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.pattern import ListerStats
+
+
+def test_gitiles_lister_task(
+    swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
+):
+    """The gitiles celery task builds the lister from its arguments and runs it."""
+    # setup the mocked GitilesLister
+    lister = mocker.patch("swh.lister.gitiles.tasks.GitilesLister")
+    lister.from_configfile.return_value = lister
+    lister.run.return_value = ListerStats(pages=10, origins=500)
+
+    kwargs = dict(url="https://android.googlesource.com", max_pages=1)
+
+    res = swh_scheduler_celery_app.send_task(
+        "swh.lister.gitiles.tasks.GitilesListerTask",
+        kwargs=kwargs,
+    )
+    assert res
+    res.wait()
+    assert res.successful()
+
+    lister.from_configfile.assert_called_once_with(**kwargs)
+    lister.run.assert_called_once_with()
diff --git a/swh/lister/tests/test_cli.py b/swh/lister/tests/test_cli.py
index 29a9a01baf19e83ab18ec22f623a60c56e4fdc44..8ce3c6072fba5b1b080662ffc66c315db95488d5 100644
--- a/swh/lister/tests/test_cli.py
+++ b/swh/lister/tests/test_cli.py
@@ -45,6 +45,9 @@ lister_args = {
     "gitweb": {
         "url": "https://git.distorted.org.uk/~mdw/",
     },
+    "gitiles": {
+        "instance": "gerrit.googlesource.com",
+    },
 }
 
 