From f862004700259234a690779b48dd19c37c93d5a0 Mon Sep 17 00:00:00 2001 From: Antoine Lambert <antoine.lambert@inria.fr> Date: Wed, 27 Jan 2021 18:58:54 +0100 Subject: [PATCH] launchpad: Reimplement lister using new Lister API Port launchpad lister to the swh.lister.pattern.Lister API. Last update date of each listed git repositories is now sent to the scheduler. The lister can work in incremental mode, only modified repositories since the last listing operation will be returned in that case. Closes T2992 --- mypy.ini | 6 + swh/lister/launchpad/__init__.py | 4 +- swh/lister/launchpad/lister.py | 207 +++++++++--------- swh/lister/launchpad/models.py | 16 -- swh/lister/launchpad/tasks.py | 41 ++-- ...esponse0.json => launchpad_response1.json} | 0 ...esponse1.json => launchpad_response2.json} | 0 .../launchpad/tests/data/response2.json | 1 - swh/lister/launchpad/tests/test_lister.py | 164 ++++++++++++-- swh/lister/launchpad/tests/test_tasks.py | 52 +++-- swh/lister/tests/test_cli.py | 3 +- 11 files changed, 305 insertions(+), 189 deletions(-) delete mode 100644 swh/lister/launchpad/models.py rename swh/lister/launchpad/tests/data/{response0.json => launchpad_response1.json} (100%) rename swh/lister/launchpad/tests/data/{response1.json => launchpad_response2.json} (100%) delete mode 100644 swh/lister/launchpad/tests/data/response2.json diff --git a/mypy.ini b/mypy.ini index d32ccd27..fc04b828 100644 --- a/mypy.ini +++ b/mypy.ini @@ -20,6 +20,12 @@ ignore_missing_imports = True [mypy-iso8601.*] ignore_missing_imports = True +[mypy-launchpadlib.*] +ignore_missing_imports = True + +[mypy-lazr.*] +ignore_missing_imports = True + [mypy-pkg_resources.*] ignore_missing_imports = True diff --git a/swh/lister/launchpad/__init__.py b/swh/lister/launchpad/__init__.py index ac9c66b7..5efc2ddc 100644 --- a/swh/lister/launchpad/__init__.py +++ b/swh/lister/launchpad/__init__.py @@ -1,14 +1,12 @@ -# Copyright (C) 2020 the Software Heritage developers +# Copyright (C) 2020-2021 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information def register(): from .lister import LaunchpadLister - from .models import LaunchpadModel return { - "models": [LaunchpadModel], "lister": LaunchpadLister, "task_modules": ["%s.tasks" % __name__], } diff --git a/swh/lister/launchpad/lister.py b/swh/lister/launchpad/lister.py index 4f981f5e..3a5004f1 100644 --- a/swh/lister/launchpad/lister.py +++ b/swh/lister/launchpad/lister.py @@ -1,128 +1,127 @@ -# Copyright (C) 2017-2020 The Software Heritage developers +# Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from datetime import datetime, timedelta -from itertools import count -from typing import Any, Dict, List, Optional, Tuple, Union +from dataclasses import asdict, dataclass +from datetime import datetime +import logging +from typing import Any, Dict, Iterator, Optional -from launchpadlib.launchpad import Launchpad # type: ignore -from lazr.restfulclient.resource import Collection, Entry # type: ignore -from sqlalchemy import func +import iso8601 +from launchpadlib.launchpad import Launchpad +from lazr.restfulclient.resource import Collection -from swh.lister.core.lister_base import ListerBase +from swh.scheduler.interface import SchedulerInterface +from swh.scheduler.model import ListedOrigin -from .models import LaunchpadModel +from ..pattern import CredentialsType, Lister +logger = logging.getLogger(__name__) + +LaunchpadPageType = Iterator[Collection] + + +@dataclass +class LaunchpadListerState: + """State of Launchpad lister""" + + date_last_modified: Optional[datetime] = None + """modification date of last updated repository since last listing""" + + +class LaunchpadLister(Lister[LaunchpadListerState, LaunchpadPageType]): + """ + List git repositories from Launchpad. + + Args: + scheduler: instance of SchedulerInterface + incremental: defines if incremental listing should be used, in that case + only modified or new repositories since last incremental listing operation + will be returned + """ -class LaunchpadLister(ListerBase): - MODEL = LaunchpadModel LISTER_NAME = "launchpad" - instance = "launchpad" - flush_packet_db = 20 - def __init__(self, override_config=None): - super().__init__(override_config=override_config) + def __init__( + self, + scheduler: SchedulerInterface, + incremental: bool = False, + credentials: CredentialsType = None, + ): + super().__init__( + scheduler=scheduler, + url="https://launchpad.net/", + instance="launchpad", + credentials=credentials, + ) + self.incremental = incremental + self.date_last_modified = None + + def state_from_dict(self, d: Dict[str, Any]) -> LaunchpadListerState: + date_last_modified = d.get("date_last_modified") + if date_last_modified is not None: + d["date_last_modified"] = iso8601.parse_date(date_last_modified) + return LaunchpadListerState(**d) + + def state_to_dict(self, state: LaunchpadListerState) -> Dict[str, Any]: + d = asdict(state) + date_last_modified = d.get("date_last_modified") + if date_last_modified is not None: + d["date_last_modified"] = date_last_modified.isoformat() + return d + + def get_pages(self) -> Iterator[LaunchpadPageType]: + """ + Yields an iterator on all git repositories hosted on Launchpad sorted + by last modification date in ascending order. + """ launchpad = Launchpad.login_anonymously( "softwareheritage", "production", version="devel" ) - self.get_repos = launchpad.git_repositories.getRepositories - - def get_model_from_repo(self, repo: Entry) -> Dict[str, Union[str, datetime]]: - return { - "uid": repo.unique_name, - "name": repo.name, - "full_name": repo.name, - "origin_url": repo.git_https_url, - "html_url": repo.web_link, - "origin_type": "git", - "date_last_modified": repo.date_last_modified, - } - - def lib_response_simplified( - self, response: Collection - ) -> List[Dict[str, Union[str, datetime]]]: - return [ - self.get_model_from_repo(repo) for repo in response[: len(response.entries)] - ] - - def db_last_threshold(self) -> Optional[datetime]: - t = self.db_session.query(func.max(self.MODEL.date_last_modified)).first() - if t: - return t[0] - else: - return None - - def ingest_data_lp( - self, identifier: Optional[datetime], checks: bool = False - ) -> Tuple[Collection, dict]: - """ The core data fetch sequence. Request launchpadlib endpoint. Simplify and - filter response list of repositories. Inject repo information into - local db. Queue loader tasks for linked repositories. - - Args: - identifier: Resource identifier. - checks: Additional checks required - """ - response = self.get_repos( - order_by="most neglected first", modified_since_date=identifier + date_last_modified = None + if self.incremental: + date_last_modified = self.state.date_last_modified + get_repos = launchpad.git_repositories.getRepositories + yield get_repos( + order_by="most neglected first", modified_since_date=date_last_modified ) - models_list = self.lib_response_simplified(response) - models_list = self.filter_before_inject(models_list) - if checks: - models_list = self.do_additional_checks(models_list) - if not models_list: - return response, {} - # inject into local db - injected = self.inject_repo_data_into_db(models_list) - # queue workers - self.schedule_missing_tasks(models_list, injected) - return response, injected - - def run(self, max_bound: Optional[datetime] = None) -> Dict[str, Any]: - """ Main entry function. Sequentially fetches repository data - from the service according to the basic outline in the class - docstring, continually fetching sublists until either there - is no next index reference given or the given next index is greater - than the desired max_bound. - - Args: - max_bound : optional date to start at - Returns: - Dict containing listing status + + def get_origins_from_page(self, page: LaunchpadPageType) -> Iterator[ListedOrigin]: """ - status = "uneventful" + Iterate on all git repositories and yield ListedOrigin instances. + """ + assert self.lister_obj.id is not None + + for repo in page: + + origin_url = repo.git_https_url - def ingest_git_repos(): - threshold = max_bound - for i in count(1): - response, injected_repos = self.ingest_data_lp(threshold) - if not response and not injected_repos: - return + # filter out origins with invalid URL + if not origin_url.startswith("https://"): + continue - # batch is empty - if len(response.entries) == 0: - return + last_update = repo.date_last_modified - first: datetime = response[0].date_last_modified - last: datetime = response[len(response.entries) - 1].date_last_modified + self.date_last_modified = last_update - next_date = last - timedelta(seconds=15) + logger.debug("Found origin %s last updated on %s", origin_url, last_update) - if next_date <= first: - delta = last - first - next_date = last - delta / 2 + yield ListedOrigin( + lister_id=self.lister_obj.id, + visit_type="git", + url=origin_url, + last_update=last_update, + ) - threshold = next_date - yield i + def finalize(self) -> None: + if self.date_last_modified is None: + return - for i in ingest_git_repos(): - if (i % self.flush_packet_db) == 0: - self.db_session.commit() - self.db_session = self.mk_session() - status = "eventful" + if self.incremental and ( + self.state.date_last_modified is None + or self.date_last_modified > self.state.date_last_modified + ): + self.state.date_last_modified = self.date_last_modified - self.db_session.commit() - self.db_session = self.mk_session() - return {"status": status} + self.updated = True diff --git a/swh/lister/launchpad/models.py b/swh/lister/launchpad/models.py deleted file mode 100644 index daa0a4bd..00000000 --- a/swh/lister/launchpad/models.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (C) 2017-2020 the Software Heritage developers -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from sqlalchemy import Column, Date, String - -from swh.lister.core.models import ModelBase - - -class LaunchpadModel(ModelBase): - """a Launchpad repository""" - - __tablename__ = "launchpad_repo" - - uid = Column(String, primary_key=True) - date_last_modified = Column(Date, index=True) diff --git a/swh/lister/launchpad/tasks.py b/swh/lister/launchpad/tasks.py index 2ee98882..a67d3655 100644 --- a/swh/lister/launchpad/tasks.py +++ b/swh/lister/launchpad/tasks.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2020 The Software Heritage developers +# Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -8,27 +8,20 @@ from celery import shared_task from .lister import LaunchpadLister +@shared_task(name=__name__ + ".ping") +def ping(): + return "OK" + + +@shared_task(name=__name__ + ".FullLaunchpadLister") +def list_launchpad_full(**lister_args): + """Full listing of git repositories hosted on Launchpad""" + lister = LaunchpadLister.from_configfile(**lister_args) + return lister.run().dict() + + @shared_task(name=__name__ + ".IncrementalLaunchpadLister") -def list_launchpad_incremental(threshold, **lister_args): - """Incremental update - """ - lister = LaunchpadLister(**lister_args) - return lister.run(max_bound=threshold) - - -@shared_task(name=__name__ + ".FullLaunchpadLister", bind=True) -def list_launchpad_full(self, **lister_args): - """Full update of Launchpad - """ - self.log.debug("%s OK, spawned full task" % (self.name)) - return list_launchpad_incremental(threshold=None, **lister_args) - - -@shared_task(name=__name__ + ".NewLaunchpadLister", bind=True) -def list_launchpad_new(self, **lister_args): - """Update new entries of Launchpad - """ - lister = LaunchpadLister(**lister_args) - threshold = lister.db_last_threshold() - self.log.debug("%s OK, spawned new task" % (self.name)) - return list_launchpad_incremental(threshold=threshold, **lister_args) +def list_launchpad_incremental(**lister_args): + """Incremental listing of git repositories hosted on Launchpad""" + lister = LaunchpadLister.from_configfile(**lister_args, incremental=True) + return lister.run().dict() diff --git a/swh/lister/launchpad/tests/data/response0.json b/swh/lister/launchpad/tests/data/launchpad_response1.json similarity index 100% rename from swh/lister/launchpad/tests/data/response0.json rename to swh/lister/launchpad/tests/data/launchpad_response1.json diff --git a/swh/lister/launchpad/tests/data/response1.json b/swh/lister/launchpad/tests/data/launchpad_response2.json similarity index 100% rename from swh/lister/launchpad/tests/data/response1.json rename to swh/lister/launchpad/tests/data/launchpad_response2.json diff --git a/swh/lister/launchpad/tests/data/response2.json b/swh/lister/launchpad/tests/data/response2.json deleted file mode 100644 index 0637a088..00000000 --- a/swh/lister/launchpad/tests/data/response2.json +++ /dev/null @@ -1 +0,0 @@ -[] \ No newline at end of file diff --git a/swh/lister/launchpad/tests/test_lister.py b/swh/lister/launchpad/tests/test_lister.py index 543f6ec7..d36f0265 100644 --- a/swh/lister/launchpad/tests/test_lister.py +++ b/swh/lister/launchpad/tests/test_lister.py @@ -1,30 +1,156 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from datetime import datetime +import json +from pathlib import Path +from typing import List -def test_launchpad_lister(lister_launchpad, datadir): - lister_launchpad.run() +import pytest - assert len(lister_launchpad.get_repos.mock_calls) == 3 +from ..lister import LaunchpadLister - r = lister_launchpad.scheduler.search_tasks(task_type="load-git") - assert len(r) == 30 - for row in r: - assert row["type"] == "load-git" - # arguments check - args = row["arguments"]["args"] - assert len(args) == 0 +class _Repo: + def __init__(self, d: dict): + for key in d.keys(): + if key == "date_last_modified": + setattr(self, key, datetime.fromisoformat(d[key])) + else: + setattr(self, key, d[key]) - # kwargs - kwargs = row["arguments"]["kwargs"] - assert set(kwargs.keys()) == {"url"} - url = kwargs["url"] - assert url.startswith("https://git.launchpad.net") +class _Collection: + entries: List[_Repo] = [] - assert row["policy"] == "recurring" - assert row["priority"] is None - assert row["retries_left"] == 3 + def __init__(self, file): + self.entries = [_Repo(r) for r in file] + + def __getitem__(self, key): + return self.entries[key] + + def __len__(self): + return len(self.entries) + + +def _launchpad_response(datadir, datafile): + return _Collection(json.loads(Path(datadir, datafile).read_text())) + + +@pytest.fixture +def launchpad_response1(datadir): + return _launchpad_response(datadir, "launchpad_response1.json") + + +@pytest.fixture +def launchpad_response2(datadir): + return _launchpad_response(datadir, "launchpad_response2.json") + + +def _mock_getRepositories(mocker, launchpad_response): + mock_launchpad = mocker.patch("swh.lister.launchpad.lister.Launchpad") + mock_getRepositories = mock_launchpad.git_repositories.getRepositories + mock_getRepositories.return_value = launchpad_response + mock_launchpad.login_anonymously.return_value = mock_launchpad + + return mock_getRepositories + + +def _check_listed_origins(scheduler_origins, launchpad_response): + for origin in launchpad_response: + + filtered_origins = [ + o for o in scheduler_origins if o.url == origin.git_https_url + ] + + assert len(filtered_origins) == 1 + + assert filtered_origins[0].last_update == origin.date_last_modified + + +def test_lister_from_configfile(swh_scheduler_config, mocker): + load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar") + load_from_envvar.return_value = { + "scheduler": {"cls": "local", **swh_scheduler_config}, + "credentials": {}, + } + lister = LaunchpadLister.from_configfile() + assert lister.scheduler is not None + assert lister.credentials is not None + + +def test_launchpad_full_lister(swh_scheduler, mocker, launchpad_response1): + mock_getRepositories = _mock_getRepositories(mocker, launchpad_response1) + lister = LaunchpadLister(scheduler=swh_scheduler) + stats = lister.run() + + assert not lister.incremental + assert lister.updated + assert stats.pages == 1 + assert stats.origins == len(launchpad_response1) + + mock_getRepositories.assert_called_once_with( + order_by="most neglected first", modified_since_date=None + ) + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + assert len(scheduler_origins) == len(launchpad_response1) + + _check_listed_origins(scheduler_origins, launchpad_response1) + + +def test_launchpad_incremental_lister( + swh_scheduler, mocker, launchpad_response1, launchpad_response2 +): + mock_getRepositories = _mock_getRepositories(mocker, launchpad_response1) + lister = LaunchpadLister(scheduler=swh_scheduler, incremental=True) + stats = lister.run() + + assert lister.incremental + assert lister.updated + assert stats.pages == 1 + assert stats.origins == len(launchpad_response1) + + mock_getRepositories.assert_called_once_with( + order_by="most neglected first", modified_since_date=None + ) + + lister_state = lister.get_state_from_scheduler() + assert lister_state.date_last_modified == launchpad_response1[-1].date_last_modified + + mock_getRepositories = _mock_getRepositories(mocker, launchpad_response2) + lister = LaunchpadLister(scheduler=swh_scheduler, incremental=True) + stats = lister.run() + + assert lister.incremental + assert lister.updated + assert stats.pages == 1 + assert stats.origins == len(launchpad_response2) + + mock_getRepositories.assert_called_once_with( + order_by="most neglected first", + modified_since_date=lister_state.date_last_modified, + ) + + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + + assert len(scheduler_origins) == len(launchpad_response1) + len(launchpad_response2) + + _check_listed_origins(scheduler_origins, launchpad_response1) + _check_listed_origins(scheduler_origins, launchpad_response2) + + +def test_launchpad_lister_invalid_url_filtering( + swh_scheduler, mocker, +): + invalid_origin = [_Repo({"git_https_url": "tag:launchpad.net:2008:redacted",})] + _mock_getRepositories(mocker, invalid_origin) + lister = LaunchpadLister(scheduler=swh_scheduler) + stats = lister.run() + + assert not lister.updated + assert stats.pages == 1 + assert stats.origins == 0 diff --git a/swh/lister/launchpad/tests/test_tasks.py b/swh/lister/launchpad/tests/test_tasks.py index 46423734..f7247a34 100644 --- a/swh/lister/launchpad/tests/test_tasks.py +++ b/swh/lister/launchpad/tests/test_tasks.py @@ -1,42 +1,54 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from unittest.mock import patch +from swh.lister.pattern import ListerStats -@patch("swh.lister.launchpad.tasks.LaunchpadLister") -def test_new(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): - # setup the mocked LaunchpadLister - lister.return_value = lister - lister.run.return_value = None +def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): + res = swh_scheduler_celery_app.send_task("swh.lister.launchpad.tasks.ping") + assert res + res.wait() + assert res.successful() + assert res.result == "OK" + + +def test_launchpad_full_listing_task( + swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker +): + lister = mocker.patch("swh.lister.launchpad.tasks.LaunchpadLister") + lister.from_configfile.return_value = lister + stats = ListerStats(pages=1, origins=28000) + lister.run.return_value = stats res = swh_scheduler_celery_app.send_task( - "swh.lister.launchpad.tasks.NewLaunchpadLister" + "swh.lister.launchpad.tasks.FullLaunchpadLister" ) assert res res.wait() assert res.successful() + assert res.result == stats.dict() - assert lister.call_count == 2 - lister.db_last_threshold.assert_called_once() - lister.run.assert_called_once() + lister.from_configfile.assert_called_once_with() + lister.run.assert_called_once_with() -@patch("swh.lister.launchpad.tasks.LaunchpadLister") -def test_full(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): - # setup the mocked LaunchpadLister - lister.return_value = lister - lister.run.return_value = None +def test_launchpad_incremental_listing_task( + swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker +): + lister = mocker.patch("swh.lister.launchpad.tasks.LaunchpadLister") + lister.from_configfile.return_value = lister + stats = ListerStats(pages=1, origins=200) + lister.run.return_value = stats res = swh_scheduler_celery_app.send_task( - "swh.lister.launchpad.tasks.FullLaunchpadLister" + "swh.lister.launchpad.tasks.IncrementalLaunchpadLister" ) assert res res.wait() assert res.successful() + assert res.result == stats.dict() - lister.assert_called_once() - lister.db_last_threshold.assert_not_called() - lister.run.assert_called_once_with(max_bound=None) + lister.from_configfile.assert_called_once_with(incremental=True) + lister.run.assert_called_once_with() diff --git a/swh/lister/tests/test_cli.py b/swh/lister/tests/test_cli.py index 2eadc1a4..6066dc68 100644 --- a/swh/lister/tests/test_cli.py +++ b/swh/lister/tests/test_cli.py @@ -36,8 +36,7 @@ def test_get_lister(swh_scheduler_config): db_url = init_db().url() # Drop launchpad lister from the lister to check, its test setup is more involved # than the other listers and it's not currently done here - supported_listers = set(SUPPORTED_LISTERS) - {"launchpad"} - for lister_name in supported_listers: + for lister_name in SUPPORTED_LISTERS: lst = get_lister( lister_name, db_url, -- GitLab