From 82ee0951285db2642ed6e59eab81ed53d35b2e6c Mon Sep 17 00:00:00 2001
From: Franck Bret <franck.bret@octobus.net>
Date: Thu, 21 Dec 2023 09:54:29 +0100
Subject: [PATCH] Elm stateful lister

Use another Api endpoint that helps the lister to be stateful.
The Api endpoint used needs a ``since`` value that represents a sequential index
in the history.
The ``all_packages_count`` state helps in storing a count which will be used as
``since`` argument on the next run.
---
Review notes (not part of the commit; line layout restored, payload unchanged):
- ElmLister.get_pages() calls response.json() twice, once for the count and
  once for the loop; binding the decoded list to a local would decode it once.
- test_elm_lister and the first run of test_elm_lister_incremental make the
  same assertions against the since=0 fixture.

 swh/lister/elm/__init__.py                    | 22 +++---
 swh/lister/elm/lister.py                      | 67 ++++++++++++++----
 .../all-packages_since_0                      |  1 +
 .../all-packages_since_3                      |  1 +
 .../all-packages_since_4                      |  1 +
 .../https_package.elm-lang.org/search.json    | 20 ------
 swh/lister/elm/tests/test_lister.py           | 68 +++++++++++++++++--
 7 files changed, 133 insertions(+), 47 deletions(-)
 create mode 100644 swh/lister/elm/tests/data/https_package.elm-lang.org/all-packages_since_0
 create mode 100644 swh/lister/elm/tests/data/https_package.elm-lang.org/all-packages_since_3
 create mode 100644 swh/lister/elm/tests/data/https_package.elm-lang.org/all-packages_since_4
 delete mode 100644 swh/lister/elm/tests/data/https_package.elm-lang.org/search.json

diff --git a/swh/lister/elm/__init__.py b/swh/lister/elm/__init__.py
index 614d8efd..53634f53 100644
--- a/swh/lister/elm/__init__.py
+++ b/swh/lister/elm/__init__.py
@@ -12,20 +12,24 @@ Elm lister
 
 Additional packages for the language can be searched from the `Packages`_ website
 and installed with `elm install`_ command. The Elm packages website also provides a
-`Http Api endpoint`_ listing all available packages.
+`Http Api endpoint`_ listing all available packages versions since a count of
+package versions.
 
-Elm origins are Git repositories hosted on Github. Each repository must provide its
-packaged releases using the Github release system.
+Elm origins are Git repositories hosted on GitHub. Each repository must provide its
+packaged releases using the GitHub release system.
 
 As of July 2023 `Packages`_ list 1746 packages.
 
 Origins retrieving strategy
 ---------------------------
 
-To build a list of origins we make a GET request to the `Http Api endpoint`_ that returns
-a Json array of objects.
+To build a list of origins we make a GET request to the `Http Api endpoint`_ with a
+``since`` argument as a sequential index in the history which returns a Json array
+of strings.
+Each string represents a new version for a package. The string is split to get the
+``name`` of the package.
 The origin url for each package is constructed with the information of corresponding
-`name` entry which represents the suffix of Github repositories (org/project_name).
+``name`` entry which represents the suffix of GitHub repositories (*org*/*project_name*).
 
 Page listing
 ------------
@@ -35,8 +39,8 @@ There is only one page listing all origins url.
 Origins from page
 -----------------
 
-The lister is stateless and yields all origins url from one page. It is a list of package
-repository url.
+The lister is stateful and yields all new origins url from one page since the last run.
+It is a list of package repository url.
 
 Running tests
 -------------
@@ -63,7 +67,7 @@ You can follow lister execution by displaying logs of swh-lister service::
 .. _Elm: https://elm-lang.org/
 .. _Packages: https://package.elm-lang.org/
 .. _elm install: https://guide.elm-lang.org/install/elm.html#elm-install
-.. _Http Api endpoint: https://package.elm-lang.org/search.json
+.. _Http Api endpoint: https://package.elm-lang.org/all-packages/since/5000
 """
 
 
diff --git a/swh/lister/elm/lister.py b/swh/lister/elm/lister.py
index 545aad8a..d7fe6f57 100644
--- a/swh/lister/elm/lister.py
+++ b/swh/lister/elm/lister.py
@@ -3,36 +3,47 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+from dataclasses import asdict, dataclass
 import logging
-from typing import Any, Dict, Iterator, List, Optional
+from typing import Any, Dict, Iterator, Optional, Set
 
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
-from ..pattern import CredentialsType, StatelessLister
+from ..pattern import CredentialsType, Lister
 
 logger = logging.getLogger(__name__)
 
 # Aliasing the page results returned by `get_pages` method from the lister.
-ElmListerPage = List[Dict[str, Any]]
+ElmListerPage = Set[str]
 
 
-class ElmLister(StatelessLister[ElmListerPage]):
+@dataclass
+class ElmListerState:
+    """Store lister state for incremental mode operations"""
+
+    all_packages_count: Optional[int] = None
+    """Store the count of all existing packages, used as ``since`` argument of
+    API endpoint url.
+    """
+
+
+class ElmLister(Lister[ElmListerState, ElmListerPage]):
     """List Elm packages origins"""
 
     LISTER_NAME = "elm"
     VISIT_TYPE = "git"  # Elm origins url are Git repositories
     INSTANCE = "elm"
 
-    SEARCH_URL = "https://package.elm-lang.org/search.json"
-
+    BASE_URL = "https://package.elm-lang.org"
+    ALL_PACKAGES_URL_PATTERN = "{base_url}/all-packages/since/{since}"
     REPO_URL_PATTERN = "https://github.com/{name}"
 
     def __init__(
         self,
         scheduler: SchedulerInterface,
         credentials: Optional[CredentialsType] = None,
-        url: str = SEARCH_URL,
+        url: str = BASE_URL,
         instance: str = INSTANCE,
         max_origins_per_page: Optional[int] = None,
         max_pages: Optional[int] = None,
@@ -47,25 +58,47 @@ class ElmLister(StatelessLister[ElmListerPage]):
             max_pages=max_pages,
             enable_origins=enable_origins,
         )
+        self.all_packages_count: int = 0
         self.session.headers.update({"Accept": "application/json"})
 
+    def state_from_dict(self, d: Dict[str, Any]) -> ElmListerState:
+        return ElmListerState(**d)
+
+    def state_to_dict(self, state: ElmListerState) -> Dict[str, Any]:
+        return asdict(state)
+
     def get_pages(self) -> Iterator[ElmListerPage]:
         """Yield an iterator which returns 'page'
 
-        It uses the unique Http api endpoint `https://package.elm-lang.org/search.json`
-        to get a list of names corresponding to Github repository url suffixes.
+        It uses the Http api endpoint ``https://package.elm-lang.org/all-packages/since/:since``
+        to get a list of packages versions from where we get names corresponding to GitHub
+        repository url suffixes.
 
         There is only one page that list all origins urls.
         """
-        response = self.http_request(self.url)
-        yield response.json()
+
+        if not self.state.all_packages_count:
+            since = 0
+        else:
+            since = self.state.all_packages_count
+
+        response = self.http_request(
+            self.ALL_PACKAGES_URL_PATTERN.format(base_url=self.url, since=since)
+        )
+        # We’ll save this to the state in finalize()
+        self.all_packages_count = len(response.json()) + since
+
+        res = set()
+        for entry in response.json():
+            res.add(entry.split("@")[0])
+
+        yield res
 
     def get_origins_from_page(self, page: ElmListerPage) -> Iterator[ListedOrigin]:
         """Iterate on all pages and yield ListedOrigin instances"""
         assert self.lister_obj.id is not None
 
-        for entry in page:
-            name: str = entry["name"]
+        for name in page:
             repo_url: str = self.REPO_URL_PATTERN.format(name=name)
 
             yield ListedOrigin(
@@ -74,3 +107,11 @@ class ElmLister(StatelessLister[ElmListerPage]):
                 url=repo_url,
                 last_update=None,
             )
+
+    def finalize(self) -> None:
+        if (
+            self.state.all_packages_count is None
+            or self.all_packages_count > self.state.all_packages_count
+        ):
+            self.state.all_packages_count = self.all_packages_count
+            self.updated = True
diff --git a/swh/lister/elm/tests/data/https_package.elm-lang.org/all-packages_since_0 b/swh/lister/elm/tests/data/https_package.elm-lang.org/all-packages_since_0
new file mode 100644
index 00000000..fad5548f
--- /dev/null
+++ b/swh/lister/elm/tests/data/https_package.elm-lang.org/all-packages_since_0
@@ -0,0 +1 @@
+["mercurymedia/elm-ag-grid@20.0.0","elm-toulouse/cbor@3.4.0","elm-toulouse/cbor@3.3.0"]
diff --git a/swh/lister/elm/tests/data/https_package.elm-lang.org/all-packages_since_3 b/swh/lister/elm/tests/data/https_package.elm-lang.org/all-packages_since_3
new file mode 100644
index 00000000..9b21f025
--- /dev/null
+++ b/swh/lister/elm/tests/data/https_package.elm-lang.org/all-packages_since_3
@@ -0,0 +1 @@
+["miniBill/elm-avataaars@1.1.1"]
diff --git a/swh/lister/elm/tests/data/https_package.elm-lang.org/all-packages_since_4 b/swh/lister/elm/tests/data/https_package.elm-lang.org/all-packages_since_4
new file mode 100644
index 00000000..fe51488c
--- /dev/null
+++ b/swh/lister/elm/tests/data/https_package.elm-lang.org/all-packages_since_4
@@ -0,0 +1 @@
+[]
diff --git a/swh/lister/elm/tests/data/https_package.elm-lang.org/search.json b/swh/lister/elm/tests/data/https_package.elm-lang.org/search.json
deleted file mode 100644
index cdb20cec..00000000
--- a/swh/lister/elm/tests/data/https_package.elm-lang.org/search.json
+++ /dev/null
@@ -1,20 +0,0 @@
-[
-    {
-        "name": "elm/bytes",
-        "summary": "Work with sequences of bytes (a.k.a. ArrayBuffer, typed arrays, DataView)",
-        "license": "BSD-3-Clause",
-        "version": "1.0.8"
-    },
-    {
-        "name": "STTR13/ziplist",
-        "summary": "List with a selected element that makes impossible state impossible.",
-        "license": "BSD-3-Clause",
-        "version": "1.4.2"
-    },
-    {
-        "name": "cuducos/elm-format-number",
-        "summary": "Format numbers as pretty strings",
-        "license": "BSD-3-Clause",
-        "version": "9.0.1"
-    }
-]
diff --git a/swh/lister/elm/tests/test_lister.py b/swh/lister/elm/tests/test_lister.py
index 88511bbb..e8cb6f50 100644
--- a/swh/lister/elm/tests/test_lister.py
+++ b/swh/lister/elm/tests/test_lister.py
@@ -5,10 +5,13 @@
 
 from swh.lister.elm.lister import ElmLister
 
-expected_origins = [
-    "https://github.com/STTR13/ziplist",
-    "https://github.com/elm/bytes",
-    "https://github.com/cuducos/elm-format-number",
+expected_origins_since_0 = [
+    "https://github.com/elm-toulouse/cbor",
+    "https://github.com/mercurymedia/elm-ag-grid",
+]
+
+expected_origins_since_3 = [
+    "https://github.com/miniBill/elm-avataaars",
 ]
 
 
@@ -17,10 +20,57 @@ def test_elm_lister(datadir, requests_mock_datadir, swh_scheduler):
     res = lister.run()
 
     assert res.pages == 1
-    assert res.origins == 1 + 1 + 1
+    # 2 of the 3 entries are related to the same package so the origins count is 2
+    assert res.origins == 2
+
+    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+
+    assert len(scheduler_origins) == len(expected_origins_since_0)
+    assert {
+        (
+            scheduled.visit_type,
+            scheduled.url,
+            scheduled.last_update,
+        )
+        for scheduled in scheduler_origins
+    } == {("git", expected, None) for expected in expected_origins_since_0}
+
+    # Check that all_packages_count is set
+    assert lister.state.all_packages_count == 3  # 3 entries
+
+
+def test_elm_lister_incremental(datadir, requests_mock_datadir, swh_scheduler):
+    # First run, since=0
+    lister = ElmLister(scheduler=swh_scheduler)
+    res = lister.run()
+
+    assert res.pages == 1
+    # 2 of the 3 entries are related to the same package so the origins count is 2
+    assert res.origins == 2
 
     scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
 
+    assert len(scheduler_origins) == len(expected_origins_since_0)
+    assert {
+        (
+            scheduled.visit_type,
+            scheduled.url,
+            scheduled.last_update,
+        )
+        for scheduled in scheduler_origins
+    } == {("git", expected, None) for expected in expected_origins_since_0}
+
+    # Check that all_packages_count is set
+    assert lister.state.all_packages_count == 3  # 3 entries
+
+    # Second run, since=3
+    lister = ElmLister(scheduler=swh_scheduler)
+    res = lister.run()
+    assert res.pages == 1
+    assert res.origins == 1
+
+    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+    expected_origins = expected_origins_since_0 + expected_origins_since_3
     assert len(scheduler_origins) == len(expected_origins)
     assert {
         (
@@ -30,3 +80,11 @@ def test_elm_lister(datadir, requests_mock_datadir, swh_scheduler):
         )
         for scheduled in scheduler_origins
     } == {("git", expected, None) for expected in expected_origins}
+    assert lister.state.all_packages_count == 4  # 4 entries
+
+    # Third run, since=4, nothing new
+    lister = ElmLister(scheduler=swh_scheduler)
+    res = lister.run()
+    assert res.pages == 1
+    assert res.origins == 0
+    assert lister.state.all_packages_count == 4  # 4 entries
-- 
GitLab