From 82ee0951285db2642ed6e59eab81ed53d35b2e6c Mon Sep 17 00:00:00 2001
From: Franck Bret <franck.bret@octobus.net>
Date: Thu, 21 Dec 2023 09:54:29 +0100
Subject: [PATCH] Elm stateful lister

Use another API endpoint that lets the lister be stateful.
This API endpoint takes a ``since`` value that represents a
sequential index in the history.
The ``all_packages_count`` state stores a count which will be
used as the ``since`` argument on the next run.
---
 swh/lister/elm/__init__.py                    | 22 +++---
 swh/lister/elm/lister.py                      | 67 ++++++++++++++----
 .../all-packages_since_0                      |  1 +
 .../all-packages_since_3                      |  1 +
 .../all-packages_since_4                      |  1 +
 .../https_package.elm-lang.org/search.json    | 20 ------
 swh/lister/elm/tests/test_lister.py           | 68 +++++++++++++++++--
 7 files changed, 133 insertions(+), 47 deletions(-)
 create mode 100644 swh/lister/elm/tests/data/https_package.elm-lang.org/all-packages_since_0
 create mode 100644 swh/lister/elm/tests/data/https_package.elm-lang.org/all-packages_since_3
 create mode 100644 swh/lister/elm/tests/data/https_package.elm-lang.org/all-packages_since_4
 delete mode 100644 swh/lister/elm/tests/data/https_package.elm-lang.org/search.json

diff --git a/swh/lister/elm/__init__.py b/swh/lister/elm/__init__.py
index 614d8efd..53634f53 100644
--- a/swh/lister/elm/__init__.py
+++ b/swh/lister/elm/__init__.py
@@ -12,20 +12,24 @@ Elm lister
 
 Additional packages for the language can be searched from the `Packages`_ website
 and installed with `elm install`_ command. The Elm packages website also provides a
-`Http Api endpoint`_ listing all available packages.
+`Http Api endpoint`_ listing all package versions released since a given count
+of package versions.
 
-Elm origins are Git repositories hosted on Github. Each repository must provide its
-packaged releases using the Github release system.
+Elm origins are Git repositories hosted on GitHub. Each repository must provide its
+packaged releases using the GitHub release system.
 
 As of July 2023 `Packages`_ list 1746 packages.
 
 Origins retrieving strategy
 ---------------------------
 
-To build a list of origins we make a GET request to the `Http Api endpoint`_ that returns
-a Json array of objects.
+To build a list of origins we make a GET request to the `Http Api endpoint`_ with a
+``since`` argument, a sequential index in the history, which returns a JSON array
+of strings.
+Each string represents a new version of a package. The string is split to get the
+``name`` of the package.
 The origin url for each package is constructed with the information of corresponding
-`name` entry which represents the suffix of Github repositories (org/project_name).
+``name`` entry which represents the suffix of GitHub repositories (*org*/*project_name*).
 
 Page listing
 ------------
@@ -35,8 +39,8 @@ There is only one page listing all origins url.
 Origins from page
 -----------------
 
-The lister is stateless and yields all origins url from one page. It is a list of package
-repository url.
+The lister is stateful and yields all new origin urls from one page since the last run.
+It is a list of package repository urls.
 
 Running tests
 -------------
@@ -63,7 +67,7 @@ You can follow lister execution by displaying logs of swh-lister service::
 .. _Elm: https://elm-lang.org/
 .. _Packages: https://package.elm-lang.org/
 .. _elm install: https://guide.elm-lang.org/install/elm.html#elm-install
-.. _Http Api endpoint: https://package.elm-lang.org/search.json
+.. _Http Api endpoint: https://package.elm-lang.org/all-packages/since/5000
 """
 
 
diff --git a/swh/lister/elm/lister.py b/swh/lister/elm/lister.py
index 545aad8a..d7fe6f57 100644
--- a/swh/lister/elm/lister.py
+++ b/swh/lister/elm/lister.py
@@ -3,36 +3,47 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+from dataclasses import asdict, dataclass
 import logging
-from typing import Any, Dict, Iterator, List, Optional
+from typing import Any, Dict, Iterator, Optional, Set
 
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
-from ..pattern import CredentialsType, StatelessLister
+from ..pattern import CredentialsType, Lister
 
 logger = logging.getLogger(__name__)
 
 # Aliasing the page results returned by `get_pages` method from the lister.
-ElmListerPage = List[Dict[str, Any]]
+ElmListerPage = Set[str]
 
 
-class ElmLister(StatelessLister[ElmListerPage]):
+@dataclass
+class ElmListerState:
+    """Store lister state for incremental mode operations"""
+
+    all_packages_count: Optional[int] = None
+    """Store the count of all listed package versions, used as ``since`` argument of
+    the API endpoint url.
+    """
+
+
+class ElmLister(Lister[ElmListerState, ElmListerPage]):
     """List Elm packages origins"""
 
     LISTER_NAME = "elm"
     VISIT_TYPE = "git"  # Elm origins url are Git repositories
     INSTANCE = "elm"
 
-    SEARCH_URL = "https://package.elm-lang.org/search.json"
-
+    BASE_URL = "https://package.elm-lang.org"
+    ALL_PACKAGES_URL_PATTERN = "{base_url}/all-packages/since/{since}"
     REPO_URL_PATTERN = "https://github.com/{name}"
 
     def __init__(
         self,
         scheduler: SchedulerInterface,
         credentials: Optional[CredentialsType] = None,
-        url: str = SEARCH_URL,
+        url: str = BASE_URL,
         instance: str = INSTANCE,
         max_origins_per_page: Optional[int] = None,
         max_pages: Optional[int] = None,
@@ -47,25 +58,47 @@ class ElmLister(StatelessLister[ElmListerPage]):
             max_pages=max_pages,
             enable_origins=enable_origins,
         )
+        self.all_packages_count: int = 0
         self.session.headers.update({"Accept": "application/json"})
 
+    def state_from_dict(self, d: Dict[str, Any]) -> ElmListerState:
+        return ElmListerState(**d)
+
+    def state_to_dict(self, state: ElmListerState) -> Dict[str, Any]:
+        return asdict(state)
+
     def get_pages(self) -> Iterator[ElmListerPage]:
         """Yield an iterator which returns 'page'
 
-        It uses the unique Http api endpoint `https://package.elm-lang.org/search.json`
-        to get a list of names corresponding to Github repository url suffixes.
+        It uses the HTTP API endpoint ``https://package.elm-lang.org/all-packages/since/:since``
+        to get a list of package versions from which we get names corresponding to GitHub
+        repository url suffixes.
 
         There is only one page that list all origins urls.
         """
-        response = self.http_request(self.url)
-        yield response.json()
+
+        if not self.state.all_packages_count:
+            since = 0
+        else:
+            since = self.state.all_packages_count
+
+        response = self.http_request(
+            self.ALL_PACKAGES_URL_PATTERN.format(base_url=self.url, since=since)
+        )
+        # We'll save this to the state in finalize()
+        self.all_packages_count = len(response.json()) + since
+
+        res = set()
+        for entry in response.json():
+            res.add(entry.split("@")[0])
+
+        yield res
 
     def get_origins_from_page(self, page: ElmListerPage) -> Iterator[ListedOrigin]:
         """Iterate on all pages and yield ListedOrigin instances"""
         assert self.lister_obj.id is not None
 
-        for entry in page:
-            name: str = entry["name"]
+        for name in page:
             repo_url: str = self.REPO_URL_PATTERN.format(name=name)
 
             yield ListedOrigin(
@@ -74,3 +107,11 @@ class ElmLister(StatelessLister[ElmListerPage]):
                 url=repo_url,
                 last_update=None,
             )
+
+    def finalize(self) -> None:
+        if (
+            self.state.all_packages_count is None
+            or self.all_packages_count > self.state.all_packages_count
+        ):
+            self.state.all_packages_count = self.all_packages_count
+            self.updated = True
diff --git a/swh/lister/elm/tests/data/https_package.elm-lang.org/all-packages_since_0 b/swh/lister/elm/tests/data/https_package.elm-lang.org/all-packages_since_0
new file mode 100644
index 00000000..fad5548f
--- /dev/null
+++ b/swh/lister/elm/tests/data/https_package.elm-lang.org/all-packages_since_0
@@ -0,0 +1 @@
+["mercurymedia/elm-ag-grid@20.0.0","elm-toulouse/cbor@3.4.0","elm-toulouse/cbor@3.3.0"]
diff --git a/swh/lister/elm/tests/data/https_package.elm-lang.org/all-packages_since_3 b/swh/lister/elm/tests/data/https_package.elm-lang.org/all-packages_since_3
new file mode 100644
index 00000000..9b21f025
--- /dev/null
+++ b/swh/lister/elm/tests/data/https_package.elm-lang.org/all-packages_since_3
@@ -0,0 +1 @@
+["miniBill/elm-avataaars@1.1.1"]
diff --git a/swh/lister/elm/tests/data/https_package.elm-lang.org/all-packages_since_4 b/swh/lister/elm/tests/data/https_package.elm-lang.org/all-packages_since_4
new file mode 100644
index 00000000..fe51488c
--- /dev/null
+++ b/swh/lister/elm/tests/data/https_package.elm-lang.org/all-packages_since_4
@@ -0,0 +1 @@
+[]
diff --git a/swh/lister/elm/tests/data/https_package.elm-lang.org/search.json b/swh/lister/elm/tests/data/https_package.elm-lang.org/search.json
deleted file mode 100644
index cdb20cec..00000000
--- a/swh/lister/elm/tests/data/https_package.elm-lang.org/search.json
+++ /dev/null
@@ -1,20 +0,0 @@
-[
-  {
-    "name": "elm/bytes",
-    "summary": "Work with sequences of bytes (a.k.a. ArrayBuffer, typed arrays, DataView)",
-    "license": "BSD-3-Clause",
-    "version": "1.0.8"
-  },
-  {
-    "name": "STTR13/ziplist",
-    "summary": "List with a selected element that makes impossible state impossible.",
-    "license": "BSD-3-Clause",
-    "version": "1.4.2"
-  },
-  {
-    "name": "cuducos/elm-format-number",
-    "summary": "Format numbers as pretty strings",
-    "license": "BSD-3-Clause",
-    "version": "9.0.1"
-  }
-]
diff --git a/swh/lister/elm/tests/test_lister.py b/swh/lister/elm/tests/test_lister.py
index 88511bbb..e8cb6f50 100644
--- a/swh/lister/elm/tests/test_lister.py
+++ b/swh/lister/elm/tests/test_lister.py
@@ -5,10 +5,13 @@
 
 from swh.lister.elm.lister import ElmLister
 
-expected_origins = [
-    "https://github.com/STTR13/ziplist",
-    "https://github.com/elm/bytes",
-    "https://github.com/cuducos/elm-format-number",
+expected_origins_since_0 = [
+    "https://github.com/elm-toulouse/cbor",
+    "https://github.com/mercurymedia/elm-ag-grid",
+]
+
+expected_origins_since_3 = [
+    "https://github.com/miniBill/elm-avataaars",
 ]
 
 
@@ -17,10 +20,57 @@ def test_elm_lister(datadir, requests_mock_datadir, swh_scheduler):
     res = lister.run()
 
     assert res.pages == 1
-    assert res.origins == 1 + 1 + 1
+    # 2 of the 3 entries are related to the same package so the origins count is 2
+    assert res.origins == 2
+
+    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+
+    assert len(scheduler_origins) == len(expected_origins_since_0)
+    assert {
+        (
+            scheduled.visit_type,
+            scheduled.url,
+            scheduled.last_update,
+        )
+        for scheduled in scheduler_origins
+    } == {("git", expected, None) for expected in expected_origins_since_0}
+
+    # Check that all_packages_count is set
+    assert lister.state.all_packages_count == 3  # 3 entries
+
+
+def test_elm_lister_incremental(datadir, requests_mock_datadir, swh_scheduler):
+    # First run, since=0
+    lister = ElmLister(scheduler=swh_scheduler)
+    res = lister.run()
+
+    assert res.pages == 1
+    # 2 of the 3 entries are related to the same package so the origins count is 2
+    assert res.origins == 2
 
     scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
 
+    assert len(scheduler_origins) == len(expected_origins_since_0)
+    assert {
+        (
+            scheduled.visit_type,
+            scheduled.url,
+            scheduled.last_update,
+        )
+        for scheduled in scheduler_origins
+    } == {("git", expected, None) for expected in expected_origins_since_0}
+
+    # Check that all_packages_count is set
+    assert lister.state.all_packages_count == 3  # 3 entries
+
+    # Second run, since=3
+    lister = ElmLister(scheduler=swh_scheduler)
+    res = lister.run()
+    assert res.pages == 1
+    assert res.origins == 1
+
+    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+    expected_origins = expected_origins_since_0 + expected_origins_since_3
     assert len(scheduler_origins) == len(expected_origins)
     assert {
         (
@@ -30,3 +80,11 @@ def test_elm_lister(datadir, requests_mock_datadir, swh_scheduler):
         )
         for scheduled in scheduler_origins
     } == {("git", expected, None) for expected in expected_origins}
+    assert lister.state.all_packages_count == 4  # 4 entries
+
+    # Third run, since=4, nothing new
+    lister = ElmLister(scheduler=swh_scheduler)
+    res = lister.run()
+    assert res.pages == 1
+    assert res.origins == 0
+    assert lister.state.all_packages_count == 4  # 4 entries
-- 
GitLab