From 3a1beae36e43b55f4d388be5903a8feaf5722a50 Mon Sep 17 00:00:00 2001
From: Franck Bret <franck.bret@octobus.net>
Date: Mon, 24 Jul 2023 09:34:04 +0200
Subject: [PATCH] Elm Lister

The Elm Lister lists Elm package origins from the Elm
lang registry.
It uses an HTTP API endpoint to list package origins.
Origins are GitHub repositories; releases take advantage
of the GitHub release API.
---
 swh/lister/elm/__init__.py                    | 76 +++++++++++++++++++
 swh/lister/elm/lister.py                      | 74 ++++++++++++++++++
 swh/lister/elm/tasks.py                       | 19 +++++
 swh/lister/elm/tests/__init__.py              |  0
 .../https_package.elm-lang.org/search.json    | 20 +++++
 swh/lister/elm/tests/test_lister.py           | 32 ++++++++
 swh/lister/elm/tests/test_tasks.py            | 31 ++++++++
 7 files changed, 252 insertions(+)
 create mode 100644 swh/lister/elm/__init__.py
 create mode 100644 swh/lister/elm/lister.py
 create mode 100644 swh/lister/elm/tasks.py
 create mode 100644 swh/lister/elm/tests/__init__.py
 create mode 100644 swh/lister/elm/tests/data/https_package.elm-lang.org/search.json
 create mode 100644 swh/lister/elm/tests/test_lister.py
 create mode 100644 swh/lister/elm/tests/test_tasks.py

diff --git a/swh/lister/elm/__init__.py b/swh/lister/elm/__init__.py
new file mode 100644
index 00000000..614d8efd
--- /dev/null
+++ b/swh/lister/elm/__init__.py
@@ -0,0 +1,76 @@
+# Copyright (C) 2023  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+"""
+Elm lister
+==========
+
+`Elm`_ is a functional language that compiles to JavaScript.
+
+Additional packages for the language can be searched from the `Packages`_ website
+and installed with the `elm install`_ command. The Elm packages website also provides
+a `Http Api endpoint`_ listing all available packages.
+
+Elm origins are Git repositories hosted on Github. Each repository must provide its
+packaged releases using the Github release system.
+
+As of July 2023, `Packages`_ lists 1746 packages.
+
+Origins retrieving strategy
+---------------------------
+
+To build a list of origins we make a GET request to the `Http Api endpoint`_ that returns
+a JSON array of objects.
+The origin URL for each package is built from the corresponding `name` entry, which
+is the suffix of the GitHub repository URL (org/project_name).
+
+Page listing
+------------
+
+There is only one page listing all origin URLs.
+
+Origins from page
+-----------------
+
+The lister is stateless and yields all origin URLs from that single page, which is a
+list of package repository URLs.
+
+Running tests
+-------------
+
+Activate the virtualenv and run from within swh-lister directory::
+
+   pytest -s -vv --log-cli-level=DEBUG swh/lister/elm/tests
+
+Testing with Docker
+-------------------
+
+Change directory to swh/docker then launch the docker environment::
+
+   docker compose up -d
+
+Then schedule an Elm listing task::
+
+   docker compose exec swh-scheduler swh scheduler task add -p oneshot list-elm
+
+You can follow lister execution by displaying logs of swh-lister service::
+
+   docker compose logs -f swh-lister
+
+.. _Elm: https://elm-lang.org/
+.. _Packages: https://package.elm-lang.org/
+.. _elm install: https://guide.elm-lang.org/install/elm.html#elm-install
+.. _Http Api endpoint: https://package.elm-lang.org/search.json
+"""
+
+
+def register():
+    from .lister import ElmLister
+
+    return {
+        "lister": ElmLister,
+        "task_modules": ["%s.tasks" % __name__],
+    }
diff --git a/swh/lister/elm/lister.py b/swh/lister/elm/lister.py
new file mode 100644
index 00000000..83a1b84e
--- /dev/null
+++ b/swh/lister/elm/lister.py
@@ -0,0 +1,74 @@
+# Copyright (C) 2023  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import logging
+from typing import Any, Dict, Iterator, List, Optional
+
+from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.model import ListedOrigin
+
+from ..pattern import CredentialsType, StatelessLister
+
+logger = logging.getLogger(__name__)
+
+# Aliasing the page results returned by `get_pages` method from the lister.
+ElmListerPage = List[Dict[str, Any]]
+
+
+class ElmLister(StatelessLister[ElmListerPage]):
+    """List Elm packages origins"""
+
+    LISTER_NAME = "elm"
+    VISIT_TYPE = "git"  # Elm origins url are Git repositories
+    INSTANCE = "elm"
+
+    SEARCH_URL = "https://package.elm-lang.org/search.json"
+
+    REPO_URL_PATTERN = "https://github.com/{name}"
+
+    def __init__(
+        self,
+        scheduler: SchedulerInterface,
+        credentials: Optional[CredentialsType] = None,
+        max_origins_per_page: Optional[int] = None,
+        max_pages: Optional[int] = None,
+        enable_origins: bool = True,
+    ):
+        super().__init__(
+            scheduler=scheduler,
+            credentials=credentials,
+            instance=self.INSTANCE,
+            url=self.SEARCH_URL,
+            max_origins_per_page=max_origins_per_page,
+            max_pages=max_pages,
+            enable_origins=enable_origins,
+        )
+        self.session.headers.update({"Accept": "application/json"})
+
+    def get_pages(self) -> Iterator[ElmListerPage]:
+        """Yield the single page of results.
+
+        It uses the unique HTTP API endpoint `https://package.elm-lang.org/search.json`
+        to get a list of names corresponding to GitHub repository URL suffixes.
+
+        There is only one page, which lists all origin URLs.
+        """
+        response = self.http_request(self.url)
+        yield response.json()
+
+    def get_origins_from_page(self, page: ElmListerPage) -> Iterator[ListedOrigin]:
+        """Yield a ListedOrigin instance for each entry of the given page"""
+        assert self.lister_obj.id is not None
+
+        for entry in page:
+            name: str = entry["name"]
+            repo_url: str = self.REPO_URL_PATTERN.format(name=name)
+
+            yield ListedOrigin(
+                lister_id=self.lister_obj.id,
+                visit_type=self.VISIT_TYPE,
+                url=repo_url,
+                last_update=None,
+            )
diff --git a/swh/lister/elm/tasks.py b/swh/lister/elm/tasks.py
new file mode 100644
index 00000000..de9ff975
--- /dev/null
+++ b/swh/lister/elm/tasks.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2023  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from celery import shared_task
+
+from swh.lister.elm.lister import ElmLister
+
+
+@shared_task(name=__name__ + ".ElmListerTask")
+def list_elm(**lister_args):
+    """Lister task for Elm lang packages"""
+    return ElmLister.from_configfile(**lister_args).run().dict()
+
+
+@shared_task(name=__name__ + ".ping")
+def _ping():
+    return "OK"
diff --git a/swh/lister/elm/tests/__init__.py b/swh/lister/elm/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/swh/lister/elm/tests/data/https_package.elm-lang.org/search.json b/swh/lister/elm/tests/data/https_package.elm-lang.org/search.json
new file mode 100644
index 00000000..cdb20cec
--- /dev/null
+++ b/swh/lister/elm/tests/data/https_package.elm-lang.org/search.json
@@ -0,0 +1,20 @@
+[
+  {
+    "name": "elm/bytes",
+    "summary": "Work with sequences of bytes (a.k.a. ArrayBuffer, typed arrays, DataView)",
+    "license": "BSD-3-Clause",
+    "version": "1.0.8"
+  },
+  {
+    "name": "STTR13/ziplist",
+    "summary": "List with a selected element that makes impossible state impossible.",
+    "license": "BSD-3-Clause",
+    "version": "1.4.2"
+  },
+  {
+    "name": "cuducos/elm-format-number",
+    "summary": "Format numbers as pretty strings",
+    "license": "BSD-3-Clause",
+    "version": "9.0.1"
+  }
+]
diff --git a/swh/lister/elm/tests/test_lister.py b/swh/lister/elm/tests/test_lister.py
new file mode 100644
index 00000000..88511bbb
--- /dev/null
+++ b/swh/lister/elm/tests/test_lister.py
@@ -0,0 +1,32 @@
+# Copyright (C) 2023  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.elm.lister import ElmLister
+
+expected_origins = [
+    "https://github.com/STTR13/ziplist",
+    "https://github.com/elm/bytes",
+    "https://github.com/cuducos/elm-format-number",
+]
+
+
+def test_elm_lister(datadir, requests_mock_datadir, swh_scheduler):
+    lister = ElmLister(scheduler=swh_scheduler)
+    res = lister.run()
+
+    assert res.pages == 1
+    assert res.origins == 1 + 1 + 1
+
+    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+
+    assert len(scheduler_origins) == len(expected_origins)
+    assert {
+        (
+            scheduled.visit_type,
+            scheduled.url,
+            scheduled.last_update,
+        )
+        for scheduled in scheduler_origins
+    } == {("git", expected, None) for expected in expected_origins}
diff --git a/swh/lister/elm/tests/test_tasks.py b/swh/lister/elm/tests/test_tasks.py
new file mode 100644
index 00000000..cbea3220
--- /dev/null
+++ b/swh/lister/elm/tests/test_tasks.py
@@ -0,0 +1,31 @@
+# Copyright (C) 2023  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.lister.pattern import ListerStats
+
+
+def test_elm_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
+    res = swh_scheduler_celery_app.send_task("swh.lister.elm.tasks.ping")
+    assert res
+    res.wait()
+    assert res.successful()
+    assert res.result == "OK"
+
+
+def test_elm_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
+    # setup the mocked ElmLister
+    lister = mocker.patch("swh.lister.elm.tasks.ElmLister")
+    lister.from_configfile.return_value = lister
+    stats = ListerStats(pages=42, origins=42)
+    lister.run.return_value = stats
+
+    res = swh_scheduler_celery_app.send_task("swh.lister.elm.tasks.ElmListerTask")
+    assert res
+    res.wait()
+    assert res.successful()
+    assert res.result == stats.dict()
+
+    lister.from_configfile.assert_called_once_with()
+    lister.run.assert_called_once_with()
-- 
GitLab