diff --git a/requirements-test.txt b/requirements-test.txt index 977c91ea6b07479005977da884843621f81dc4ab..8e1fd60cba203386047619fe128fda8a0a85a58c 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -5,3 +5,4 @@ requests_mock types-click types-pyyaml types-requests +types-toml diff --git a/requirements.txt b/requirements.txt index bf2beb6e8ec2a1958b4bb1553ceb64cb55d51337..bc64858a3ae18ac3054a29d1ba55b40caf05cc2c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,4 @@ rpy2 setuptools tenacity >= 6.2 testing.postgresql +toml diff --git a/setup.py b/setup.py index 49bcfdfdca26d61b78aa9c44ce568a6e3593a532..2a9c51f61c5611d0a4e78cfbae5890cedf5f7016 100755 --- a/setup.py +++ b/setup.py @@ -76,6 +76,7 @@ setup( lister.gogs=swh.lister.gogs:register lister.hackage=swh.lister.hackage:register lister.hex=swh.lister.hex:register + lister.julia=swh.lister.julia:register lister.launchpad=swh.lister.launchpad:register lister.nixguix=swh.lister.nixguix:register lister.npm=swh.lister.npm:register diff --git a/swh/lister/julia/__init__.py b/swh/lister/julia/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ed9c8584d5523e844d54951a76161af42247db09 --- /dev/null +++ b/swh/lister/julia/__init__.py @@ -0,0 +1,83 @@ +# Copyright (C) 2023 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +""" +Julia lister +============= + +`Julia`_ is a dynamic language for scientific computing applications. It comes with +an ecosystem of packages managed with its internal package manager `Pkg`_. + +A list of all officially registered packages can be found in the `Julia General Registry`_ +on GitHub, but it's easier to search for packages using the `JuliaHub`_ and +`Julia Packages`_ sites. 
+ +The `Julia`_ lister lists origins from a Git repository, the `Julia General registry`_. +The main `Registry.toml`_ file lists the available Julia packages. Each directory +matches a package name and has TOML files describing the package and its versions. + +Julia origins are Git repositories hosted on GitHub. Each repository must provide its +packaged releases using the GitHub release system. + +As of July 2023 the `Julia General registry`_ lists 9714 package names. + +Origins retrieving strategy +--------------------------- + +To build a list of origins we clone the `Julia General registry`_ Git repository, then +read the `Registry.toml`_ file to get the paths of the package directories. +Each directory has a `Package.toml` file from which we get the Git repository URL of +the package. + +Page listing +------------ + +There is only one page, listing all origin URLs. + +Origins from page +----------------- + +The lister is stateless and yields all origin URLs from one page. +Each URL corresponds to the Git URL of the package repository. + +Running tests +------------- + +Activate the virtualenv and run from within swh-lister directory:: + + pytest -s -vv --log-cli-level=DEBUG swh/lister/julia/tests + +Testing with Docker +------------------- + +Change directory to swh/docker then launch the docker environment:: + + docker compose up -d + +Then schedule a julia listing task:: + + docker compose exec swh-scheduler swh scheduler task add -p oneshot list-julia + +You can follow lister execution by displaying logs of swh-lister service:: + + docker compose logs -f swh-lister + +.. _Julia: https://julialang.org/ +.. _Pkg: https://docs.julialang.org/en/v1/stdlib/Pkg/ +.. _Julia General registry: https://github.com/JuliaRegistries/General +.. _JuliaHub: https://juliahub.com/ +.. _Julia Packages: https://julialang.org/packages/ +..
def register():
    """Describe the Julia lister plugin.

    Returns a mapping with the lister class under ``"lister"`` and, under
    ``"task_modules"``, a one-element list naming this package's ``tasks``
    module.
    """
    # Imported lazily so that importing the package does not pull in the
    # lister module.
    from .lister import JuliaLister

    tasks_module = f"{__name__}.tasks"
    return {"lister": JuliaLister, "task_modules": [tasks_module]}
class JuliaLister(StatelessLister[JuliaListerPage]):
    """List Julia package origins from the Julia General registry.

    The registry is a Git repository: it is cloned (or pulled when a checkout
    already exists) into ``REPO_PATH``. ``Registry.toml`` then gives the
    directory of every registered package, and the ``Package.toml`` file of
    each directory gives the Git url of the package repository.
    """

    LISTER_NAME = "julia"
    VISIT_TYPE = "git"  # Julia origins url are Git repositories
    INSTANCE = "julia"

    REPO_URL = (
        "https://github.com/JuliaRegistries/General.git"  # Julia General Registry
    )
    # NOTE(review): fixed location, shared by every lister instance running on
    # the same host; override REPO_PATH and REGISTRY_PATH on the instance when
    # isolation is needed.
    REPO_PATH = Path("/tmp/General")
    REGISTRY_PATH = REPO_PATH / "Registry.toml"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        credentials: Optional[CredentialsType] = None,
        url: Optional[str] = None,
        max_origins_per_page: Optional[int] = None,
        max_pages: Optional[int] = None,
        enable_origins: bool = True,
    ):
        """Set up the lister; ``url`` defaults to ``REPO_URL`` when not given."""
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            instance=self.INSTANCE,
            url=url or self.REPO_URL,
            max_origins_per_page=max_origins_per_page,
            max_pages=max_pages,
            enable_origins=enable_origins,
        )

    def get_registry_repository(self) -> None:
        """Get Julia General Registry Git repository up to date on disk.

        Pull from ``self.url`` into the existing checkout when ``REPO_PATH``
        exists, clone ``self.url`` into ``REPO_PATH`` otherwise.
        """
        if self.REPO_PATH.exists():
            porcelain.pull(self.REPO_PATH, remote_location=self.url)
        else:
            porcelain.clone(source=self.url, target=self.REPO_PATH)

    def get_pages(self) -> Iterator[JuliaListerPage]:
        """Yield the single page of this lister.

        The registry repository is first brought up to date on disk, then the
        ``packages`` table of ``Registry.toml`` is read. The page is the list
        of its ``(uuid, info)`` entries, where ``info["path"]`` is the package
        directory relative to the repository root.

        Raises:
            FileNotFoundError: if ``Registry.toml`` is missing from the checkout.
        """
        self.get_registry_repository()
        # Explicit check rather than ``assert``, which is stripped under -O.
        if not self.REGISTRY_PATH.exists():
            raise FileNotFoundError(
                f"Julia registry file not found: {self.REGISTRY_PATH}"
            )
        registry = toml.load(self.REGISTRY_PATH)
        # Materialize the dict view so the page matches JuliaListerPage.
        yield list(registry["packages"].items())

    def get_origins_from_page(self, page: JuliaListerPage) -> Iterator[ListedOrigin]:
        """Yield one ListedOrigin per package entry of ``page``.

        For each entry, the ``repo`` key of the package's ``Package.toml`` is
        used as origin url.

        Raises:
            FileNotFoundError: if the registry checkout is missing.
        """
        assert self.lister_obj.id is not None
        if not self.REPO_PATH.exists():
            raise FileNotFoundError(
                f"Julia registry repository not found: {self.REPO_PATH}"
            )

        for _uuid, info in page:
            package_info_path = self.REPO_PATH / info["path"] / "Package.toml"
            package_info = toml.load(package_info_path)
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                visit_type=self.VISIT_TYPE,
                url=package_info["repo"],
                last_update=None,
            )
def prepare_repository_from_archive(
    archive_path: Union[PosixPath, str],
    filename: Optional[str] = None,
    tmp_path: Union[PosixPath, str] = "/tmp",
) -> str:
    """Uncompress an existing archive and return a ``file://`` url for it.

    This does not deal with the case where the archive passed along does not
    exist.

    Args:
        archive_path: path of the tar archive to extract (string or path object)
        filename: name appended to ``tmp_path`` to build the url; defaults to
            the base name of ``archive_path``
        tmp_path: directory in which the archive is extracted

    Returns:
        The url ``file://<tmp_path>/<filename>``, usable as origin url.

    Raises:
        subprocess.CalledProcessError: if ``tar`` exits with a non-zero status.
    """
    target_dir = os.fspath(tmp_path)
    # uncompress folder/repositories/dump for the loader to ingest; the output
    # of tar is not needed, only its exit status
    subprocess.run(
        ["tar", "xf", os.fspath(archive_path), "-C", target_dir], check=True
    )
    # build the origin url (or some derivative form)
    repo_name = filename or os.path.basename(archive_path)
    return f"file://{target_dir}/{repo_name}"
#!/usr/bin/env bash

# Script to generate fake-julia-registry-repository.tar.gz
# Creates a git repository like https://github.com/JuliaRegistries/General.git
# for tests purposes.
#
# Everything is built in ./tmp_dir (relative to the current directory); the
# archive is left in the current directory and tmp_dir is removed at the end.

set -euo pipefail

# files and directories for Julia registry repository
mkdir -p tmp_dir/General/
cd tmp_dir/General/

# Quoted heredocs write the content verbatim (no escape or variable expansion).
cat > Registry.toml <<'EOF'
name = "General"
uuid = "23338594-aafe-5451-b93e-139f81909106"
repo = "https://github.com/JuliaRegistries/General.git"

description = """
Official general Julia package registry where people can
register any package they want without too much debate about
naming and without enforced standards on documentation or
testing. We nevertheless encourage documentation, testing and
some amount of consideration when choosing package names.
"""

[packages]
EOF

# Init as a git repository
git init
git add .
git commit -m "Init fake Julia registry repository for tests purpose"

# First package: Fable
mkdir -p F/Fable

cat > F/Fable/Package.toml <<'EOF'
name = "Fable"
uuid = "a3ea4736-0a3b-4c29-ac8a-20364318a635"
repo = "https://github.com/leios/Fable.jl.git"

EOF

cat > F/Fable/Versions.toml <<'EOF'
["0.0.1"]
git-tree-sha1 = "d98ef9a5309f0ec8caaf34bf4cefaf1f1ca525e8"

["0.0.2"]
git-tree-sha1 = "65301af3ab06b04cf8a52cd43b06222bab5249c2"

EOF

# Register the package in the [packages] table
echo 'a3ea4736-0a3b-4c29-ac8a-20364318a635 = { name = "Fable", path = "F/Fable" }' >> Registry.toml

git add .
git commit -m "New package: Fable v0.0.2"

# Second package: Oscar
mkdir -p O/Oscar

cat > O/Oscar/Package.toml <<'EOF'
name = "Oscar"
uuid = "f1435218-dba5-11e9-1e4d-f1a5fab5fc13"
repo = "https://github.com/oscar-system/Oscar.jl.git"

EOF

cat > O/Oscar/Versions.toml <<'EOF'
["0.2.0"]
git-tree-sha1 = "cda489ed50fbd625d245655ce6e5858c3c21ce12"

["0.3.0"]
git-tree-sha1 = "d62e911d06affb6450a0d059c3432df284a8e3c1"

["0.4.0"]
git-tree-sha1 = "91a9c623da588d5fcfc1f0ce0b3d57a0e35c65d2"

["0.5.0"]
git-tree-sha1 = "5d595e843a71df04da0e8027c4773a158be0c4f4"

["0.5.1"]
git-tree-sha1 = "501602b8c0efc9b4fc6a68d0cb53b9103f736313"

["0.5.2"]
git-tree-sha1 = "aa42d7bc3282e72b1b5c41d518661634cc454de0"

["0.6.0"]
git-tree-sha1 = "a3ca062f1e9ab1728de6af6812c1a09bb527e5ce"

["0.7.0"]
git-tree-sha1 = "185ce4c7b082bf3530940af4954642292da25ff9"

["0.7.1"]
git-tree-sha1 = "26815d2504820400189b2ba822bea2b4c81555d9"

["0.8.0"]
git-tree-sha1 = "25c9620ab9ee15e72b1fea5a903de51088185a7e"

["0.8.1"]
git-tree-sha1 = "53a5c754fbf891bc279040cfb9a2b85c03489f38"

["0.8.2"]
git-tree-sha1 = "cd7595c13e95d810bfd2dd3a96558fb8fd545470"

["0.9.0"]
git-tree-sha1 = "738574ad4cb14da838e3fa5a2bae0c84cca324ed"

["0.10.0"]
git-tree-sha1 = "79e850c5e047754e985c8e0a4220d6f7b1715999"

["0.10.1"]
git-tree-sha1 = "45a146665c899f358c5d24a1551fee8e710285a1"

["0.10.2"]
git-tree-sha1 = "0b127546fd5068de5d161c9ace299cbeb5b8c8b3"

["0.11.0"]
git-tree-sha1 = "001842c060d17eecae8070f8ba8e8163f760722f"

["0.11.1"]
git-tree-sha1 = "3309b97c9327617cd063cc1de5850dc13aad6007"

["0.11.2"]
git-tree-sha1 = "9c2873412042edb336c5347ffa7a9daf29264da8"

["0.11.3"]
git-tree-sha1 = "0c452a18943144989213e2042766371d49505b22"

["0.12.0"]
git-tree-sha1 = "7618e3ba2e9b2ea43ad5d2c809e726a8a9e6e7b1"

["0.12.1"]
git-tree-sha1 = "59619a31c56c9e61b5dabdbd339e30c227c5d13d"

EOF

echo 'f1435218-dba5-11e9-1e4d-f1a5fab5fc13 = { name = "Oscar", path = "O/Oscar" }' >> Registry.toml

git add .
git commit -m "New package: Oscar v0.12.1"

# Save some space; -f so that `set -e` does not abort the script when the git
# template installed no sample hooks
rm -f .git/hooks/*.sample

# Archive
cd ../
tar -czf fake-julia-registry-repository.tar.gz General
mv fake-julia-registry-repository.tar.gz ../

# Clean up tmp_dir
cd ../
rm -rf tmp_dir
def test_julia_lister(datadir, tmp_path, swh_scheduler):
    """Run the lister on the fake registry archive and check the listed origins."""
    registry_archive = Path(datadir, "fake-julia-registry-repository.tar.gz")
    registry_url = prepare_repository_from_archive(
        registry_archive, "General", tmp_path
    )

    lister = JuliaLister(url=registry_url, scheduler=swh_scheduler)
    # Point the lister at the per-test directory instead of its default path
    lister.REPO_PATH = Path(tmp_path, "General")
    lister.REGISTRY_PATH = lister.REPO_PATH / "Registry.toml"

    stats = lister.run()
    # One origin per package of the fake registry (Fable and Oscar)
    assert stats.origins == len(expected_origins)

    listed = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    assert len(listed) == len(expected_origins)

    found = {(origin.visit_type, origin.url, origin.last_update) for origin in listed}
    wanted = {("git", url, None) for url in expected_origins}
    assert found == wanted
def test_julia_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """The ping task of the julia lister answers "OK"."""
    async_result = swh_scheduler_celery_app.send_task("swh.lister.julia.tasks.ping")
    assert async_result
    async_result.wait()
    assert async_result.successful()
    assert async_result.result == "OK"


def test_julia_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
    """The lister task builds the lister from its config file, runs it once and
    returns the run statistics as a dict."""
    expected_stats = ListerStats(pages=42, origins=42)

    # setup the mocked JuliaLister
    mocked_lister = mocker.patch("swh.lister.julia.tasks.JuliaLister")
    mocked_lister.from_configfile.return_value = mocked_lister
    mocked_lister.run.return_value = expected_stats

    async_result = swh_scheduler_celery_app.send_task(
        "swh.lister.julia.tasks.JuliaListerTask"
    )
    assert async_result
    async_result.wait()
    assert async_result.successful()
    assert async_result.result == expected_stats.dict()

    mocked_lister.from_configfile.assert_called_once_with()
    mocked_lister.run.assert_called_once_with()