diff --git a/swh/lister/julia/__init__.py b/swh/lister/julia/__init__.py index 8388c0a2d26f78da7e7767b14a677df8cd29caed..73e5382a6480df7e889c04983f7ac3d39f919897 100644 --- a/swh/lister/julia/__init__.py +++ b/swh/lister/julia/__init__.py @@ -28,9 +28,9 @@ Origins retrieval strategy -------------------------- To build a list of origins we clone the `Julia General registry`_ Git repository, then -read the `Registry.toml`_ file to get the path to packages directories. -Each directory have a `Package.toml` file from where we get the Git repository url for -a package. +walk through commits with the help of `Dulwich`_ to detect commits related to a new package +or a new version of a package. For each of those commits we get the path to the `Package.toml` +file, from which we get the Git repository url for a package. Page listing ------------ @@ -40,7 +40,12 @@ There is only one page listing all origins url. Origins from page ----------------- -The lister is stateless and yields all origins url from one page. +The lister yields all origin urls from one page. + +Each time the lister is executed, the HEAD commit id of `Julia General registry`_ +is stored as ``state.last_seen_commit`` and used on the next run to retrieve new origins +since the last commit. + Each url corresponds to the Git url of the package repository. Running tests @@ -71,6 +76,7 @@ You can follow lister execution by displaying logs of swh-lister service:: .. _JuliaHub: https://juliahub.com/ .. _Julia Packages: https://julialang.org/packages/ .. _Registry.toml: https://github.com/JuliaRegistries/General/blob/master/Registry.toml +.. 
_Dulwich: https://www.dulwich.io/ """ # noqa: B950 diff --git a/swh/lister/julia/lister.py b/swh/lister/julia/lister.py index 214cbcef7c324a6e0799a18c69b304b3217decf7..a31c5cc4c59ec3ad60b70b4c5b95211952f69db6 100644 --- a/swh/lister/julia/lister.py +++ b/swh/lister/julia/lister.py @@ -3,27 +3,40 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from dataclasses import asdict, dataclass +import datetime import logging from pathlib import Path import shutil import tempfile -from typing import Any, Iterator, List, Optional, Tuple +from typing import Any, Dict, Iterator, Optional from dulwich import porcelain +from dulwich.repo import Repo +from dulwich.walk import WalkEntry +import iso8601 import toml from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin -from ..pattern import CredentialsType, StatelessLister +from ..pattern import CredentialsType, Lister logger = logging.getLogger(__name__) # Aliasing the page results returned by `get_pages` method from the lister. 
-JuliaListerPage = List[Tuple[str, Any]] +JuliaListerPage = Dict[str, Any] -class JuliaLister(StatelessLister[JuliaListerPage]): +@dataclass +class JuliaListerState: + """Store lister state for incremental mode operations""" + + last_seen_commit: Optional[str] = None + """Hash of the latest Git commit when lister was executed""" + + +class JuliaLister(Lister[JuliaListerState, JuliaListerPage]): """List Julia packages origins""" LISTER_NAME = "julia" @@ -34,7 +47,6 @@ class JuliaLister(StatelessLister[JuliaListerPage]): "https://github.com/JuliaRegistries/General.git" # Julia General Registry ) REPO_PATH = Path(tempfile.mkdtemp(), "General") - REGISTRY_PATH = REPO_PATH / "Registry.toml" def __init__( self, @@ -63,40 +75,111 @@ class JuliaLister(StatelessLister[JuliaListerPage]): except FileExistsError: porcelain.pull(self.REPO_PATH, remote_location=self.url) + def state_from_dict(self, d: Dict[str, Any]) -> JuliaListerState: + return JuliaListerState(**d) + + def state_to_dict(self, state: JuliaListerState) -> Dict[str, Any]: + return asdict(state) + + def get_origin_data(self, entry: WalkEntry) -> Dict[str, Any]: + """ + Given an entry object parse its commit message and other attributes + to detect if the commit is valid to describe a new package or + a new package version. 
+ + Returns a dict with origin url as key and iso8601 commit date as value + """ + assert entry + + if ( + entry.commit + and entry.changes() + and ( + entry.commit.message.startswith(b"New package: ") + or entry.commit.message.startswith(b"New version: ") + ) + ): + package_toml = None + for change in entry.changes(): + if change and hasattr(change, "new"): + if change.new.path.endswith(b"/Package.toml"): + package_toml = self.REPO_PATH / change.new.path.decode() + break + elif change.new.path.endswith(b"/Versions.toml"): + versions_path = self.REPO_PATH / change.new.path.decode() + if versions_path.exists(): + package_path, _ = change.new.path.decode().split( + "Versions.toml" + ) + package_toml = ( + self.REPO_PATH / package_path / "Package.toml" + ) + break + + if package_toml and package_toml.exists(): + origin = toml.load(package_toml)["repo"] + last_update = datetime.datetime.fromtimestamp( + entry.commit.commit_time, + tz=datetime.timezone.utc, + ).isoformat() + return {f"{origin}": last_update} + + return {} + def get_pages(self) -> Iterator[JuliaListerPage]: """Yield an iterator which returns 'page' - To build a list of origins the `Julia General registry` Git - repository is cloned to get a `Registry.toml` file, an index file of - packages directories. + To build a list of origins the ``Julia General registry`` Git + repository is cloned to look at commits history to discover new + package and new package versions. + + Depending on ``last_seen_commit`` state it initiate a commit walker + since the last time the lister has been executed. There is only one page that list all origins urls. 
""" + # Clone the repository self.get_registry_repository() - assert self.REGISTRY_PATH.exists() - registry = toml.load(self.REGISTRY_PATH) - yield registry["packages"].items() + assert self.REPO_PATH.exists() + + repo = Repo(str(self.REPO_PATH)) + + # Detect commits related to new package and new versions since last_seen_commit + if not self.state.last_seen_commit: + walker = repo.get_walker() + else: + last = repo[self.state.last_seen_commit.encode()] + walker = repo.get_walker(since=last.commit_time, exclude=[last.id]) + + assert walker + packages = {} + for entry in walker: + packages.update(self.get_origin_data(entry=entry)) + + yield packages def get_origins_from_page(self, page: JuliaListerPage) -> Iterator[ListedOrigin]: """Iterate on all pages and yield ListedOrigin instances - Each directory of the Git repository have a `Package.toml` file from - where we get the Git repository url for each package. + Each directory of the Git repository have a ``Package.toml`` file from + where we get the Git repository url as an origin for each package. 
""" assert self.lister_obj.id is not None - assert self.REPO_PATH.exists() - for uuid, info in page: - package_info_path = self.REPO_PATH / info["path"] / "Package.toml" - package_info = toml.load(package_info_path) + for origin, last_update in page.items(): + last_update = iso8601.parse_date(last_update) yield ListedOrigin( lister_id=self.lister_obj.id, visit_type=self.VISIT_TYPE, - url=package_info["repo"], - last_update=None, + url=origin, + last_update=last_update, ) def finalize(self) -> None: + # Get Git HEAD commit hash + repo = Repo(str(self.REPO_PATH)) + self.state.last_seen_commit = repo.head().decode("ascii") + self.updated = True # Rm tmp directory REPO_PATH if self.REPO_PATH.exists(): shutil.rmtree(self.REPO_PATH) diff --git a/swh/lister/julia/tests/data/fake-julia-registry-repository.tar.gz b/swh/lister/julia/tests/data/fake-julia-registry-repository.tar.gz deleted file mode 100644 index e79d4464de974285e883294c4349226726a91cb0..0000000000000000000000000000000000000000 Binary files a/swh/lister/julia/tests/data/fake-julia-registry-repository.tar.gz and /dev/null differ diff --git a/swh/lister/julia/tests/data/fake-julia-registry-repository_0.tar.gz b/swh/lister/julia/tests/data/fake-julia-registry-repository_0.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..458e4f1670b5ca6d9abff8b1cb7da305e1222783 Binary files /dev/null and b/swh/lister/julia/tests/data/fake-julia-registry-repository_0.tar.gz differ diff --git a/swh/lister/julia/tests/data/fake-julia-registry-repository_1.tar.gz b/swh/lister/julia/tests/data/fake-julia-registry-repository_1.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..59275dd83e8fa9158179e33bfb777eaf56e17c7a Binary files /dev/null and b/swh/lister/julia/tests/data/fake-julia-registry-repository_1.tar.gz differ diff --git a/swh/lister/julia/tests/data/fake_julia_registry_repository.sh b/swh/lister/julia/tests/data/fake_julia_registry_repository.sh index 
4dfbc222b5fc5d52064871e2311c95705aa85a43..5721d129c5db15209e2211083d576da9a0561d22 100755 --- a/swh/lister/julia/tests/data/fake_julia_registry_repository.sh +++ b/swh/lister/julia/tests/data/fake_julia_registry_repository.sh @@ -27,6 +27,9 @@ some amount of consideration when choosing package names. [packages]' > Registry.toml # Init as a git repository +# Force author and commit date to be the same +export GIT_AUTHOR_DATE='2001-01-01T17:18:19+00:00' +export GIT_COMMITTER_DATE=$GIT_AUTHOR_DATE git init git add . git commit -m "Init fake Julia registry repository for tests purpose" @@ -50,6 +53,8 @@ git-tree-sha1 = "65301af3ab06b04cf8a52cd43b06222bab5249c2" echo 'a3ea4736-0a3b-4c29-ac8a-20364318a635 = { name = "Fable", path = "F/Fable" }' >> Registry.toml +export GIT_AUTHOR_DATE='2001-01-02T17:18:19+00:00' +export GIT_COMMITTER_DATE=$GIT_AUTHOR_DATE git add . git commit -m "New package: Fable v0.0.2" @@ -132,16 +137,60 @@ git-tree-sha1 = "59619a31c56c9e61b5dabdbd339e30c227c5d13d" echo 'f1435218-dba5-11e9-1e4d-f1a5fab5fc13 = { name = "Oscar", path = "O/Oscar" }' >> Registry.toml +export GIT_AUTHOR_DATE='2001-01-03T17:18:19+00:00' +export GIT_COMMITTER_DATE=$GIT_AUTHOR_DATE git add . 
git commit -m "New package: Oscar v0.12.1" # Save some space rm .git/hooks/*.sample -# Archive +# First Archive cd ../ -tar -czf fake-julia-registry-repository.tar.gz General -mv fake-julia-registry-repository.tar.gz ../ +tar -czf fake-julia-registry-repository_0.tar.gz General +mv fake-julia-registry-repository_0.tar.gz ../ + +# Add some more commits and build a second archive for incremental tests purpose +cd General +echo ' + +["0.13.0"] +git-tree-sha1 = "c090495f818a063ed23d2d911fe74cc4358b5351" +' >> O/Oscar/Versions.toml + +# New version, replace previous uuid with a new one +sed -i -e 's/f1435218-dba5-11e9-1e4d-f1a5fab5fc13/a3ea4736-0a3b-4c29-ac8a-20364318a635/g' Registry.toml + +export GIT_AUTHOR_DATE='2001-01-04T17:18:19+00:00' +export GIT_COMMITTER_DATE=$GIT_AUTHOR_DATE +git add . +git commit -m "New version: Oscar v0.13.0" + +mkdir -p V/VulkanSpec + +touch V/VulkanSpec/Package.toml +touch V/VulkanSpec/Versions.toml + +echo 'name = "VulkanSpec" +uuid = "99a7788f-8f0f-454f-8f6c-c6cf389551ae" +repo = "https://github.com/serenity4/VulkanSpec.jl.git" +' > V/VulkanSpec/Package.toml + +echo '["0.1.0"] +git-tree-sha1 = "b5fef67130191c797007a1484f4dc6bfc840caa2" +' > V/VulkanSpec/Versions.toml + +echo '99a7788f-8f0f-454f-8f6c-c6cf389551ae = { name = "VulkanSpec", path = "V/VulkanSpec" }' >> Registry.toml + +export GIT_AUTHOR_DATE='2001-01-05T17:18:19+00:00' +export GIT_COMMITTER_DATE=$GIT_AUTHOR_DATE +git add . 
+git commit -m "New package: VulkanSpec v0.1.0" + +# Second Archive +cd ../ +tar -czf fake-julia-registry-repository_1.tar.gz General +mv fake-julia-registry-repository_1.tar.gz ../ # Clean up tmp_dir cd ../ diff --git a/swh/lister/julia/tests/test_lister.py b/swh/lister/julia/tests/test_lister.py index 6e5d2ea9bf54b96dd35678a5c979312c92b81a47..4febe1aea33b4a90d6c846872ea6c3c8a6a14611 100644 --- a/swh/lister/julia/tests/test_lister.py +++ b/swh/lister/julia/tests/test_lister.py @@ -5,17 +5,25 @@ from pathlib import Path +from dulwich import porcelain +import iso8601 + from swh.lister.julia.lister import JuliaLister from swh.lister.julia.tests import prepare_repository_from_archive -expected_origins = [ - "https://github.com/leios/Fable.jl.git", - "https://github.com/oscar-system/Oscar.jl.git", -] +expected_origins_0 = { + "https://github.com/leios/Fable.jl.git": "2001-01-02T17:18:19+00:00", + "https://github.com/oscar-system/Oscar.jl.git": "2001-01-03T17:18:19+00:00", +} + +expected_origins_1 = { + "https://github.com/oscar-system/Oscar.jl.git": "2001-01-04T17:18:19+00:00", + "https://github.com/serenity4/VulkanSpec.jl.git": "2001-01-05T17:18:19+00:00", +} def test_julia_get_registry_repository(datadir, tmp_path, swh_scheduler): - archive_path = Path(datadir, "fake-julia-registry-repository.tar.gz") + archive_path = Path(datadir, "fake-julia-registry-repository_0.tar.gz") repo_url = prepare_repository_from_archive(archive_path, "General", tmp_path) lister = JuliaLister(url=repo_url, scheduler=swh_scheduler) @@ -33,17 +41,18 @@ def test_julia_get_registry_repository(datadir, tmp_path, swh_scheduler): def test_julia_lister(datadir, tmp_path, swh_scheduler): - archive_path = Path(datadir, "fake-julia-registry-repository.tar.gz") + archive_path = Path(datadir, "fake-julia-registry-repository_0.tar.gz") repo_url = prepare_repository_from_archive(archive_path, "General", tmp_path) lister = JuliaLister(url=repo_url, scheduler=swh_scheduler) lister.REPO_PATH = 
Path(tmp_path, "General") lister.REGISTRY_PATH = lister.REPO_PATH / "Registry.toml" res = lister.run() - assert res.origins == 1 + 1 + assert res.origins == len(expected_origins_0) scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - assert len(scheduler_origins) == len(expected_origins) + assert len(scheduler_origins) == len(expected_origins_0) + assert { ( scheduled.visit_type, @@ -51,4 +60,106 @@ def test_julia_lister(datadir, tmp_path, swh_scheduler): scheduled.last_update, ) for scheduled in scheduler_origins - } == {("git", expected, None) for expected in expected_origins} + } == { + ("git", origin, iso8601.parse_date(last_update)) + for origin, last_update in expected_origins_0.items() + } + + +def test_julia_lister_incremental(datadir, tmp_path, swh_scheduler): + archive_path = Path(datadir, "fake-julia-registry-repository_0.tar.gz") + repo_url = prepare_repository_from_archive(archive_path, "General", tmp_path) + + # Prepare first run + lister = JuliaLister(url=repo_url, scheduler=swh_scheduler) + lister.REPO_PATH = Path(tmp_path, "General") + lister.REGISTRY_PATH = lister.REPO_PATH / "Registry.toml" + # Latest Git commit hash expected + with porcelain.open_repo_closing(lister.REPO_PATH) as r: + expected_last_seen_commit = r.head().decode("ascii") + + assert expected_last_seen_commit is not None + assert lister.state.last_seen_commit is None + + # First run + res = lister.run() + assert res.pages == 1 + assert res.origins == len(expected_origins_0) + assert lister.state.last_seen_commit == expected_last_seen_commit + + scheduler_origins_0 = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + assert len(scheduler_origins_0) == len(expected_origins_0) + assert { + ( + scheduled.visit_type, + scheduled.url, + scheduled.last_update, + ) + for scheduled in scheduler_origins_0 + } == { + ("git", origin, iso8601.parse_date(last_update)) + for origin, last_update in expected_origins_0.items() + } + + # Prepare second 
def test_julia_lister_incremental_no_changes(datadir, tmp_path, swh_scheduler):
    """Running the lister twice on an unchanged registry lists nothing new."""
    archive_path = Path(datadir, "fake-julia-registry-repository_0.tar.gz")
    repo_url = prepare_repository_from_archive(archive_path, "General", tmp_path)
    lister = JuliaLister(url=repo_url, scheduler=swh_scheduler)
    lister.REPO_PATH = Path(tmp_path, "General")

    # Latest Git commit hash expected
    with porcelain.open_repo_closing(lister.REPO_PATH) as r:
        expected_last_seen_commit = r.head().decode("ascii")

    assert expected_last_seen_commit is not None
    assert lister.state.last_seen_commit is None

    # First run
    res = lister.run()
    assert res.pages == 1
    assert res.origins == len(expected_origins_0)
    assert lister.state.last_seen_commit == expected_last_seen_commit

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    assert len(scheduler_origins) == len(expected_origins_0)

    # Prepare second run, repository state is the same as the one of the first run
    repo_url = prepare_repository_from_archive(archive_path, "General", tmp_path)
    lister = JuliaLister(url=repo_url, scheduler=swh_scheduler)
    assert lister.state.last_seen_commit == expected_last_seen_commit

    # Second run
    res = lister.run()
    assert lister.state.last_seen_commit == expected_last_seen_commit
    assert res.pages == 1
    # Nothing new
    assert res.origins == 0

    # Origins recorded by the first run are still there, nothing was added
    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    assert len(scheduler_origins) == len(expected_origins_0)