Skip to content
Snippets Groups Projects
Commit 99bbd9d6 authored by Franck Bret's avatar Franck Bret
Browse files

Stateful Julia lister

Add a state to the lister to store the ``last_seen_commit`` as a Git
commit hash.

Use Dulwich to retrieve a Git commit walker since
``last_seen_commit`` if any.
For each commit, detect whether it is a new package or a new package
version commit and return its origin with the commit date as
last_update.
parent 053f0a93
No related branches found
No related tags found
1 merge request!507Stateful Julia lister
Pipeline #6274 passed
......@@ -28,9 +28,9 @@ Origins retrieval strategy
--------------------------
To build a list of origins we clone the `Julia General registry`_ Git repository, then
read the `Registry.toml`_ file to get the path to packages directories.
Each directory have a `Package.toml` file from where we get the Git repository url for
a package.
walk through commits with the help of `Dulwich`_ to detect commits related to a new package
or a new version of a package. For each of those commits we get the path to the `Package.toml`
file, from which we get the Git repository url of the package.
Page listing
------------
......@@ -40,7 +40,12 @@ There is only one page listing all origins url.
Origins from page
-----------------
The lister is stateless and yields all origins url from one page.
The lister yields all origin urls from one page.
Each time the lister is executed, the HEAD commit id of `Julia General registry`_
is stored as ``state.last_seen_commit`` and used on the next run to retrieve new origins
since that commit.
Each url corresponds to the Git url of the package repository.
Running tests
......@@ -71,6 +76,7 @@ You can follow lister execution by displaying logs of swh-lister service::
.. _JuliaHub: https://juliahub.com/
.. _Julia Packages: https://julialang.org/packages/
.. _Registry.toml: https://github.com/JuliaRegistries/General/blob/master/Registry.toml
.. _Dulwich: https://www.dulwich.io/
""" # noqa: B950
......
......@@ -3,27 +3,40 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from dataclasses import asdict, dataclass
import datetime
import logging
from pathlib import Path
import shutil
import tempfile
from typing import Any, Iterator, List, Optional, Tuple
from typing import Any, Dict, Iterator, Optional
from dulwich import porcelain
from dulwich.repo import Repo
from dulwich.walk import WalkEntry
import iso8601
import toml
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from ..pattern import CredentialsType, StatelessLister
from ..pattern import CredentialsType, Lister
logger = logging.getLogger(__name__)
# Aliasing the page results returned by `get_pages` method from the lister.
JuliaListerPage = List[Tuple[str, Any]]
JuliaListerPage = Dict[str, Any]
class JuliaLister(StatelessLister[JuliaListerPage]):
@dataclass
class JuliaListerState:
    """State persisted between runs so the lister can work incrementally"""

    # Hash of the latest Git commit seen when the lister was last executed;
    # defaults to ``None`` when no commit has been recorded.
    last_seen_commit: Optional[str] = None
class JuliaLister(Lister[JuliaListerState, JuliaListerPage]):
"""List Julia packages origins"""
LISTER_NAME = "julia"
......@@ -34,7 +47,6 @@ class JuliaLister(StatelessLister[JuliaListerPage]):
"https://github.com/JuliaRegistries/General.git" # Julia General Registry
)
REPO_PATH = Path(tempfile.mkdtemp(), "General")
REGISTRY_PATH = REPO_PATH / "Registry.toml"
def __init__(
self,
......@@ -63,40 +75,111 @@ class JuliaLister(StatelessLister[JuliaListerPage]):
except FileExistsError:
porcelain.pull(self.REPO_PATH, remote_location=self.url)
def state_from_dict(self, d: Dict[str, Any]) -> JuliaListerState:
    """Build a ``JuliaListerState`` from its dict form

    The keys of ``d`` are passed as keyword arguments to the dataclass
    constructor.
    """
    state = JuliaListerState(**d)
    return state
def state_to_dict(self, state: JuliaListerState) -> Dict[str, Any]:
    """Convert the lister state dataclass into a plain dict"""
    serialized = asdict(state)
    return serialized
def get_origin_data(self, entry: WalkEntry) -> Dict[str, Any]:
    """
    Given a walker entry, inspect its commit message and changed paths to
    detect whether the commit registers a new package or a new package version.

    Only commits whose message starts with ``New package: `` or
    ``New version: `` are considered. The package is located through the
    first changed ``Package.toml`` or ``Versions.toml`` path, and its ``repo``
    value is read from the ``Package.toml`` file found under ``REPO_PATH``.

    Returns a dict with the origin url as key and the iso8601 commit date
    (UTC) as value, or an empty dict when the commit is not relevant or the
    package cannot be resolved.
    """
    assert entry

    commit = entry.commit
    if not commit or not commit.message.startswith(
        (b"New package: ", b"New version: ")
    ):
        return {}

    package_toml = None
    # Compute the tree changes only once, it can be costly on a large registry
    for change in entry.changes():
        # Merge commits yield lists of changes (no ``new`` attribute) and the
        # new side of a change may carry no path (e.g. a deleted file):
        # neither can point to a package description, skip them.
        new_path = getattr(getattr(change, "new", None), "path", None)
        if not new_path:
            continue
        if new_path.endswith(b"/Package.toml"):
            package_toml = self.REPO_PATH / new_path.decode()
            break
        if new_path.endswith(b"/Versions.toml"):
            versions_path = self.REPO_PATH / new_path.decode()
            if versions_path.exists():
                # Package.toml lives next to Versions.toml
                package_toml = versions_path.parent / "Package.toml"
                break

    if package_toml is None or not package_toml.exists():
        return {}

    origin = toml.load(package_toml).get("repo")
    if not origin:
        # A package description without repository url can not be listed
        logger.warning("No 'repo' entry found in %s", package_toml)
        return {}

    last_update = datetime.datetime.fromtimestamp(
        commit.commit_time,
        tz=datetime.timezone.utc,
    ).isoformat()
    return {f"{origin}": last_update}
def get_pages(self) -> Iterator[JuliaListerPage]:
    """Yield an iterator which returns 'page'

    To build a list of origins the ``Julia General registry`` Git
    repository is cloned and its commit history is inspected to discover
    new packages and new package versions.

    Depending on the ``last_seen_commit`` state, the commit walker either
    covers the whole history or only the commits added since the last time
    the lister was executed.

    There is only one page, listing all origin urls. When several commits
    relate to the same origin, the most recent commit date is kept.
    """
    # Clone the repository
    self.get_registry_repository()
    assert self.REPO_PATH.exists()

    packages: Dict[str, Any] = {}
    # Make sure files opened by the repository are released once walked
    with porcelain.open_repo_closing(str(self.REPO_PATH)) as repo:
        # Detect commits related to new package and new versions since
        # last_seen_commit
        if not self.state.last_seen_commit:
            walker = repo.get_walker()
        else:
            last = repo[self.state.last_seen_commit.encode()]
            walker = repo.get_walker(since=last.commit_time, exclude=[last.id])

        for entry in walker:
            for origin, last_update in self.get_origin_data(entry=entry).items():
                # Dates are UTC isoformat strings of identical shape, so they
                # compare chronologically. Keeping the greatest one avoids an
                # older commit overriding the date of a newer one, whatever
                # the order in which the walker yields commits.
                known = packages.get(origin)
                if known is None or last_update > known:
                    packages[origin] = last_update

    yield packages
def get_origins_from_page(self, page: JuliaListerPage) -> Iterator[ListedOrigin]:
    """Turn the page of results into ``ListedOrigin`` instances

    ``page`` maps each package Git repository url to an ISO 8601 date string;
    the url is used as origin url and the parsed date as ``last_update``.
    """
    assert self.lister_obj.id is not None
    assert self.REPO_PATH.exists()

    lister_id = self.lister_obj.id
    for url, commit_date in page.items():
        yield ListedOrigin(
            lister_id=lister_id,
            visit_type=self.VISIT_TYPE,
            url=url,
            last_update=iso8601.parse_date(commit_date),
        )
def finalize(self) -> None:
    """Record the registry HEAD commit in the lister state, then clean up

    The HEAD commit hash of the repository at ``REPO_PATH`` is stored as
    ``state.last_seen_commit`` and the lister is flagged as updated. The
    ``REPO_PATH`` directory is removed afterwards.
    """
    # Get Git HEAD commit hash; close the repository before its directory
    # gets deleted below
    with porcelain.open_repo_closing(str(self.REPO_PATH)) as repo:
        self.state.last_seen_commit = repo.head().decode("ascii")

    self.updated = True

    # Rm tmp directory REPO_PATH
    if self.REPO_PATH.exists():
        shutil.rmtree(self.REPO_PATH)
......
File deleted
File added
File added
......@@ -27,6 +27,9 @@ some amount of consideration when choosing package names.
[packages]' > Registry.toml
# Init as a git repository
# Force author and commit date to be the same
export GIT_AUTHOR_DATE='2001-01-01T17:18:19+00:00'
export GIT_COMMITTER_DATE=$GIT_AUTHOR_DATE
git init
git add .
git commit -m "Init fake Julia registry repository for tests purpose"
......@@ -50,6 +53,8 @@ git-tree-sha1 = "65301af3ab06b04cf8a52cd43b06222bab5249c2"
echo 'a3ea4736-0a3b-4c29-ac8a-20364318a635 = { name = "Fable", path = "F/Fable" }' >> Registry.toml
export GIT_AUTHOR_DATE='2001-01-02T17:18:19+00:00'
export GIT_COMMITTER_DATE=$GIT_AUTHOR_DATE
git add .
git commit -m "New package: Fable v0.0.2"
......@@ -132,16 +137,60 @@ git-tree-sha1 = "59619a31c56c9e61b5dabdbd339e30c227c5d13d"
echo 'f1435218-dba5-11e9-1e4d-f1a5fab5fc13 = { name = "Oscar", path = "O/Oscar" }' >> Registry.toml
export GIT_AUTHOR_DATE='2001-01-03T17:18:19+00:00'
export GIT_COMMITTER_DATE=$GIT_AUTHOR_DATE
git add .
git commit -m "New package: Oscar v0.12.1"
# Save some space
rm .git/hooks/*.sample
# Archive
# First Archive
cd ../
tar -czf fake-julia-registry-repository.tar.gz General
mv fake-julia-registry-repository.tar.gz ../
tar -czf fake-julia-registry-repository_0.tar.gz General
mv fake-julia-registry-repository_0.tar.gz ../
# Add some more commits and build a second archive for incremental tests purpose
cd General
echo '
["0.13.0"]
git-tree-sha1 = "c090495f818a063ed23d2d911fe74cc4358b5351"
' >> O/Oscar/Versions.toml
# New version, replace previous uuid with a new one
sed -i -e 's/f1435218-dba5-11e9-1e4d-f1a5fab5fc13/a3ea4736-0a3b-4c29-ac8a-20364318a635/g' Registry.toml
export GIT_AUTHOR_DATE='2001-01-04T17:18:19+00:00'
export GIT_COMMITTER_DATE=$GIT_AUTHOR_DATE
git add .
git commit -m "New version: Oscar v0.13.0"
mkdir -p V/VulkanSpec
touch V/VulkanSpec/Package.toml
touch V/VulkanSpec/Versions.toml
echo 'name = "VulkanSpec"
uuid = "99a7788f-8f0f-454f-8f6c-c6cf389551ae"
repo = "https://github.com/serenity4/VulkanSpec.jl.git"
' > V/VulkanSpec/Package.toml
echo '["0.1.0"]
git-tree-sha1 = "b5fef67130191c797007a1484f4dc6bfc840caa2"
' > V/VulkanSpec/Versions.toml
echo '99a7788f-8f0f-454f-8f6c-c6cf389551ae = { name = "VulkanSpec", path = "V/VulkanSpec" }' >> Registry.toml
export GIT_AUTHOR_DATE='2001-01-05T17:18:19+00:00'
export GIT_COMMITTER_DATE=$GIT_AUTHOR_DATE
git add .
git commit -m "New package: VulkanSpec v0.1.0"
# Second Archive
cd ../
tar -czf fake-julia-registry-repository_1.tar.gz General
mv fake-julia-registry-repository_1.tar.gz ../
# Clean up tmp_dir
cd ../
......
......@@ -5,17 +5,25 @@
from pathlib import Path
from dulwich import porcelain
import iso8601
from swh.lister.julia.lister import JuliaLister
from swh.lister.julia.tests import prepare_repository_from_archive
expected_origins = [
"https://github.com/leios/Fable.jl.git",
"https://github.com/oscar-system/Oscar.jl.git",
]
expected_origins_0 = {
"https://github.com/leios/Fable.jl.git": "2001-01-02T17:18:19+00:00",
"https://github.com/oscar-system/Oscar.jl.git": "2001-01-03T17:18:19+00:00",
}
expected_origins_1 = {
"https://github.com/oscar-system/Oscar.jl.git": "2001-01-04T17:18:19+00:00",
"https://github.com/serenity4/VulkanSpec.jl.git": "2001-01-05T17:18:19+00:00",
}
def test_julia_get_registry_repository(datadir, tmp_path, swh_scheduler):
archive_path = Path(datadir, "fake-julia-registry-repository.tar.gz")
archive_path = Path(datadir, "fake-julia-registry-repository_0.tar.gz")
repo_url = prepare_repository_from_archive(archive_path, "General", tmp_path)
lister = JuliaLister(url=repo_url, scheduler=swh_scheduler)
......@@ -33,17 +41,18 @@ def test_julia_get_registry_repository(datadir, tmp_path, swh_scheduler):
def test_julia_lister(datadir, tmp_path, swh_scheduler):
archive_path = Path(datadir, "fake-julia-registry-repository.tar.gz")
archive_path = Path(datadir, "fake-julia-registry-repository_0.tar.gz")
repo_url = prepare_repository_from_archive(archive_path, "General", tmp_path)
lister = JuliaLister(url=repo_url, scheduler=swh_scheduler)
lister.REPO_PATH = Path(tmp_path, "General")
lister.REGISTRY_PATH = lister.REPO_PATH / "Registry.toml"
res = lister.run()
assert res.origins == 1 + 1
assert res.origins == len(expected_origins_0)
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
assert len(scheduler_origins) == len(expected_origins)
assert len(scheduler_origins) == len(expected_origins_0)
assert {
(
scheduled.visit_type,
......@@ -51,4 +60,106 @@ def test_julia_lister(datadir, tmp_path, swh_scheduler):
scheduled.last_update,
)
for scheduled in scheduler_origins
} == {("git", expected, None) for expected in expected_origins}
} == {
("git", origin, iso8601.parse_date(last_update))
for origin, last_update in expected_origins_0.items()
}
def test_julia_lister_incremental(datadir, tmp_path, swh_scheduler):
    """Run the lister on a first archive, then on a second one holding two
    more commits, and check state and listed origins after each run."""
    first_archive = Path(datadir, "fake-julia-registry-repository_0.tar.gz")
    repo_url = prepare_repository_from_archive(first_archive, "General", tmp_path)

    # Prepare first run
    lister = JuliaLister(url=repo_url, scheduler=swh_scheduler)
    lister.REPO_PATH = Path(tmp_path, "General")
    lister.REGISTRY_PATH = lister.REPO_PATH / "Registry.toml"

    # Latest Git commit hash expected
    with porcelain.open_repo_closing(lister.REPO_PATH) as first_repo:
        expected_last_seen_commit = first_repo.head().decode("ascii")

    assert expected_last_seen_commit is not None
    assert lister.state.last_seen_commit is None

    # First run
    first_stats = lister.run()
    assert first_stats.pages == 1
    assert first_stats.origins == len(expected_origins_0)
    assert lister.state.last_seen_commit == expected_last_seen_commit

    scheduler_origins_0 = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    assert len(scheduler_origins_0) == len(expected_origins_0)

    listed = {
        (scheduled.visit_type, scheduled.url, scheduled.last_update)
        for scheduled in scheduler_origins_0
    }
    wanted = {
        ("git", origin, iso8601.parse_date(last_update))
        for origin, last_update in expected_origins_0.items()
    }
    assert listed == wanted

    # Prepare second run
    second_archive = Path(datadir, "fake-julia-registry-repository_1.tar.gz")
    repo_url = prepare_repository_from_archive(second_archive, "General", tmp_path)

    lister = JuliaLister(url=repo_url, scheduler=swh_scheduler)
    lister.REPO_PATH = Path(tmp_path, "General")
    lister.REGISTRY_PATH = lister.REPO_PATH / "Registry.toml"

    assert lister.state.last_seen_commit == expected_last_seen_commit

    with porcelain.open_repo_closing(lister.REPO_PATH) as second_repo:
        new_expected_last_seen_commit = second_repo.head().decode("ascii")

    assert expected_last_seen_commit != new_expected_last_seen_commit

    # Second run
    second_stats = lister.run()
    assert lister.state.last_seen_commit == new_expected_last_seen_commit
    assert second_stats.pages == 1
    # One new package, one new version
    assert second_stats.origins == len(expected_origins_1)

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    expected_origins = {**expected_origins_0, **expected_origins_1}
    assert len(scheduler_origins) == len(expected_origins)
def test_julia_lister_incremental_no_changes(datadir, tmp_path, swh_scheduler):
    """Run the lister twice on the same repository state and check that the
    second run lists nothing new."""
    archive = Path(datadir, "fake-julia-registry-repository_0.tar.gz")
    repo_url = prepare_repository_from_archive(archive, "General", tmp_path)

    lister = JuliaLister(url=repo_url, scheduler=swh_scheduler)
    lister.REPO_PATH = Path(tmp_path, "General")
    lister.REGISTRY_PATH = lister.REPO_PATH / "Registry.toml"

    # Latest Git commit hash expected
    with porcelain.open_repo_closing(lister.REPO_PATH) as repo:
        expected_last_seen_commit = repo.head().decode("ascii")

    assert expected_last_seen_commit is not None
    assert lister.state.last_seen_commit is None

    # First run
    first_stats = lister.run()
    assert first_stats.pages == 1
    assert first_stats.origins == len(expected_origins_0)
    assert lister.state.last_seen_commit == expected_last_seen_commit

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    assert len(scheduler_origins) == len(expected_origins_0)

    # Prepare second run, repository state is the same as the one of the first run
    repo_url = prepare_repository_from_archive(archive, "General", tmp_path)
    lister = JuliaLister(url=repo_url, scheduler=swh_scheduler)
    assert lister.state.last_seen_commit == expected_last_seen_commit

    # Second run
    second_stats = lister.run()
    assert lister.state.last_seen_commit == expected_last_seen_commit
    assert second_stats.pages == 1
    # Nothing new
    assert second_stats.origins == 0
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment