Skip to content
Snippets Groups Projects
Commit 8ff418fb authored by Franck Bret's avatar Franck Bret
Browse files

Conda: List origins for Anaconda, the package manager that provides tooling for datascience

Related T4547
parent fd1a4244
No related branches found
No related tags found
No related merge requests found
Showing
with 387 additions and 0 deletions
......@@ -60,6 +60,7 @@ setup(
lister.bitbucket=swh.lister.bitbucket:register
lister.bower=swh.lister.bower:register
lister.cgit=swh.lister.cgit:register
lister.conda=swh.lister.conda:register
lister.cran=swh.lister.cran:register
lister.crates=swh.lister.crates:register
lister.debian=swh.lister.debian:register
......
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""
Conda lister
============
Anaconda is a package manager that provides tooling for datascience.
The Conda lister lists `packages`_ from Anaconda `repositories`_.
Those repositories host packages for several languages (Python, R), operating systems
and architectures.
Packages are grouped within free or commercial `channels`_.
To instantiate a conda lister we need to give some `channel` and `archs` arguments::
lister = CondaLister(
scheduler=swh_scheduler, channel="free", archs=["linux-64", "osx-64", "win-64"]
)
The default `url` value of lister is `https://repo.anaconda.com/pkgs`. One can set another
repository url, for example::
lister = CondaLister(
scheduler=swh_scheduler,
url="https://conda.anaconda.org",
channel="conda-forge",
archs=["linux-64"],
)
Origins retrieving strategy
---------------------------
Each channel provides several `repodata.json`_ files that list available packages
and related versions.
Given a channel and a list of systems and architectures, the lister downloads and parses
the corresponding repodata.json files.
We use bz2 compressed version of repodata.json. See for example `main/linux-64`_ page
to view available repodata files.
Page listing
------------
The lister returns one page per channel / architecture that lists all available package
versions.
Origins from page
-----------------
Origins urls are built following this pattern `https://anaconda.org/{channel}/{pkgname}`.
Each origin is yielded with an `artifacts` entry in `extra_loader_arguments` that lists
artifact metadata for each archived package version.
Origin data example for one origin with two related versions::
{
"url": "https://anaconda.org/conda-forge/lifetimes",
"artifacts": {
"linux-64/0.11.1-py36h9f0ad1d_1": {
"url": "https://conda.anaconda.org/conda-forge/linux-64/lifetimes-0.11.1-py36h9f0ad1d_1.tar.bz2", # noqa: B950
"date": "2020-07-06T12:19:36.425000+00:00",
"version": "0.11.1",
"filename": "lifetimes-0.11.1-py36h9f0ad1d_1.tar.bz2",
"checksums": {
"md5": "faa398f7ba0d60ce44aa6eeded490cee",
"sha256": "f82a352dfae8abceeeaa538b220fd9c5e4aa4e59092a6a6cea70b9ec0581ea03", # noqa: B950
},
},
"linux-64/0.11.1-py36hc560c46_1": {
"url": "https://conda.anaconda.org/conda-forge/linux-64/lifetimes-0.11.1-py36hc560c46_1.tar.bz2", # noqa: B950
"date": "2020-07-06T12:19:37.032000+00:00",
"version": "0.11.1",
"filename": "lifetimes-0.11.1-py36hc560c46_1.tar.bz2",
"checksums": {
"md5": "c53a689a4c5948e84211bdfc23e3fe68",
"sha256": "76146c2ebd6e3b65928bde53a2585287759d77beba785c0eeb889ee565c0035d", # noqa: B950
},
},
},
}
Running tests
-------------
Activate the virtualenv and run from within swh-lister directory::
pytest -s -vv --log-cli-level=DEBUG swh/lister/conda/tests
Testing with Docker
-------------------
Change directory to swh/docker then launch the docker environment::
docker compose up -d
Then schedule a conda listing task::
docker compose exec swh-scheduler swh scheduler task add -p oneshot list-conda channel="free" archs="[linux-64, osx-64, win-64]" # noqa: B950
You can follow lister execution by displaying logs of swh-lister service::
docker compose logs -f swh-lister
.. _packages: https://docs.anaconda.com/anaconda/packages/pkg-docs/
.. _Anaconda: https://anaconda.com/
.. _repositories: https://repo.anaconda.com/pkgs/
.. _channels: https://docs.anaconda.com/anaconda/user-guide/tasks/using-repositories/
.. _main/linux-64: https://repo.anaconda.com/pkgs/main/linux-64/
.. _repodata.json: https://repo.anaconda.com/pkgs/free/linux-64/repodata.json
"""
def register():
    """Return the registration data for the conda lister plugin.

    The returned mapping holds the lister class under ``"lister"`` and the
    dotted name of this package's ``tasks`` module under ``"task_modules"``.
    """
    # Imported lazily so that importing this package does not pull in the
    # lister implementation until registration is actually requested.
    from .lister import CondaLister

    tasks_module = f"{__name__}.tasks"
    return {"lister": CondaLister, "task_modules": [tasks_module]}
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import bz2
from collections import defaultdict
import datetime
import json
import logging
from typing import Any, Dict, Iterator, List, Optional, Tuple
import iso8601
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from ..pattern import CredentialsType, StatelessLister
logger = logging.getLogger(__name__)
# Aliasing the page results returned by `get_pages` method from the lister.
# A page is a pair ``(arch, packages)``: the architecture name and the
# "packages" mapping of that architecture's repodata, keyed by archive filename.
CondaListerPage = Tuple[str, Dict[str, Dict[str, Any]]]
class CondaLister(StatelessLister[CondaListerPage]):
    """List Conda (anaconda.com) origins.

    One page is produced per architecture in ``archs``. Each page is the
    ``"packages"`` mapping of the bz2-compressed ``repodata.json`` file
    published for the configured channel and that architecture.
    """

    LISTER_NAME = "conda"
    VISIT_TYPE = "conda"
    INSTANCE = "conda"
    BASE_REPO_URL = "https://repo.anaconda.com/pkgs"
    REPO_URL_PATTERN = "{url}/{channel}/{arch}/repodata.json.bz2"
    ORIGIN_URL_PATTERN = "https://anaconda.org/{channel}/{pkgname}"
    ARCHIVE_URL_PATTERN = "{url}/{channel}/{arch}/{filename}"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        credentials: Optional[CredentialsType] = None,
        url: str = BASE_REPO_URL,
        channel: str = "",
        archs: Optional[List[str]] = None,
    ):
        """Set up the lister.

        Args:
            scheduler: scheduler interface forwarded to the base lister
            credentials: optional credentials forwarded to the base lister
            url: base url of the package repository
            channel: name of the channel to list (e.g. ``"free"``)
            archs: architectures to list (e.g. ``["linux-64", "osx-64"]``);
                defaults to an empty list, in which case no page is listed
        """
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            instance=self.INSTANCE,
            url=url,
        )
        self.channel: str = channel
        # Copy the argument (and avoid a mutable default value) so that the
        # instance never shares its list with the caller or other instances.
        self.archs: List[str] = list(archs) if archs is not None else []
        # Artifacts accumulated so far, per package name then per version key.
        self.packages: Dict[str, Any] = defaultdict(dict)
        # Every release date seen so far, per package name.
        self.package_dates: Dict[str, Any] = defaultdict(list)

    def get_pages(self) -> Iterator[CondaListerPage]:
        """Yield one ``(arch, packages)`` page per configured architecture.

        For each architecture, the matching ``repodata.json.bz2`` file is
        fetched, decompressed and parsed; its ``"packages"`` entry is the
        page content.
        """
        for arch in self.archs:
            repodata_url = self.REPO_URL_PATTERN.format(
                url=self.url, channel=self.channel, arch=arch
            )
            response = self.http_request(url=repodata_url)
            packages = json.loads(bz2.decompress(response.content))["packages"]
            yield (arch, packages)

    def get_origins_from_page(self, page: CondaListerPage) -> Iterator[ListedOrigin]:
        """Yield a ListedOrigin for each package archive listed in ``page``.

        Artifacts are accumulated per package name across archives (and across
        pages), so the same origin url may be yielded several times, each time
        with the artifacts known so far.
        NOTE(review): deduplication of origins sharing a url presumably happens
        downstream of this method; confirm against the base lister.
        """
        assert self.lister_obj.id is not None
        arch, packages = page

        for filename, package_metadata in packages.items():
            artifact = {
                "filename": filename,
                "url": self.ARCHIVE_URL_PATTERN.format(
                    url=self.url,
                    channel=self.channel,
                    filename=filename,
                    arch=arch,
                ),
                "version": package_metadata["version"],
                "checksums": {},
            }
            # Checksums are optional in the metadata: keep the ones provided.
            for checksum in ("md5", "sha256"):
                if checksum in package_metadata:
                    artifact["checksums"][checksum] = package_metadata[checksum]

            version_key = (
                f"{arch}/{package_metadata['version']}-{package_metadata['build']}"
            )
            name = package_metadata["name"]
            self.packages[name][version_key] = artifact

            package_date = None
            if "timestamp" in package_metadata:
                # The timestamp is divided by 1e3 before conversion, i.e. it is
                # handled as a number of milliseconds.
                package_date = datetime.datetime.fromtimestamp(
                    package_metadata["timestamp"] / 1e3, datetime.timezone.utc
                )
            elif "date" in package_metadata:
                package_date = iso8601.parse_date(package_metadata["date"])

            last_update = None
            if package_date is not None:
                artifact["date"] = package_date.isoformat()
                self.package_dates[name].append(package_date)
                # Most recent date among all archives seen so far for the package.
                last_update = max(self.package_dates[name])

            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                visit_type=self.VISIT_TYPE,
                url=self.ORIGIN_URL_PATTERN.format(channel=self.channel, pkgname=name),
                last_update=last_update,
                extra_loader_arguments={
                    "artifacts": self.packages[name],
                },
            )
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from celery import shared_task
from swh.lister.conda.lister import CondaLister
@shared_task(name=__name__ + ".CondaListerTask")
def list_conda(**lister_args):
    """Lister task for Anaconda registry"""
    # Build the lister from its configuration file, forwarding the task
    # arguments, then run it and return the resulting stats as a dict.
    lister = CondaLister.from_configfile(**lister_args)
    stats = lister.run()
    return stats.dict()
@shared_task(name=__name__ + ".ping")
def _ping():
    """Trivial task replying with the constant string "OK"."""
    reply = "OK"
    return reply
File added
File added
File added
File added
File added
File added
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.lister.conda.lister import CondaLister
def test_conda_lister_free_channel(datadir, requests_mock_datadir, swh_scheduler):
    """Listing the "free" channel for three architectures gives three pages."""
    architectures = ["linux-64", "osx-64", "win-64"]
    lister = CondaLister(
        scheduler=swh_scheduler,
        channel="free",
        archs=architectures,
    )

    stats = lister.run()

    # One page per architecture.
    assert stats.pages == 3
    assert stats.origins == 14
def test_conda_lister_conda_forge_channel(
    datadir, requests_mock_datadir, swh_scheduler
):
    """List the "conda-forge" channel from a non-default repository url and
    check the origins recorded in the scheduler, artifacts included."""
    lister = CondaLister(
        scheduler=swh_scheduler,
        url="https://conda.anaconda.org",
        channel="conda-forge",
        archs=["linux-64"],
    )

    stats = lister.run()

    assert stats.pages == 1
    assert stats.origins == 2

    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results

    expected_origins = [
        {
            "url": "https://anaconda.org/conda-forge/21cmfast",
            "artifacts": {
                "linux-64/3.0.2-py36h1af98f8_1": {
                    "url": "https://conda.anaconda.org/conda-forge/linux-64/21cmfast-3.0.2-py36h1af98f8_1.tar.bz2",  # noqa: B950
                    "date": "2020-11-11T16:04:49.658000+00:00",
                    "version": "3.0.2",
                    "filename": "21cmfast-3.0.2-py36h1af98f8_1.tar.bz2",
                    "checksums": {
                        "md5": "d65ab674acf3b7294ebacaec05fc5b54",
                        "sha256": "1154fceeb5c4ee9bb97d245713ac21eb1910237c724d2b7103747215663273c2",  # noqa: B950
                    },
                }
            },
        },
        {
            "url": "https://anaconda.org/conda-forge/lifetimes",
            "artifacts": {
                "linux-64/0.11.1-py36h9f0ad1d_1": {
                    "url": "https://conda.anaconda.org/conda-forge/linux-64/lifetimes-0.11.1-py36h9f0ad1d_1.tar.bz2",  # noqa: B950
                    "date": "2020-07-06T12:19:36.425000+00:00",
                    "version": "0.11.1",
                    "filename": "lifetimes-0.11.1-py36h9f0ad1d_1.tar.bz2",
                    "checksums": {
                        "md5": "faa398f7ba0d60ce44aa6eeded490cee",
                        "sha256": "f82a352dfae8abceeeaa538b220fd9c5e4aa4e59092a6a6cea70b9ec0581ea03",  # noqa: B950
                    },
                },
                "linux-64/0.11.1-py36hc560c46_1": {
                    "url": "https://conda.anaconda.org/conda-forge/linux-64/lifetimes-0.11.1-py36hc560c46_1.tar.bz2",  # noqa: B950
                    "date": "2020-07-06T12:19:37.032000+00:00",
                    "version": "0.11.1",
                    "filename": "lifetimes-0.11.1-py36hc560c46_1.tar.bz2",
                    "checksums": {
                        "md5": "c53a689a4c5948e84211bdfc23e3fe68",
                        "sha256": "76146c2ebd6e3b65928bde53a2585287759d77beba785c0eeb889ee565c0035d",  # noqa: B950
                    },
                },
            },
        },
    ]

    assert len(scheduler_origins) == len(expected_origins)

    # Compare both sides in url order so that the check does not depend on the
    # order in which the scheduler returns the origins.
    actual = [
        (origin.visit_type, origin.url, origin.extra_loader_arguments["artifacts"])
        for origin in sorted(scheduler_origins, key=lambda origin: origin.url)
    ]
    wanted = [
        ("conda", origin["url"], origin["artifacts"])
        for origin in sorted(expected_origins, key=lambda origin: origin["url"])
    ]
    assert actual == wanted
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.lister.pattern import ListerStats
def test_conda_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """The ping task completes successfully and answers "OK"."""
    task = swh_scheduler_celery_app.send_task("swh.lister.conda.tasks.ping")
    assert task

    task.wait()

    assert task.successful()
    assert task.result == "OK"
def test_conda_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
    """The lister task builds the lister from its config file, runs it and
    returns the run stats as a dict."""
    expected_stats = ListerStats(pages=42, origins=42)

    # setup the mocked CondaLister: the patched class is also used as the
    # instance returned by from_configfile
    mocked_lister = mocker.patch("swh.lister.conda.tasks.CondaLister")
    mocked_lister.from_configfile.return_value = mocked_lister
    mocked_lister.run.return_value = expected_stats

    task = swh_scheduler_celery_app.send_task("swh.lister.conda.tasks.CondaListerTask")
    assert task
    task.wait()
    assert task.successful()
    assert task.result == expected_stats.dict()

    mocked_lister.from_configfile.assert_called_once_with()
    mocked_lister.run.assert_called_once_with()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment