Commit 91e4e33d authored by Antoine Lambert

cran: Improve listing of R packages

Previously, the lister relied on the CRANtools R module, but that
approach has the drawback of only listing the latest version of each
package registered in the CRAN registry.

In order to get all available versions of each CRAN package, exploit
instead the content of the weekly dump of the CRAN database in RDS format.

To read the content of the RDS file from Python, the rpy2 package is
used, as it has the advantage of being packaged in Debian.
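
As an illustration, a minimal sketch of reading such a dump with rpy2
(the file path is hypothetical; assumes rpy2 and a working R installation):

    from rpy2 import robjects

    # readRDS is evaluated by the embedded R interpreter
    robjects.r("df <- readRDS('/tmp/cran_info_db.rds')")
    r_df = robjects.r["df"]

    # row names are tarball paths, columns hold file metadata
    print(r_df.nrow, list(r_df.colnames))
    print(r_df.rownames[0])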

Related to swh/meta#1709.
parent 3a0e8b99
@@ -25,6 +25,9 @@ ignore_missing_imports = True
[mypy-lxml.*]
ignore_missing_imports = True
[mypy-pandas.*]
ignore_missing_imports = True
[mypy-pkg_resources.*]
ignore_missing_imports = True
@@ -37,6 +40,9 @@ ignore_missing_imports = True
[mypy-requests_mock.*]
ignore_missing_imports = True
[mypy-rpy2.*]
ignore_missing_imports = True
[mypy-urllib3.util.*]
ignore_missing_imports = True
......
pandas
pytest
pytest-mock
requests_mock
......
@@ -8,6 +8,7 @@ psycopg2
python_debian
repomd
requests
rpy2
setuptools
tenacity >= 6.2
testing.postgresql
#!/usr/bin/Rscript
# This R script calls the built-in API to get the list of
# all R packages and their descriptions, then converts the API
# response to a JSON string and prints it
db <- tools::CRAN_package_db()[, c("Package", "Version", "Packaged", "MD5sum")]
dbjson <- jsonlite::toJSON(db)
print(dbjson)
\ No newline at end of file
# Copyright (C) 2019-2021 the Software Heritage developers
# Copyright (C) 2019-2023 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from collections import defaultdict
from datetime import datetime, timezone
import json
import logging
import subprocess
from typing import Dict, Iterator, List, Optional, Tuple
import os
import tempfile
from typing import Any, Dict, Iterator, List, Optional, Tuple
from urllib.parse import urljoin
import pkg_resources
from rpy2 import robjects
from swh.lister.pattern import CredentialsType, StatelessLister
from swh.scheduler.interface import SchedulerInterface
@@ -16,17 +18,22 @@ from swh.scheduler.model import ListedOrigin
logger = logging.getLogger(__name__)
CRAN_MIRROR = "https://cran.r-project.org"
CRAN_MIRROR_URL = "https://cran.r-project.org"
CRAN_INFO_DB_URL = f"{CRAN_MIRROR_URL}/web/dbs/cran_info_db.rds"
PageType = List[Dict[str, str]]
# List[Tuple[origin_url, List[Dict[package_version, package_metadata]]]]
PageType = List[Tuple[str, List[Dict[str, Any]]]]
class CRANLister(StatelessLister[PageType]):
"""
List all packages hosted on The Comprehensive R Archive Network.
The lister downloads and parses the content of the weekly CRAN database
dump in RDS format, which references all downloadable package tarballs.
"""
LISTER_NAME = "CRAN"
LISTER_NAME = "cran"
def __init__(
self,
@@ -38,7 +45,7 @@ class CRANLister(StatelessLister[PageType]):
):
super().__init__(
scheduler,
url=CRAN_MIRROR,
url=CRAN_MIRROR_URL,
instance="cran",
credentials=credentials,
max_origins_per_page=max_origins_per_page,
@@ -50,110 +57,69 @@ class CRANLister(StatelessLister[PageType]):
"""
Yields a single page containing info about all CRAN packages.
"""
yield read_cran_data()
with tempfile.TemporaryDirectory() as tmpdir:
package_artifacts: Dict[str, Dict[str, Any]] = defaultdict(dict)
dest_path = os.path.join(tmpdir, os.path.basename(CRAN_INFO_DB_URL))
response = self.http_request(CRAN_INFO_DB_URL, stream=True)
with open(dest_path, "wb") as rds_file:
for chunk in response.iter_content(chunk_size=1024):
rds_file.write(chunk)
logger.debug("Parsing %s file", dest_path)
robjects.r(f"cran_info_db_df <- readRDS('{dest_path}')")
r_df = robjects.r["cran_info_db_df"]
colnames = list(r_df.colnames)
def _get_col_value(row, colname):
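# rpy2 exposes the R data frame column by column: select the
# column vector first, then index the row within it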
return r_df[colnames.index(colname)][row]
logger.debug("Processing CRAN packages")
for i in range(r_df.nrow):
tarball_path = r_df.rownames[i]
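# row names are absolute paths on the CRAN FTP server, e.g.
# /srv/ftp/pub/R/src/contrib/<name>_<version>.tar.gz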
package_info = tarball_path.split("/")[-1].replace(".tar.gz", "")
if "_" not in package_info and "-" not in package_info:
# skip package artifact with no version
continue
try:
package_name, package_version = package_info.split("_", maxsplit=1)
except ValueError:
# old artifacts can separate name and version with a dash
package_name, package_version = package_info.split("-", maxsplit=1)
package_artifacts[package_name][package_version] = {
"url": urljoin(
CRAN_MIRROR_URL, tarball_path.replace("/srv/ftp/pub/R", "")
),
"version": package_version,
"package": package_name,
"checksums": {"length": int(_get_col_value(i, "size"))},
"mtime": (
datetime.fromtimestamp(
_get_col_value(i, "mtime"), tz=timezone.utc
)
),
}
yield [
(f"{CRAN_MIRROR_URL}/package={package_name}", list(artifacts.values()))
for package_name, artifacts in package_artifacts.items()
]
def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]:
assert self.lister_obj.id is not None
seen_urls = set()
for package_info in page:
origin_url, artifact_url = compute_origin_urls(package_info)
if origin_url in seen_urls:
# prevent multiple listing of an origin,
# most recent version will be listed first
continue
seen_urls.add(origin_url)
for origin_url, artifacts in page:
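# mtime only serves to compute last_update below; drop it
# from the artifacts passed to the loader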
mtimes = [artifact.pop("mtime") for artifact in artifacts]
yield ListedOrigin(
lister_id=self.lister_obj.id,
url=origin_url,
visit_type="cran",
last_update=parse_packaged_date(package_info),
last_update=max(mtimes),
extra_loader_arguments={
"artifacts": [
{
"url": artifact_url,
"version": package_info["Version"],
"package": package_info["Package"],
"checksums": {"md5": package_info["MD5sum"]},
}
]
},
)
def read_cran_data() -> List[Dict[str, str]]:
"""
Runs an R script which uses the built-in API to return a JSON response
containing data about the R packages.
Returns:
List of Dict about R packages. For example::
[
{
'Package': 'A3',
'Version': '1.0.0'
},
{
'Package': 'abbyyR',
'Version': '0.5.4'
"artifacts": list(sorted(artifacts, key=lambda a: a["version"]))
},
...
]
"""
filepath = pkg_resources.resource_filename("swh.lister.cran", "list_all_packages.R")
logger.debug("Executing R script %s", filepath)
response = subprocess.run(filepath, stdout=subprocess.PIPE, shell=False)
return json.loads(response.stdout.decode("utf-8"))
def compute_origin_urls(package_info: Dict[str, str]) -> Tuple[str, str]:
"""Compute the package url from the repo dict.
Args:
repo: dict with key 'Package', 'Version'
Returns:
the tuple project url, artifact url
"""
package = package_info["Package"]
version = package_info["Version"]
origin_url = f"{CRAN_MIRROR}/package={package}"
artifact_url = f"{CRAN_MIRROR}/src/contrib/{package}_{version}.tar.gz"
return origin_url, artifact_url
def parse_packaged_date(package_info: Dict[str, str]) -> Optional[datetime]:
packaged_at_str = package_info.get("Packaged", "")
packaged_at = None
if packaged_at_str:
packaged_at_str = packaged_at_str.replace(" UTC", "")
# Packaged field possible formats:
# - "%Y-%m-%d %H:%M:%S[.%f] UTC; <packager>",
# - "%a %b %d %H:%M:%S %Y; <packager>"
for date_format in (
"%Y-%m-%d %H:%M:%S",
"%Y-%m-%d %H:%M:%S.%f",
"%a %b %d %H:%M:%S %Y",
):
try:
packaged_at = datetime.strptime(
packaged_at_str.split(";")[0],
date_format,
).replace(tzinfo=timezone.utc)
break
except Exception:
continue
if packaged_at is None:
logger.debug(
"Could not parse %s package release date: %s",
package_info["Package"],
packaged_at_str,
)
return packaged_at
[
{
"Package": "cNORM",
"Version": "3.0.2",
"Packaged": "2022-06-12 08:46:39 UTC; gbpa005",
"MD5sum": "d878686afc17b990e500dc88afb3a990"
},
{
"Package": "CNprep",
"Version": "2.2",
"Packaged": "2022-05-23 23:58:37 UTC; Astrid",
"MD5sum": "4b6ddc37df607c79b7fb50a96a57197f"
},
{
"Package": "CNPS",
"Version": "1.0.0",
"Packaged": "2021-05-21 16:55:04 UTC; Surface",
"MD5sum": "deac071a9387e3a296481d041e6d09ee"
},
{
"Package": "cns",
"Version": "0.1.0",
"Packaged": "2021-07-16 19:30:51 UTC; nfultz",
"MD5sum": "3ad5a474260dbacb889be461b826a73b"
},
{
"Package": "cnum",
"Version": "0.1.3",
"Packaged": "2021-01-11 13:24:52 UTC; Elgar",
"MD5sum": "3cb5ab3fdaf4277d1ebfbe147e8990e1"
}
]
\ No newline at end of file
# Copyright (C) 2019-2021 The Software Heritage developers
# Copyright (C) 2019-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timezone
import json
from os import path
import pandas
import pytest
from swh.lister.cran.lister import (
CRAN_MIRROR,
CRANLister,
compute_origin_urls,
parse_packaged_date,
)
def test_cran_compute_origin_urls():
pack = "something"
vers = "0.0.1"
origin_url, artifact_url = compute_origin_urls(
{
"Package": pack,
"Version": vers,
}
from rpy2 import robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
try:
from rpy2.robjects.conversion import py2rpy
except ImportError:
# for old rpy2 versions (fix debian buster package build)
from rpy2.robjects.pandas2ri import py2ri as py2rpy # noqa
from swh.lister.cran.lister import CRAN_INFO_DB_URL, CRAN_MIRROR_URL, CRANLister
CRAN_INFO_DB_DATA = {
"/srv/ftp/pub/R/src/contrib/Archive/zooimage/zooimage_3.0-3.tar.gz": {
"size": 2482446.0,
"isdir": False,
"mode": 436,
"mtime": pandas.Timestamp("2013-02-11 20:03:00.351782400"),
"ctime": pandas.Timestamp("2023-08-12 16:02:51.731749120"),
"atime": pandas.Timestamp("2023-08-12 18:26:53.976175872"),
"uid": 1001,
"gid": 1001,
"uname": "hornik",
"grname": "cranadmin",
},
"/srv/ftp/pub/R/src/contrib/Archive/zooimage/zooimage_3.0-5.tar.gz": {
"size": 2483495.0,
"isdir": False,
"mode": 436,
"mtime": pandas.Timestamp("2014-03-02 12:20:17.842085376"),
"ctime": pandas.Timestamp("2023-08-12 16:02:51.731749120"),
"atime": pandas.Timestamp("2023-08-12 18:26:53.976175872"),
"uid": 1008,
"gid": 1001,
"uname": "ripley",
"grname": "cranadmin",
},
"/srv/ftp/pub/R/src/contrib/zooimage_5.5.2.tar.gz": {
"size": 2980492.0,
"isdir": False,
"mode": 436,
"mtime": pandas.Timestamp("2018-06-29 16:00:29.281795328"),
"ctime": pandas.Timestamp("2023-08-12 16:02:52.227744768"),
"atime": pandas.Timestamp("2023-08-12 18:13:24.175266560"),
"uid": 1010,
"gid": 1001,
"uname": "ligges",
"grname": "cranadmin",
},
"/srv/ftp/pub/R/src/contrib/Archive/xtune/xtune_0.1.0.tar.gz": {
"size": 366098.0,
"isdir": False,
"mode": 436,
"mtime": pandas.Timestamp("2019-05-24 09:00:04.697701120"),
"ctime": pandas.Timestamp("2023-08-12 16:02:52.135745536"),
"atime": pandas.Timestamp("2023-08-12 18:28:29.483338752"),
"uid": 1010,
"gid": 1001,
"uname": "ligges",
"grname": "cranadmin",
},
"/srv/ftp/pub/R/src/contrib/xtune_2.0.0.tar.gz": {
"size": 4141076.0,
"isdir": False,
"mode": 436,
"mtime": pandas.Timestamp("2023-06-18 22:40:04.242652416"),
"ctime": pandas.Timestamp("2023-08-12 16:02:52.279744512"),
"atime": pandas.Timestamp("2023-08-12 18:12:28.311755776"),
"uid": 1010,
"gid": 1001,
"uname": "ligges",
"grname": "cranadmin",
},
"/srv/ftp/pub/R/src/contrib/Old/0.50/bootstrap.tar.gz": {
"size": 16306.0,
"isdir": False,
"mode": 436,
"mtime": pandas.Timestamp("1997-04-16 10:10:36"),
"ctime": pandas.Timestamp("2023-08-12 16:02:51.571750400"),
"atime": pandas.Timestamp("2023-08-12 18:12:45.115608576"),
"uid": 0,
"gid": 1001,
"uname": "root",
"grname": "cranadmin",
},
}
@pytest.fixture
def cran_info_db_rds_path(tmp_path):
"""Build a sample RDS file with small extract of CRAN database"""
df = pandas.DataFrame.from_dict(
CRAN_INFO_DB_DATA,
orient="index",
)
assert origin_url == f"{CRAN_MIRROR}/package={pack}"
assert artifact_url == f"{CRAN_MIRROR}/src/contrib/{pack}_{vers}.tar.gz"
def test_cran_compute_origin_urls_failure():
for incomplete_repo in [{"Version": "0.0.1"}, {"Package": "package"}, {}]:
with pytest.raises(KeyError):
compute_origin_urls(incomplete_repo)
def test_parse_packaged_date():
common_date_format = {
"Package": "test",
"Packaged": "2017-04-26 11:36:15 UTC; Jonathan",
}
assert parse_packaged_date(common_date_format) == datetime(
year=2017, month=4, day=26, hour=11, minute=36, second=15, tzinfo=timezone.utc
)
common_date_format = {
"Package": "test",
"Packaged": "2017-04-26 11:36:15.123456 UTC; Jonathan",
}
assert parse_packaged_date(common_date_format) == datetime(
year=2017,
month=4,
day=26,
hour=11,
minute=36,
second=15,
microsecond=123456,
tzinfo=timezone.utc,
)
old_date_format = {
"Package": "test",
"Packaged": "Thu Mar 30 10:48:35 2006; hornik",
}
assert parse_packaged_date(old_date_format) == datetime(
year=2006, month=3, day=30, hour=10, minute=48, second=35, tzinfo=timezone.utc
)
invalid_date_format = {
"Package": "test",
"Packaged": "foo",
}
assert parse_packaged_date(invalid_date_format) is None
missing_date = {
"Package": "test",
}
assert parse_packaged_date(missing_date) is None
rds_path = path.join(tmp_path, "cran_info_db.rds")
# Convert pandas dataframe to R dataframe
with localconverter(robjects.default_converter + pandas2ri.converter):
r_df = py2rpy(df)
robjects.r.assign("cran_info_db_df", r_df)
robjects.r(f"saveRDS(cran_info_db_df, file='{rds_path}')")
return rds_path
def test_cran_lister_cran(datadir, swh_scheduler, mocker):
with open(path.join(datadir, "list-r-packages.json")) as f:
cran_data = json.loads(f.read())
def test_cran_lister_cran(swh_scheduler, requests_mock, cran_info_db_rds_path):
lister = CRANLister(swh_scheduler)
with open(cran_info_db_rds_path, "rb") as cran_db_rds:
mock_cran = mocker.patch("swh.lister.cran.lister.read_cran_data")
requests_mock.get(CRAN_INFO_DB_URL, body=cran_db_rds)
mock_cran.return_value = cran_data
lister = CRANLister(swh_scheduler)
stats = lister.run()
stats = lister.run()
assert stats.pages == 1
assert stats.origins == len(cran_data)
assert stats.pages == 1
assert stats.origins == 2
scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
assert len(scheduler_origins) == len(cran_data)
for package_info in cran_data:
origin_url, artifact_url = compute_origin_urls(package_info)
filtered_origins = [o for o in scheduler_origins if o.url == origin_url]
assert len(filtered_origins) == 1
assert filtered_origins[0].extra_loader_arguments == {
"artifacts": [
{
"url": artifact_url,
"version": package_info["Version"],
"package": package_info["Package"],
"checksums": {"md5": package_info["MD5sum"]},
}
]
scheduler_origins = {
o.url: o
for o in swh_scheduler.get_listed_origins(lister.lister_obj.id).results
}
filtered_origins[0].last_update == parse_packaged_date(package_info)
def test_cran_lister_duplicated_origins(datadir, swh_scheduler, mocker):
with open(path.join(datadir, "list-r-packages.json")) as f:
cran_data = json.loads(f.read())
lister = CRANLister(swh_scheduler)
mock_cran = mocker.patch("swh.lister.cran.lister.read_cran_data")
mock_cran.return_value = cran_data + cran_data
stats = lister.run()
assert set(scheduler_origins.keys()) == {
f"{CRAN_MIRROR_URL}/package=zooimage",
f"{CRAN_MIRROR_URL}/package=xtune",
}
assert stats.pages == 1
assert stats.origins == len(cran_data)
assert scheduler_origins[
f"{CRAN_MIRROR_URL}/package=zooimage"
].extra_loader_arguments["artifacts"] == [
{
"url": f"{CRAN_MIRROR_URL}/src/contrib/Archive/zooimage/zooimage_3.0-3.tar.gz", # noqa
"package": "zooimage",
"version": "3.0-3",
"checksums": {"length": 2482446},
},
{
"url": f"{CRAN_MIRROR_URL}/src/contrib/Archive/zooimage/zooimage_3.0-5.tar.gz", # noqa
"package": "zooimage",
"version": "3.0-5",
"checksums": {"length": 2483495},
},
{
"url": f"{CRAN_MIRROR_URL}/src/contrib/zooimage_5.5.2.tar.gz",
"package": "zooimage",
"version": "5.5.2",
"checksums": {"length": 2980492},
},
]
assert scheduler_origins[f"{CRAN_MIRROR_URL}/package=xtune"].extra_loader_arguments[
"artifacts"
] == [
{
"url": f"{CRAN_MIRROR_URL}/src/contrib/Archive/xtune/xtune_0.1.0.tar.gz",
"package": "xtune",
"version": "0.1.0",
"checksums": {"length": 366098},
},
{
"url": f"{CRAN_MIRROR_URL}/src/contrib/xtune_2.0.0.tar.gz",
"package": "xtune",
"version": "2.0.0",
"checksums": {"length": 4141076},
},
]
@pytest.mark.parametrize(
@@ -139,7 +181,7 @@ def test_cran_lister_duplicated_origins(datadir, swh_scheduler, mocker):
(None, []),
({"key": "value"}, []),
(
{"CRAN": {"cran": [{"username": "user", "password": "pass"}]}},
{"cran": {"cran": [{"username": "user", "password": "pass"}]}},
[{"username": "user", "password": "pass"}],
),
],
@@ -147,6 +189,7 @@ def test_cran_lister_duplicated_origins(datadir, swh_scheduler, mocker):
def test_lister_cran_instantiation_with_credentials(
credentials, expected_credentials, swh_scheduler
):
lister = CRANLister(swh_scheduler, credentials=credentials)
# Credentials are allowed in constructor
@@ -154,6 +197,7 @@
def test_lister_cran_from_configfile(swh_scheduler_config, mocker):
load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar")
load_from_envvar.return_value = {
"scheduler": {"cls": "local", **swh_scheduler_config},
......