diff --git a/mypy.ini b/mypy.ini
index 76468c2ad0ed728c250ea209d637c76eeb3856ee..67c2c7cfdae6278adc6bf6265759d22786cd3f12 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -25,6 +25,9 @@ ignore_missing_imports = True
 [mypy-lxml.*]
 ignore_missing_imports = True
 
+[mypy-pandas.*]
+ignore_missing_imports = True
+
 [mypy-pkg_resources.*]
 ignore_missing_imports = True
 
@@ -37,6 +40,9 @@ ignore_missing_imports = True
 [mypy-requests_mock.*]
 ignore_missing_imports = True
 
+[mypy-rpy2.*]
+ignore_missing_imports = True
+
 [mypy-urllib3.util.*]
 ignore_missing_imports = True
 
diff --git a/requirements-test.txt b/requirements-test.txt
index c73a59f74a4d3b5697762085af88bd19948032a1..977c91ea6b07479005977da884843621f81dc4ab 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,3 +1,4 @@
+pandas
 pytest
 pytest-mock
 requests_mock
diff --git a/requirements.txt b/requirements.txt
index 3dd137af20004520a6848881f650358559f2034f..bf2beb6e8ec2a1958b4bb1553ceb64cb55d51337 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,6 +8,7 @@ psycopg2
 python_debian
 repomd
 requests
+rpy2
 setuptools
 tenacity >= 6.2
 testing.postgresql
diff --git a/swh/lister/cran/list_all_packages.R b/swh/lister/cran/list_all_packages.R
deleted file mode 100755
index 67d9c6db1def1f3a69c82777dd6271a04f4b946c..0000000000000000000000000000000000000000
--- a/swh/lister/cran/list_all_packages.R
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/usr/bin/Rscript
-
-# This R script calls the buildin API to get list of
-# all the packages of R and their description, then convert the API
-# response to JSON string and print it
-
-db <- tools::CRAN_package_db()[, c("Package", "Version", "Packaged", "MD5sum")]
-dbjson <- jsonlite::toJSON(db)
-print(dbjson)
\ No newline at end of file
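
The deleted R script used to shell out to `tools::CRAN_package_db()`; the rewritten lister below instead downloads CRAN's weekly `cran_info_db.rds` dump and parses it in-process with rpy2. A minimal standalone sketch of that flow, for illustration only — the lister itself streams the download through its inherited `http_request` helper rather than raw requests:

```python
import os
import tempfile

import requests
from rpy2 import robjects

CRAN_INFO_DB_URL = "https://cran.r-project.org/web/dbs/cran_info_db.rds"

with tempfile.TemporaryDirectory() as tmpdir:
    dest_path = os.path.join(tmpdir, "cran_info_db.rds")
    with open(dest_path, "wb") as f:
        f.write(requests.get(CRAN_INFO_DB_URL).content)

    # Load the RDS file through the embedded R interpreter
    robjects.r(f"cran_info_db_df <- readRDS('{dest_path}')")
    r_df = robjects.r["cran_info_db_df"]

    print(r_df.nrow)            # one row per tarball hosted on the mirror
    print(list(r_df.colnames))  # file metadata columns: size, mtime, ...
    print(r_df.rownames[0])     # row names are paths under /srv/ftp/pub/R
```
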
""" - LISTER_NAME = "CRAN" + LISTER_NAME = "cran" def __init__( self, @@ -38,7 +45,7 @@ class CRANLister(StatelessLister[PageType]): ): super().__init__( scheduler, - url=CRAN_MIRROR, + url=CRAN_MIRROR_URL, instance="cran", credentials=credentials, max_origins_per_page=max_origins_per_page, @@ -50,110 +57,69 @@ class CRANLister(StatelessLister[PageType]): """ Yields a single page containing all CRAN packages info. """ - yield read_cran_data() + + with tempfile.TemporaryDirectory() as tmpdir: + package_artifacts: Dict[str, Dict[str, Any]] = defaultdict(dict) + dest_path = os.path.join(tmpdir, os.path.basename(CRAN_INFO_DB_URL)) + + response = self.http_request(CRAN_INFO_DB_URL, stream=True) + with open(dest_path, "wb") as rds_file: + for chunk in response.iter_content(chunk_size=1024): + rds_file.write(chunk) + + logger.debug("Parsing %s file", dest_path) + robjects.r(f"cran_info_db_df <- readRDS('{dest_path}')") + r_df = robjects.r["cran_info_db_df"] + colnames = list(r_df.colnames) + + def _get_col_value(row, colname): + return r_df[colnames.index(colname)][row] + + logger.debug("Processing CRAN packages") + for i in range(r_df.nrow): + tarball_path = r_df.rownames[i] + package_info = tarball_path.split("/")[-1].replace(".tar.gz", "") + if "_" not in package_info and "-" not in package_info: + # skip package artifact with no version + continue + + try: + package_name, package_version = package_info.split("_", maxsplit=1) + except ValueError: + # old artifacts can separate name and version with a dash + package_name, package_version = package_info.split("-", maxsplit=1) + + package_artifacts[package_name][package_version] = { + "url": urljoin( + CRAN_MIRROR_URL, tarball_path.replace("/srv/ftp/pub/R", "") + ), + "version": package_version, + "package": package_name, + "checksums": {"length": int(_get_col_value(i, "size"))}, + "mtime": ( + datetime.fromtimestamp( + _get_col_value(i, "mtime"), tz=timezone.utc + ) + ), + } + + yield [ + (f"{CRAN_MIRROR_URL}/package={package_name}", list(artifacts.values())) + for package_name, artifacts in package_artifacts.items() + ] def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]: assert self.lister_obj.id is not None - seen_urls = set() - for package_info in page: - origin_url, artifact_url = compute_origin_urls(package_info) - - if origin_url in seen_urls: - # prevent multiple listing of an origin, - # most recent version will be listed first - continue - - seen_urls.add(origin_url) + for origin_url, artifacts in page: + mtimes = [artifact.pop("mtime") for artifact in artifacts] yield ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, visit_type="cran", - last_update=parse_packaged_date(package_info), + last_update=max(mtimes), extra_loader_arguments={ - "artifacts": [ - { - "url": artifact_url, - "version": package_info["Version"], - "package": package_info["Package"], - "checksums": {"md5": package_info["MD5sum"]}, - } - ] - }, - ) - - -def read_cran_data() -> List[Dict[str, str]]: - """ - Runs R script which uses inbuilt API to return a json response - containing data about the R packages. - - Returns: - List of Dict about R packages. For example:: - - [ - { - 'Package': 'A3', - 'Version': '1.0.0' - }, - { - 'Package': 'abbyyR', - 'Version': '0.5.4' + "artifacts": list(sorted(artifacts, key=lambda a: a["version"])) }, - ... 
diff --git a/swh/lister/cran/tests/data/list-r-packages.json b/swh/lister/cran/tests/data/list-r-packages.json
deleted file mode 100644
index 7357cd58e046755a6998d49417550171249b713c..0000000000000000000000000000000000000000
--- a/swh/lister/cran/tests/data/list-r-packages.json
+++ /dev/null
@@ -1,32 +0,0 @@
-[
-    {
-        "Package": "cNORM",
-        "Version": "3.0.2",
-        "Packaged": "2022-06-12 08:46:39 UTC; gbpa005",
-        "MD5sum": "d878686afc17b990e500dc88afb3a990"
-    },
-    {
-        "Package": "CNprep",
-        "Version": "2.2",
-        "Packaged": "2022-05-23 23:58:37 UTC; Astrid",
-        "MD5sum": "4b6ddc37df607c79b7fb50a96a57197f"
-    },
-    {
-        "Package": "CNPS",
-        "Version": "1.0.0",
-        "Packaged": "2021-05-21 16:55:04 UTC; Surface",
-        "MD5sum": "deac071a9387e3a296481d041e6d09ee"
-    },
-    {
-        "Package": "cns",
-        "Version": "0.1.0",
-        "Packaged": "2021-07-16 19:30:51 UTC; nfultz",
-        "MD5sum": "3ad5a474260dbacb889be461b826a73b"
-    },
-    {
-        "Package": "cnum",
-        "Version": "0.1.3",
-        "Packaged": "2021-01-11 13:24:52 UTC; Elgar",
-        "MD5sum": "3cb5ab3fdaf4277d1ebfbe147e8990e1"
-    }
-]
\ No newline at end of file
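
The static JSON fixture above is superseded by an RDS file generated at test time from a pandas dataframe, as the updated tests below show. A condensed sketch of that pandas-to-R round trip, assuming a recent rpy2 (older releases expose the converter as `pandas2ri.py2ri`, hence the compatibility import in the test module):

```python
import pandas
from rpy2 import robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter, py2rpy

# Toy row keyed by tarball path, mirroring the structure of cran_info_db.rds
df = pandas.DataFrame.from_dict(
    {"/srv/ftp/pub/R/src/contrib/foo_1.0.tar.gz": {"size": 123.0}},
    orient="index",
)

# Convert the pandas dataframe to an R dataframe, then serialize it as RDS
with localconverter(robjects.default_converter + pandas2ri.converter):
    r_df = py2rpy(df)
robjects.r.assign("df", r_df)
robjects.r("saveRDS(df, file='/tmp/toy.rds')")  # placeholder output path
```
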
diff --git a/swh/lister/cran/tests/test_lister.py b/swh/lister/cran/tests/test_lister.py
index 6501a77d5b49f4d28f1670083edbc87e8def01e1..c7f70568d2a3b2a1c1e9335a6c7cb29ae6fc1e44 100644
--- a/swh/lister/cran/tests/test_lister.py
+++ b/swh/lister/cran/tests/test_lister.py
@@ -1,136 +1,178 @@
-# Copyright (C) 2019-2021 The Software Heritage developers
+# Copyright (C) 2019-2023 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-from datetime import datetime, timezone
-import json
 from os import path
 
+import pandas
 import pytest
-
-from swh.lister.cran.lister import (
-    CRAN_MIRROR,
-    CRANLister,
-    compute_origin_urls,
-    parse_packaged_date,
-)
-
-
-def test_cran_compute_origin_urls():
-    pack = "something"
-    vers = "0.0.1"
-    origin_url, artifact_url = compute_origin_urls(
-        {
-            "Package": pack,
-            "Version": vers,
-        }
+from rpy2 import robjects
+from rpy2.robjects import pandas2ri
+from rpy2.robjects.conversion import localconverter
+
+try:
+    from rpy2.robjects.conversion import py2rpy
+except ImportError:
+    # for old rpy2 versions (fix debian buster package build)
+    from rpy2.robjects.pandas2ri import py2ri as py2rpy  # noqa
+
+from swh.lister.cran.lister import CRAN_INFO_DB_URL, CRAN_MIRROR_URL, CRANLister
+
+CRAN_INFO_DB_DATA = {
+    "/srv/ftp/pub/R/src/contrib/Archive/zooimage/zooimage_3.0-3.tar.gz": {
+        "size": 2482446.0,
+        "isdir": False,
+        "mode": 436,
+        "mtime": pandas.Timestamp("2013-02-11 20:03:00.351782400"),
+        "ctime": pandas.Timestamp("2023-08-12 16:02:51.731749120"),
+        "atime": pandas.Timestamp("2023-08-12 18:26:53.976175872"),
+        "uid": 1001,
+        "gid": 1001,
+        "uname": "hornik",
+        "grname": "cranadmin",
+    },
+    "/srv/ftp/pub/R/src/contrib/Archive/zooimage/zooimage_3.0-5.tar.gz": {
+        "size": 2483495.0,
+        "isdir": False,
+        "mode": 436,
+        "mtime": pandas.Timestamp("2014-03-02 12:20:17.842085376"),
+        "ctime": pandas.Timestamp("2023-08-12 16:02:51.731749120"),
+        "atime": pandas.Timestamp("2023-08-12 18:26:53.976175872"),
+        "uid": 1008,
+        "gid": 1001,
+        "uname": "ripley",
+        "grname": "cranadmin",
+    },
+    "/srv/ftp/pub/R/src/contrib/zooimage_5.5.2.tar.gz": {
+        "size": 2980492.0,
+        "isdir": False,
+        "mode": 436,
+        "mtime": pandas.Timestamp("2018-06-29 16:00:29.281795328"),
+        "ctime": pandas.Timestamp("2023-08-12 16:02:52.227744768"),
+        "atime": pandas.Timestamp("2023-08-12 18:13:24.175266560"),
+        "uid": 1010,
+        "gid": 1001,
+        "uname": "ligges",
+        "grname": "cranadmin",
+    },
+    "/srv/ftp/pub/R/src/contrib/Archive/xtune/xtune_0.1.0.tar.gz": {
+        "size": 366098.0,
+        "isdir": False,
+        "mode": 436,
+        "mtime": pandas.Timestamp("2019-05-24 09:00:04.697701120"),
+        "ctime": pandas.Timestamp("2023-08-12 16:02:52.135745536"),
+        "atime": pandas.Timestamp("2023-08-12 18:28:29.483338752"),
+        "uid": 1010,
+        "gid": 1001,
+        "uname": "ligges",
+        "grname": "cranadmin",
+    },
+    "/srv/ftp/pub/R/src/contrib/xtune_2.0.0.tar.gz": {
+        "size": 4141076.0,
+        "isdir": False,
+        "mode": 436,
+        "mtime": pandas.Timestamp("2023-06-18 22:40:04.242652416"),
+        "ctime": pandas.Timestamp("2023-08-12 16:02:52.279744512"),
+        "atime": pandas.Timestamp("2023-08-12 18:12:28.311755776"),
+        "uid": 1010,
+        "gid": 1001,
+        "uname": "ligges",
+        "grname": "cranadmin",
+    },
+    "/srv/ftp/pub/R/src/contrib/Old/0.50/bootstrap.tar.gz": {
+        "size": 16306.0,
+        "isdir": False,
+        "mode": 436,
+        "mtime": pandas.Timestamp("1997-04-16 10:10:36"),
+        "ctime": pandas.Timestamp("2023-08-12 16:02:51.571750400"),
+        "atime": pandas.Timestamp("2023-08-12 18:12:45.115608576"),
+        "uid": 0,
+        "gid": 1001,
+        "uname": "root",
+        "grname": "cranadmin",
+    },
+}
+
+
+@pytest.fixture
+def cran_info_db_rds_path(tmp_path):
+    """Build a sample RDS file with small extract of CRAN database"""
+    df = pandas.DataFrame.from_dict(
+        CRAN_INFO_DB_DATA,
+        orient="index",
     )
-
-    assert origin_url == f"{CRAN_MIRROR}/package={pack}"
-    assert artifact_url == f"{CRAN_MIRROR}/src/contrib/{pack}_{vers}.tar.gz"
-
-
-def test_cran_compute_origin_urls_failure():
-    for incomplete_repo in [{"Version": "0.0.1"}, {"Package": "package"}, {}]:
-        with pytest.raises(KeyError):
-            compute_origin_urls(incomplete_repo)
-
-
-def test_parse_packaged_date():
-    common_date_format = {
-        "Package": "test",
-        "Packaged": "2017-04-26 11:36:15 UTC; Jonathan",
-    }
-    assert parse_packaged_date(common_date_format) == datetime(
-        year=2017, month=4, day=26, hour=11, minute=36, second=15, tzinfo=timezone.utc
-    )
-    common_date_format = {
-        "Package": "test",
-        "Packaged": "2017-04-26 11:36:15.123456 UTC; Jonathan",
-    }
-    assert parse_packaged_date(common_date_format) == datetime(
-        year=2017,
-        month=4,
-        day=26,
-        hour=11,
-        minute=36,
-        second=15,
-        microsecond=123456,
-        tzinfo=timezone.utc,
-    )
-    old_date_format = {
-        "Package": "test",
-        "Packaged": "Thu Mar 30 10:48:35 2006; hornik",
-    }
-    assert parse_packaged_date(old_date_format) == datetime(
-        year=2006, month=3, day=30, hour=10, minute=48, second=35, tzinfo=timezone.utc
-    )
-    invalid_date_format = {
-        "Package": "test",
-        "Packaged": "foo",
-    }
-    assert parse_packaged_date(invalid_date_format) is None
-    missing_date = {
-        "Package": "test",
-    }
-    assert parse_packaged_date(missing_date) is None
+    rds_path = path.join(tmp_path, "cran_info_db.rds")
+    # Convert pandas dataframe to R dataframe
+    with localconverter(robjects.default_converter + pandas2ri.converter):
+        r_df = py2rpy(df)
+    robjects.r.assign("cran_info_db_df", r_df)
+    robjects.r(f"saveRDS(cran_info_db_df, file='{rds_path}')")
+    return rds_path
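
The test below then serves this fixture over HTTP with requests_mock; passing an open file object as `body` makes the mocked response stream its bytes, which matches the lister's `stream=True` download. A minimal sketch with a hypothetical URL and path:

```python
import requests
import requests_mock

with requests_mock.Mocker() as m, open("/tmp/toy.rds", "rb") as f:
    m.get("https://example.org/cran_info_db.rds", body=f)
    response = requests.get("https://example.org/cran_info_db.rds", stream=True)
    rds_bytes = b"".join(response.iter_content(chunk_size=1024))
```
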
 
 
-def test_cran_lister_cran(datadir, swh_scheduler, mocker):
-    with open(path.join(datadir, "list-r-packages.json")) as f:
-        cran_data = json.loads(f.read())
+def test_cran_lister_cran(swh_scheduler, requests_mock, cran_info_db_rds_path):
 
-    lister = CRANLister(swh_scheduler)
+    with open(cran_info_db_rds_path, "rb") as cran_db_rds:
 
-    mock_cran = mocker.patch("swh.lister.cran.lister.read_cran_data")
+        requests_mock.get(CRAN_INFO_DB_URL, body=cran_db_rds)
 
-    mock_cran.return_value = cran_data
+        lister = CRANLister(swh_scheduler)
 
-    stats = lister.run()
+        stats = lister.run()
 
-    assert stats.pages == 1
-    assert stats.origins == len(cran_data)
+        assert stats.pages == 1
+        assert stats.origins == 2
 
-    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
-
-    assert len(scheduler_origins) == len(cran_data)
-
-    for package_info in cran_data:
-        origin_url, artifact_url = compute_origin_urls(package_info)
-
-        filtered_origins = [o for o in scheduler_origins if o.url == origin_url]
-
-        assert len(filtered_origins) == 1
-
-        assert filtered_origins[0].extra_loader_arguments == {
-            "artifacts": [
-                {
-                    "url": artifact_url,
-                    "version": package_info["Version"],
-                    "package": package_info["Package"],
-                    "checksums": {"md5": package_info["MD5sum"]},
-                }
-            ]
+        scheduler_origins = {
+            o.url: o
+            for o in swh_scheduler.get_listed_origins(lister.lister_obj.id).results
         }
-        filtered_origins[0].last_update == parse_packaged_date(package_info)
-
-
-def test_cran_lister_duplicated_origins(datadir, swh_scheduler, mocker):
-    with open(path.join(datadir, "list-r-packages.json")) as f:
-        cran_data = json.loads(f.read())
-
-    lister = CRANLister(swh_scheduler)
-
-    mock_cran = mocker.patch("swh.lister.cran.lister.read_cran_data")
-
-    mock_cran.return_value = cran_data + cran_data
-
-    stats = lister.run()
+        assert set(scheduler_origins.keys()) == {
+            f"{CRAN_MIRROR_URL}/package=zooimage",
+            f"{CRAN_MIRROR_URL}/package=xtune",
+        }
 
-    assert stats.pages == 1
-    assert stats.origins == len(cran_data)
+        assert scheduler_origins[
+            f"{CRAN_MIRROR_URL}/package=zooimage"
+        ].extra_loader_arguments["artifacts"] == [
+            {
+                "url": f"{CRAN_MIRROR_URL}/src/contrib/Archive/zooimage/zooimage_3.0-3.tar.gz",  # noqa
+                "package": "zooimage",
+                "version": "3.0-3",
+                "checksums": {"length": 2482446},
+            },
+            {
+                "url": f"{CRAN_MIRROR_URL}/src/contrib/Archive/zooimage/zooimage_3.0-5.tar.gz",  # noqa
+                "package": "zooimage",
+                "version": "3.0-5",
+                "checksums": {"length": 2483495},
+            },
+            {
+                "url": f"{CRAN_MIRROR_URL}/src/contrib/zooimage_5.5.2.tar.gz",
+                "package": "zooimage",
+                "version": "5.5.2",
+                "checksums": {"length": 2980492},
+            },
+        ]
"version": "3.0-5", + "checksums": {"length": 2483495}, + }, + { + "url": f"{CRAN_MIRROR_URL}/src/contrib/zooimage_5.5.2.tar.gz", + "package": "zooimage", + "version": "5.5.2", + "checksums": {"length": 2980492}, + }, + ] + + assert scheduler_origins[f"{CRAN_MIRROR_URL}/package=xtune"].extra_loader_arguments[ + "artifacts" + ] == [ + { + "url": f"{CRAN_MIRROR_URL}/src/contrib/Archive/xtune/xtune_0.1.0.tar.gz", + "package": "xtune", + "version": "0.1.0", + "checksums": {"length": 366098}, + }, + { + "url": f"{CRAN_MIRROR_URL}/src/contrib/xtune_2.0.0.tar.gz", + "package": "xtune", + "version": "2.0.0", + "checksums": {"length": 4141076}, + }, + ] @pytest.mark.parametrize( @@ -139,7 +181,7 @@ def test_cran_lister_duplicated_origins(datadir, swh_scheduler, mocker): (None, []), ({"key": "value"}, []), ( - {"CRAN": {"cran": [{"username": "user", "password": "pass"}]}}, + {"cran": {"cran": [{"username": "user", "password": "pass"}]}}, [{"username": "user", "password": "pass"}], ), ], @@ -147,6 +189,7 @@ def test_cran_lister_duplicated_origins(datadir, swh_scheduler, mocker): def test_lister_cran_instantiation_with_credentials( credentials, expected_credentials, swh_scheduler ): + lister = CRANLister(swh_scheduler, credentials=credentials) # Credentials are allowed in constructor @@ -154,6 +197,7 @@ def test_lister_cran_instantiation_with_credentials( def test_lister_cran_from_configfile(swh_scheduler_config, mocker): + load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar") load_from_envvar.return_value = { "scheduler": {"cls": "local", **swh_scheduler_config},