Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • anlambert/swh-lister
  • vlorentz/swh-lister
  • KShivendu/swh-lister
  • franckbret/swh-lister
  • lunar/swh-lister
  • ardumont/swh-lister
  • olasd/swh-lister
  • swh/devel/swh-lister
  • douardda/swh-lister
  • charly/swh-lister
  • marmoute/swh-lister
11 results
Show changes
Commits on Source (2)
......@@ -31,6 +31,9 @@ ignore_missing_imports = True
[mypy-pkg_resources.*]
ignore_missing_imports = True
[mypy-pyreadr.*]
ignore_missing_imports = True
[mypy-pytest.*]
ignore_missing_imports = True
......@@ -40,9 +43,6 @@ ignore_missing_imports = True
[mypy-requests_mock.*]
ignore_missing_imports = True
[mypy-rpy2.*]
ignore_missing_imports = True
[mypy-urllib3.util.*]
ignore_missing_imports = True
......
swh.core[db,github] >= 2.22.0
swh.core[db] >= 2.22.0
swh.scheduler >= 1.12.0
......@@ -3,14 +3,14 @@
# See top-level LICENSE file for more information
from collections import defaultdict
from datetime import datetime, timezone
import logging
import os
import tempfile
from typing import Any, Dict, Iterator, List, Optional, Tuple
from urllib.parse import urljoin
from rpy2 import robjects
import iso8601
import pyreadr
from swh.lister.pattern import CredentialsType, StatelessLister
from swh.scheduler.interface import SchedulerInterface
......@@ -64,23 +64,14 @@ class CRANLister(StatelessLister[PageType]):
with tempfile.TemporaryDirectory() as tmpdir:
package_artifacts: Dict[str, Dict[str, Any]] = defaultdict(dict)
dest_path = os.path.join(tmpdir, os.path.basename(CRAN_INFO_DB_URL))
response = self.http_request(CRAN_INFO_DB_URL, stream=True)
with open(dest_path, "wb") as rds_file:
for chunk in response.iter_content(chunk_size=1024):
rds_file.write(chunk)
logger.debug("Fetching %s file to %s", CRAN_INFO_DB_URL, dest_path)
dest_path = pyreadr.download_file(CRAN_INFO_DB_URL, dest_path)
logger.debug("Parsing %s file", dest_path)
robjects.r(f"cran_info_db_df <- readRDS('{dest_path}')")
r_df = robjects.r["cran_info_db_df"]
colnames = list(r_df.colnames)
def _get_col_value(row, colname):
return r_df[colnames.index(colname)][row]
cran_db_df = pyreadr.read_r(dest_path)[None]
logger.debug("Processing CRAN packages")
for i in range(r_df.nrow):
tarball_path = r_df.rownames[i]
for package_artifact_metadata in cran_db_df.itertuples():
tarball_path = package_artifact_metadata[0]
package_info = tarball_path.split("/")[-1].replace(".tar.gz", "")
if "_" not in package_info and "-" not in package_info:
# skip package artifact with no version
......@@ -98,11 +89,9 @@ class CRANLister(StatelessLister[PageType]):
),
"version": package_version,
"package": package_name,
"checksums": {"length": int(_get_col_value(i, "size"))},
"mtime": (
datetime.fromtimestamp(
_get_col_value(i, "mtime"), tz=timezone.utc
)
"checksums": {"length": int(package_artifact_metadata.size)},
"mtime": iso8601.parse_date(
package_artifact_metadata.mtime.isoformat()
),
}
......
......@@ -6,18 +6,10 @@
from os import path
import pandas
import pyreadr
import pytest
from rpy2 import robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
try:
from rpy2.robjects.conversion import py2rpy
except ImportError:
# for old rpy2 versions (fix debian buster package build)
from rpy2.robjects.pandas2ri import py2ri as py2rpy # noqa
from swh.lister.cran.lister import CRAN_INFO_DB_URL, CRAN_MIRROR_URL, CRANLister
from swh.lister.cran.lister import CRAN_MIRROR_URL, CRANLister
CRAN_INFO_DB_DATA = {
"/srv/ftp/pub/R/src/contrib/Archive/zooimage/zooimage_3.0-3.tar.gz": {
......@@ -103,31 +95,46 @@ def cran_info_db_rds_path(tmp_path):
orient="index",
)
rds_path = path.join(tmp_path, "cran_info_db.rds")
# Convert pandas dataframe to R dataframe
with localconverter(robjects.default_converter + pandas2ri.converter):
r_df = py2rpy(df)
robjects.r.assign("cran_info_db_df", r_df)
robjects.r(f"saveRDS(cran_info_db_df, file='{rds_path}')")
pyreadr.write_rds(rds_path, df)
return rds_path
def test_cran_lister_cran(swh_scheduler, requests_mock, cran_info_db_rds_path):
with open(cran_info_db_rds_path, "rb") as cran_db_rds:
requests_mock.get(CRAN_INFO_DB_URL, body=cran_db_rds)
lister = CRANLister(swh_scheduler)
def test_cran_lister_cran(swh_scheduler, mocker, cran_info_db_rds_path):
lister = CRANLister(swh_scheduler)
mock_download_file = mocker.patch("swh.lister.cran.lister.pyreadr.download_file")
mock_download_file.return_value = cran_info_db_rds_path
read_r = pyreadr.read_r
def read_r_restore_data_lost_by_write_r(*args, **kwargs):
result = read_r(*args, **kwargs)
# DataFrame index is lost when calling pyreadr.write_rds so recreate
# the same one as in original cran_info_db.rds file
# https://github.com/ofajardo/pyreadr/issues/68
result[None]["rownames"] = list(CRAN_INFO_DB_DATA.keys())
result[None].set_index("rownames", inplace=True)
# pyreadr.write_rds serializes datetime to string so restore datetime type
# as in original cran_info_db.rds file
for dt_column in ("mtime", "ctime", "atime"):
result[None][dt_column] = pandas.to_datetime(
result[None][dt_column], utc=True
)
return result
mocker.patch(
"swh.lister.cran.lister.pyreadr.read_r",
wraps=read_r_restore_data_lost_by_write_r,
)
stats = lister.run()
stats = lister.run()
assert stats.pages == 1
assert stats.origins == 2
assert stats.pages == 1
assert stats.origins == 2
scheduler_origins = {
o.url: o
for o in swh_scheduler.get_listed_origins(lister.lister_obj.id).results
}
scheduler_origins = {
o.url: o for o in swh_scheduler.get_listed_origins(lister.lister_obj.id).results
}
assert set(scheduler_origins.keys()) == {
f"{CRAN_MIRROR_URL}/package=zooimage",
......@@ -189,7 +196,6 @@ def test_cran_lister_cran(swh_scheduler, requests_mock, cran_info_db_rds_path):
def test_lister_cran_instantiation_with_credentials(
credentials, expected_credentials, swh_scheduler
):
lister = CRANLister(swh_scheduler, credentials=credentials)
# Credentials are allowed in constructor
......@@ -197,7 +203,6 @@ def test_lister_cran_instantiation_with_credentials(
def test_lister_cran_from_configfile(swh_scheduler_config, mocker):
load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar")
load_from_envvar.return_value = {
"scheduler": {"cls": "local", **swh_scheduler_config},
......