From 4aee4da78406be00a41ddff4d0fc10f1edf6b2bf Mon Sep 17 00:00:00 2001
From: Antoine Lambert <anlambert@softwareheritage.org>
Date: Thu, 17 Aug 2023 16:52:39 +0200
Subject: [PATCH] cran: Use pyreadr instead of rpy2 to read an RDS file from
 Python

The CRAN lister improvements introduced in 91e4e33 originally used pyreadr
instead of rpy2 to read an RDS file from Python.

As swh-lister was still packaged for debian at the time, the choice was made
to use rpy2 instead, as a debian package was available for it but not for
pyreadr.

Now that debian packaging has been dropped for swh-lister, we can reinstate the
pyreadr based implementation, which has the advantages of being faster and not
depending on the R language runtime.

Related to swh/meta#1709.
---
 mypy.ini                             |  6 +--
 requirements.txt                     |  2 +-
 swh/lister/cran/lister.py            | 31 +++++--------
 swh/lister/cran/tests/test_lister.py | 67 +++++++++++++++-------------
 4 files changed, 50 insertions(+), 56 deletions(-)

diff --git a/mypy.ini b/mypy.ini
index 67c2c7cf..baa6ce50 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -31,6 +31,9 @@ ignore_missing_imports = True
 [mypy-pkg_resources.*]
 ignore_missing_imports = True
 
+[mypy-pyreadr.*]
+ignore_missing_imports = True
+
 [mypy-pytest.*]
 ignore_missing_imports = True
 
@@ -40,9 +43,6 @@ ignore_missing_imports = True
 [mypy-requests_mock.*]
 ignore_missing_imports = True
 
-[mypy-rpy2.*]
-ignore_missing_imports = True
-
 [mypy-urllib3.util.*]
 ignore_missing_imports = True
 
diff --git a/requirements.txt b/requirements.txt
index bc64858a..6c03204c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,10 +5,10 @@ iso8601
 launchpadlib
 lxml
 psycopg2
+pyreadr
 python_debian
 repomd
 requests
-rpy2
 setuptools
 tenacity >= 6.2
 testing.postgresql
diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py
index 26db72e6..c4ec3fc8 100644
--- a/swh/lister/cran/lister.py
+++ b/swh/lister/cran/lister.py
@@ -3,14 +3,14 @@
 # See top-level LICENSE file for more information
 
 from collections import defaultdict
-from datetime import datetime, timezone
 import logging
 import os
 import tempfile
 from typing import Any, Dict, Iterator, List, Optional, Tuple
 from urllib.parse import urljoin
 
-from rpy2 import robjects
+import iso8601
+import pyreadr
 
 from swh.lister.pattern import CredentialsType, StatelessLister
 from swh.scheduler.interface import SchedulerInterface
@@ -64,23 +64,14 @@ class CRANLister(StatelessLister[PageType]):
         with tempfile.TemporaryDirectory() as tmpdir:
             package_artifacts: Dict[str, Dict[str, Any]] = defaultdict(dict)
             dest_path = os.path.join(tmpdir, os.path.basename(CRAN_INFO_DB_URL))
-
-            response = self.http_request(CRAN_INFO_DB_URL, stream=True)
-            with open(dest_path, "wb") as rds_file:
-                for chunk in response.iter_content(chunk_size=1024):
-                    rds_file.write(chunk)
-
+            logger.debug("Fetching %s file to %s", CRAN_INFO_DB_URL, dest_path)
+            dest_path = pyreadr.download_file(CRAN_INFO_DB_URL, dest_path)
             logger.debug("Parsing %s file", dest_path)
-            robjects.r(f"cran_info_db_df <- readRDS('{dest_path}')")
-            r_df = robjects.r["cran_info_db_df"]
-            colnames = list(r_df.colnames)
-
-            def _get_col_value(row, colname):
-                return r_df[colnames.index(colname)][row]
+            cran_db_df = pyreadr.read_r(dest_path)[None]
 
             logger.debug("Processing CRAN packages")
-            for i in range(r_df.nrow):
-                tarball_path = r_df.rownames[i]
+            for package_artifact_metadata in cran_db_df.itertuples():
+                tarball_path = package_artifact_metadata[0]
                 package_info = tarball_path.split("/")[-1].replace(".tar.gz", "")
                 if "_" not in package_info and "-" not in package_info:
                     # skip package artifact with no version
@@ -98,11 +89,9 @@ class CRANLister(StatelessLister[PageType]):
                     ),
                     "version": package_version,
                     "package": package_name,
-                    "checksums": {"length": int(_get_col_value(i, "size"))},
-                    "mtime": (
-                        datetime.fromtimestamp(
-                            _get_col_value(i, "mtime"), tz=timezone.utc
-                        )
+                    "checksums": {"length": int(package_artifact_metadata.size)},
+                    "mtime": iso8601.parse_date(
+                        package_artifact_metadata.mtime.isoformat()
                     ),
                 }
 
diff --git a/swh/lister/cran/tests/test_lister.py b/swh/lister/cran/tests/test_lister.py
index c7f70568..89f9da54 100644
--- a/swh/lister/cran/tests/test_lister.py
+++ b/swh/lister/cran/tests/test_lister.py
@@ -6,18 +6,10 @@
 from os import path
 
 import pandas
+import pyreadr
 import pytest
-from rpy2 import robjects
-from rpy2.robjects import pandas2ri
-from rpy2.robjects.conversion import localconverter
 
-try:
-    from rpy2.robjects.conversion import py2rpy
-except ImportError:
-    # for old rpy2 versions (fix debian buster package build)
-    from rpy2.robjects.pandas2ri import py2ri as py2rpy  # noqa
-
-from swh.lister.cran.lister import CRAN_INFO_DB_URL, CRAN_MIRROR_URL, CRANLister
+from swh.lister.cran.lister import CRAN_MIRROR_URL, CRANLister
 
 CRAN_INFO_DB_DATA = {
     "/srv/ftp/pub/R/src/contrib/Archive/zooimage/zooimage_3.0-3.tar.gz": {
@@ -103,31 +95,46 @@ def cran_info_db_rds_path(tmp_path):
         orient="index",
     )
     rds_path = path.join(tmp_path, "cran_info_db.rds")
-    # Convert pandas dataframe to R dataframe
-    with localconverter(robjects.default_converter + pandas2ri.converter):
-        r_df = py2rpy(df)
-    robjects.r.assign("cran_info_db_df", r_df)
-    robjects.r(f"saveRDS(cran_info_db_df, file='{rds_path}')")
+    pyreadr.write_rds(rds_path, df)
     return rds_path
 
 
-def test_cran_lister_cran(swh_scheduler, requests_mock, cran_info_db_rds_path):
-
-    with open(cran_info_db_rds_path, "rb") as cran_db_rds:
-
-        requests_mock.get(CRAN_INFO_DB_URL, body=cran_db_rds)
-
-        lister = CRANLister(swh_scheduler)
+def test_cran_lister_cran(swh_scheduler, mocker, cran_info_db_rds_path):
+    lister = CRANLister(swh_scheduler)
+
+    mock_download_file = mocker.patch("swh.lister.cran.lister.pyreadr.download_file")
+    mock_download_file.return_value = cran_info_db_rds_path
+
+    read_r = pyreadr.read_r
+
+    def read_r_restore_data_lost_by_write_r(*args, **kwargs):
+        result = read_r(*args, **kwargs)
+        # DataFrame index is lost when calling pyreadr.write_rds so recreate
+        # the same one as in original cran_info_db.rds file
+        # https://github.com/ofajardo/pyreadr/issues/68
+        result[None]["rownames"] = list(CRAN_INFO_DB_DATA.keys())
+        result[None].set_index("rownames", inplace=True)
+        # pyreadr.write_rds serializes datetime to string so restore datetime type
+        # as in original cran_info_db.rds file
+        for dt_column in ("mtime", "ctime", "atime"):
+            result[None][dt_column] = pandas.to_datetime(
+                result[None][dt_column], utc=True
+            )
+        return result
+
+    mocker.patch(
+        "swh.lister.cran.lister.pyreadr.read_r",
+        wraps=read_r_restore_data_lost_by_write_r,
+    )
 
-        stats = lister.run()
+    stats = lister.run()
 
-        assert stats.pages == 1
-        assert stats.origins == 2
+    assert stats.pages == 1
+    assert stats.origins == 2
 
-        scheduler_origins = {
-            o.url: o
-            for o in swh_scheduler.get_listed_origins(lister.lister_obj.id).results
-        }
+    scheduler_origins = {
+        o.url: o for o in swh_scheduler.get_listed_origins(lister.lister_obj.id).results
+    }
 
     assert set(scheduler_origins.keys()) == {
         f"{CRAN_MIRROR_URL}/package=zooimage",
@@ -189,7 +196,6 @@ def test_cran_lister_cran(swh_scheduler, requests_mock, cran_info_db_rds_path):
 def test_lister_cran_instantiation_with_credentials(
     credentials, expected_credentials, swh_scheduler
 ):
-
     lister = CRANLister(swh_scheduler, credentials=credentials)
 
     # Credentials are allowed in constructor
@@ -197,7 +203,6 @@ def test_lister_cran_instantiation_with_credentials(
 
 
 def test_lister_cran_from_configfile(swh_scheduler_config, mocker):
-
     load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar")
     load_from_envvar.return_value = {
         "scheduler": {"cls": "local", **swh_scheduler_config},
-- 
GitLab