From 1803b707e4ba6e41e84976abfd18ff1d530b7ac7 Mon Sep 17 00:00:00 2001
From: Antoine Lambert <antoine.lambert@inria.fr>
Date: Fri, 5 Feb 2021 12:51:20 +0100
Subject: [PATCH] cran: Prevent multiple listing of an origin

A CRAN package can appear twice in the JSON list returned by the
list_all_packages.R script, with the most recent version of the
package appearing first.

Handle that edge case by listing only the first occurrence of each
origin, to avoid an error when sending origins to the scheduler.
---
 swh/lister/cran/lister.py            |  9 +++++++++
 swh/lister/cran/tests/test_lister.py | 16 ++++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py
index d843c2fd..635a7a65 100644
--- a/swh/lister/cran/lister.py
+++ b/swh/lister/cran/lister.py
@@ -45,9 +45,18 @@ class CRANLister(StatelessLister[PageType]):
 
     def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]:
         assert self.lister_obj.id is not None
+
+        seen_urls = set()
         for package_info in page:
             origin_url, artifact_url = compute_origin_urls(package_info)
 
+            if origin_url in seen_urls:
+                # prevent multiple listing of an origin,
+                # most recent version will be listed first
+                continue
+
+            seen_urls.add(origin_url)
+
             yield ListedOrigin(
                 lister_id=self.lister_obj.id,
                 url=origin_url,
diff --git a/swh/lister/cran/tests/test_lister.py b/swh/lister/cran/tests/test_lister.py
index b8822ec3..f8707d11 100644
--- a/swh/lister/cran/tests/test_lister.py
+++ b/swh/lister/cran/tests/test_lister.py
@@ -105,6 +105,22 @@ def test_cran_lister_cran(datadir, swh_scheduler, mocker):
         filtered_origins[0].last_update == parse_packaged_date(package_info)
 
 
+def test_cran_lister_duplicated_origins(datadir, swh_scheduler, mocker):
+    with open(path.join(datadir, "list-r-packages.json")) as f:
+        cran_data = json.loads(f.read())
+
+    lister = CRANLister(swh_scheduler)
+
+    mock_cran = mocker.patch("swh.lister.cran.lister.read_cran_data")
+
+    mock_cran.return_value = cran_data + cran_data
+
+    stats = lister.run()
+
+    assert stats.pages == 1
+    assert stats.origins == len(cran_data)
+
+
 @pytest.mark.parametrize(
     "credentials, expected_credentials",
     [
-- 
GitLab