From 1803b707e4ba6e41e84976abfd18ff1d530b7ac7 Mon Sep 17 00:00:00 2001 From: Antoine Lambert <antoine.lambert@inria.fr> Date: Fri, 5 Feb 2021 12:51:20 +0100 Subject: [PATCH] cran: Prevent multiple listing of an origin A CRAN package can appear twice in the JSON list returned by the list_all_packages.R script, with the most recent version of the package appearing first. Handle that edge case to avoid an error when sending origins to the scheduler. --- swh/lister/cran/lister.py | 9 +++++++++ swh/lister/cran/tests/test_lister.py | 16 ++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py index d843c2fd..635a7a65 100644 --- a/swh/lister/cran/lister.py +++ b/swh/lister/cran/lister.py @@ -45,9 +45,18 @@ class CRANLister(StatelessLister[PageType]): def get_origins_from_page(self, page: PageType) -> Iterator[ListedOrigin]: assert self.lister_obj.id is not None + + seen_urls = set() for package_info in page: origin_url, artifact_url = compute_origin_urls(package_info) + if origin_url in seen_urls: + # prevent multiple listing of an origin, + # most recent version will be listed first + continue + + seen_urls.add(origin_url) + yield ListedOrigin( lister_id=self.lister_obj.id, url=origin_url, diff --git a/swh/lister/cran/tests/test_lister.py b/swh/lister/cran/tests/test_lister.py index b8822ec3..f8707d11 100644 --- a/swh/lister/cran/tests/test_lister.py +++ b/swh/lister/cran/tests/test_lister.py @@ -105,6 +105,22 @@ def test_cran_lister_cran(datadir, swh_scheduler, mocker): filtered_origins[0].last_update == parse_packaged_date(package_info) +def test_cran_lister_duplicated_origins(datadir, swh_scheduler, mocker): + with open(path.join(datadir, "list-r-packages.json")) as f: + cran_data = json.loads(f.read()) + + lister = CRANLister(swh_scheduler) + + mock_cran = mocker.patch("swh.lister.cran.lister.read_cran_data") + + mock_cran.return_value = cran_data + cran_data + + stats = lister.run() + + assert stats.pages == 1 + 
assert stats.origins == len(cran_data) + + @pytest.mark.parametrize( "credentials, expected_credentials", [ -- GitLab