From b02144b4f9d4184fa9c984fcb10d4db3ae5e6f1c Mon Sep 17 00:00:00 2001
From: "Antoine R. Dumont (@ardumont)" <ardumont@softwareheritage.org>
Date: Thu, 3 Aug 2023 16:12:13 +0200
Subject: [PATCH] packagist: Yield pages of origins to regularly record origins

Yield pages instead of a single page listing all origins, which is brittle:
when something goes wrong during the listing, the lister currently records nothing.
---
 swh/lister/packagist/lister.py | 8 ++++----
 swh/lister/pattern.py          | 1 +
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/swh/lister/packagist/lister.py b/swh/lister/packagist/lister.py
index 98dcf672..99dd9863 100644
--- a/swh/lister/packagist/lister.py
+++ b/swh/lister/packagist/lister.py
@@ -13,6 +13,7 @@ import iso8601
 import requests
 from tenacity import RetryError
 
+from swh.core.utils import grouper
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
@@ -126,12 +127,11 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]):
             return {}
 
     def get_pages(self) -> Iterator[PackagistPageType]:
-        """
-        Yield a single page listing all Packagist projects (randomly).
-        """
+        """Retrieve & randomize unique list of packages into pages of packages."""
         package_names = self.api_request(self.url)["packageNames"]
         shuffle(package_names)
-        yield package_names
+        for page_packages in grouper(package_names, n=self.record_batch_size):
+            yield page_packages
 
     def _get_metadata_from_page(
         self, package_url_format: str, package_name: str
diff --git a/swh/lister/pattern.py b/swh/lister/pattern.py
index ca133358..cc9ded96 100644
--- a/swh/lister/pattern.py
+++ b/swh/lister/pattern.py
@@ -356,6 +356,7 @@ class Lister(Generic[StateType, PageType]):
                 else:
                     logger.warning("Skipping invalid origin: %s", origin.url)
 
+            logger.debug("Record valid %s origins in the scheduler", len(valid_origins))
             ret = self.scheduler.record_listed_origins(valid_origins)
             recorded_origins.extend(origin.url for origin in ret)
 
-- 
GitLab