From b02144b4f9d4184fa9c984fcb10d4db3ae5e6f1c Mon Sep 17 00:00:00 2001 From: "Antoine R. Dumont (@ardumont)" <ardumont@softwareheritage.org> Date: Thu, 3 Aug 2023 16:12:13 +0200 Subject: [PATCH] packagist: Yield pages of origins to regularly record origins Instead of sending one page with all origins listed, which is brittle. When something goes wrong during the listing, the lister currently records nothing. --- swh/lister/packagist/lister.py | 8 ++++---- swh/lister/pattern.py | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/swh/lister/packagist/lister.py b/swh/lister/packagist/lister.py index 98dcf672..99dd9863 100644 --- a/swh/lister/packagist/lister.py +++ b/swh/lister/packagist/lister.py @@ -13,6 +13,7 @@ import iso8601 import requests from tenacity import RetryError +from swh.core.utils import grouper from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import ListedOrigin @@ -126,12 +127,11 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]): return {} def get_pages(self) -> Iterator[PackagistPageType]: - """ - Yield a single page listing all Packagist projects (randomly). - """ + """Retrieve & randomize unique list of packages into pages of packages.""" package_names = self.api_request(self.url)["packageNames"] shuffle(package_names) - yield package_names + for page_packages in grouper(package_names, n=self.record_batch_size): + yield page_packages def _get_metadata_from_page( self, package_url_format: str, package_name: str diff --git a/swh/lister/pattern.py b/swh/lister/pattern.py index ca133358..cc9ded96 100644 --- a/swh/lister/pattern.py +++ b/swh/lister/pattern.py @@ -356,6 +356,7 @@ class Lister(Generic[StateType, PageType]): else: logger.warning("Skipping invalid origin: %s", origin.url) + logger.debug("Record valid %s origins in the scheduler", len(valid_origins)) ret = self.scheduler.record_listed_origins(valid_origins) recorded_origins.extend(origin.url for origin in ret) -- GitLab