Paginate the Packagist package list into pages of `record_batch_size` names and log the size of each batch recorded in the scheduler.
NOTE(review): line breaks and context indentation were reconstructed from a flattened copy of this patch -- confirm against the target tree before applying.

diff --git a/swh/lister/packagist/lister.py b/swh/lister/packagist/lister.py
index 98dcf67237285938be6259a6614ba30e3a3c9abc..99dd986353cb440d9f5987bb372147351461c92d 100644
--- a/swh/lister/packagist/lister.py
+++ b/swh/lister/packagist/lister.py
@@ -13,6 +13,7 @@ import iso8601
 import requests
 from tenacity import RetryError
 
+from swh.core.utils import grouper
 from swh.scheduler.interface import SchedulerInterface
 from swh.scheduler.model import ListedOrigin
 
@@ -126,12 +127,11 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]):
             return {}
 
     def get_pages(self) -> Iterator[PackagistPageType]:
-        """
-        Yield a single page listing all Packagist projects (randomly).
-        """
+        """Retrieve & randomize unique list of packages into pages of packages."""
         package_names = self.api_request(self.url)["packageNames"]
         shuffle(package_names)
-        yield package_names
+        for page_packages in grouper(package_names, n=self.record_batch_size):
+            yield page_packages
 
     def _get_metadata_from_page(
         self, package_url_format: str, package_name: str
diff --git a/swh/lister/pattern.py b/swh/lister/pattern.py
index ca1333582531842f697db7b3d1822accc61ea845..cc9ded9614890d14b45d8a1c2d7e3698b14ad20b 100644
--- a/swh/lister/pattern.py
+++ b/swh/lister/pattern.py
@@ -356,6 +356,7 @@ class Lister(Generic[StateType, PageType]):
                 else:
                     logger.warning("Skipping invalid origin: %s", origin.url)
 
+            logger.debug("Record valid %s origins in the scheduler", len(valid_origins))
             ret = self.scheduler.record_listed_origins(valid_origins)
             recorded_origins.extend(origin.url for origin in ret)
 