From 35871896b22033218a32d667ebad4f794355db5b Mon Sep 17 00:00:00 2001 From: Antoine Lambert <anlambert@softwareheritage.org> Date: Tue, 21 Mar 2023 16:46:59 +0100 Subject: [PATCH] pattern: Improve handling of max_origins_per_page parameter Instead of fully consuming the get_origins_from_page generator into a list and then truncating it, consume the generator one origin at a time and stop as soon as the maximum number of origins per page is reached. Indeed, some non-trivial listers, like the cgit one, can perform costly processing (an HTTP request, for instance) for each origin in a page, so it is better not to consume the whole generator up front, to avoid such side effects. --- swh/lister/pattern.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/swh/lister/pattern.py b/swh/lister/pattern.py index 621b643e..014fdf3e 100644 --- a/swh/lister/pattern.py +++ b/swh/lister/pattern.py @@ -182,17 +182,20 @@ class Lister(Generic[StateType, PageType]): try: for page in self.get_pages(): full_stats.pages += 1 - origins = list(self.get_origins_from_page(page)) - if ( - self.max_origins_per_page - and len(origins) > self.max_origins_per_page - ): - logger.info( - "Max origins per page set, truncated %s page results down to %s", - len(origins), - self.max_origins_per_page, - ) - origins = origins[: self.max_origins_per_page] + origins = [] + for origin in self.get_origins_from_page(page): + origins.append(origin) + if ( + self.max_origins_per_page + and len(origins) == self.max_origins_per_page + ): + logger.info( + "Max origins per page set to %s and reached, " + "aborting page processing", + self.max_origins_per_page, + ) + break + if not self.enable_origins: logger.info( "Disabling origins before sending them to the scheduler" -- GitLab