From 920ed0d529fddbef6d92cea68385b91a3efba774 Mon Sep 17 00:00:00 2001
From: "Antoine R. Dumont (@ardumont)" <ardumont@softwareheritage.org>
Date: Tue, 1 Aug 2023 10:04:48 +0200
Subject: [PATCH] lister.pattern: Restore flushing origin batch in the scheduler

Prior to this commit, the newly introduced check on url validity was
consuming the whole stream of origins. In effect, origin records were no
longer written regularly: for all listers, origins were flushed only at
the end of the listing, which can take a while for some (e.g. the
packagist lister has currently been running for more than 12h without
writing anything to the scheduler).
---
 swh/lister/pattern.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/swh/lister/pattern.py b/swh/lister/pattern.py
index dd3295c6..852ab174 100644
--- a/swh/lister/pattern.py
+++ b/swh/lister/pattern.py
@@ -337,21 +337,23 @@ class Lister(Generic[StateType, PageType]):
         pass
 
     def send_origins(self, origins: Iterable[model.ListedOrigin]) -> List[str]:
-        """Record a list of :class:`model.ListedOrigin` in the scheduler.
+        """Record the stream of valid :class:`model.ListedOrigin` in the scheduler.
+
+        This will filter out invalid urls prior to record origins to the scheduler.
 
         Returns:
           the list of origin URLs recorded in scheduler database
         """
-        valid_origins = []
-        for origin in origins:
-            if is_valid_origin_url(origin.url):
-                valid_origins.append(origin)
-            else:
-                logger.warning("Skipping invalid origin: %s", origin.url)
-
         recorded_origins = []
-        for batch_origins in grouper(valid_origins, n=1000):
-            ret = self.scheduler.record_listed_origins(batch_origins)
+        for origins in grouper(origins, n=1000):
+            valid_origins = []
+            for origin in origins:
+                if is_valid_origin_url(origin.url):
+                    valid_origins.append(origin)
+                else:
+                    logger.warning("Skipping invalid origin: %s", origin.url)
+
+            ret = self.scheduler.record_listed_origins(valid_origins)
             recorded_origins += [origin.url for origin in ret]
 
         return recorded_origins
-- 
GitLab
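
The fix above boils down to filtering inside the batching loop instead of
before it, so that a lazy stream of origins is flushed to the scheduler on
every batch rather than only once the whole listing has completed. Below is
a minimal, self-contained sketch of that pattern; grouper, is_valid and
record_batch are illustrative stand-ins, not the actual swh.lister or
swh.scheduler APIs, and the batch size of 1000 simply mirrors the value used
in the patch.

    from itertools import islice
    from typing import Iterable, Iterator, List, TypeVar

    T = TypeVar("T")


    def grouper(iterable: Iterable[T], n: int) -> Iterator[List[T]]:
        # Stand-in for the real batching helper: yields successive lists
        # of at most n items, consuming the input lazily.
        it = iter(iterable)
        while batch := list(islice(it, n)):
            yield batch


    def is_valid(url: str) -> bool:
        # Hypothetical validity check standing in for is_valid_origin_url().
        return url.startswith(("http://", "https://"))


    def record_batch(urls: List[str]) -> List[str]:
        # Hypothetical sink standing in for scheduler.record_listed_origins();
        # in the real lister this is the write that must happen regularly.
        print(f"flushing {len(urls)} origins")
        return urls


    def send_origins(origins: Iterable[str], batch_size: int = 1000) -> List[str]:
        # Filter *inside* the batching loop, so the (possibly very long)
        # stream of origins is consumed lazily and flushed every batch_size
        # items instead of only at the end of the listing.
        recorded: List[str] = []
        for batch in grouper(origins, batch_size):
            valid = [url for url in batch if is_valid(url)]
            recorded += record_batch(valid)
        return recorded


    if __name__ == "__main__":
        # A generator: nothing is materialized up front, items arrive one by one.
        stream = (f"https://example.org/repo{i}" for i in range(2500))
        send_origins(stream)  # flushes batches of 1000, 1000 and 500

Running the sketch against a generator of 2500 URLs prints three flushes.
The pre-fix behaviour, which built the full valid_origins list before
calling grouper, would instead exhaust the generator first and perform no
write until the listing had finished.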