From 920ed0d529fddbef6d92cea68385b91a3efba774 Mon Sep 17 00:00:00 2001
From: "Antoine R. Dumont (@ardumont)" <ardumont@softwareheritage.org>
Date: Tue, 1 Aug 2023 10:04:48 +0200
Subject: [PATCH] lister.pattern: Restore flushing origin batch in the
 scheduler

Prior to this commit, the newly introduced check on url validity was consuming the
stream of origins. In effect, this would no longer write origin records regularly.

For all listers, that would translate to flush origins only at the end of the listing
which could take a while for some (e.g. packagist lister has been running for more than
12h currently without writing anything in the scheduler).
---
 swh/lister/pattern.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/swh/lister/pattern.py b/swh/lister/pattern.py
index dd3295c6..852ab174 100644
--- a/swh/lister/pattern.py
+++ b/swh/lister/pattern.py
@@ -337,21 +337,23 @@ class Lister(Generic[StateType, PageType]):
         pass
 
     def send_origins(self, origins: Iterable[model.ListedOrigin]) -> List[str]:
-        """Record a list of :class:`model.ListedOrigin` in the scheduler.
+        """Record the stream of valid :class:`model.ListedOrigin` in the scheduler.
+
+        This will filter out invalid urls prior to record origins to the scheduler.
 
         Returns:
           the list of origin URLs recorded in scheduler database
         """
-        valid_origins = []
-        for origin in origins:
-            if is_valid_origin_url(origin.url):
-                valid_origins.append(origin)
-            else:
-                logger.warning("Skipping invalid origin: %s", origin.url)
-
         recorded_origins = []
-        for batch_origins in grouper(valid_origins, n=1000):
-            ret = self.scheduler.record_listed_origins(batch_origins)
+        for origins in grouper(origins, n=1000):
+            valid_origins = []
+            for origin in origins:
+                if is_valid_origin_url(origin.url):
+                    valid_origins.append(origin)
+                else:
+                    logger.warning("Skipping invalid origin: %s", origin.url)
+
+            ret = self.scheduler.record_listed_origins(valid_origins)
             recorded_origins += [origin.url for origin in ret]
 
         return recorded_origins
-- 
GitLab