From eadb704494ff8e409b5d72e7f8043af2be615a02 Mon Sep 17 00:00:00 2001
From: Antoine Lambert <anlambert@softwareheritage.org>
Date: Thu, 24 Oct 2024 12:33:20 +0200
Subject: [PATCH] pattern: Ensure termination date is set at the end of listing
 process

Previously it could be set by any call to the `set_state_in_scheduler`
method.

This was leading to side effects on the save bulk lister while updating
the scheduler state when encountering an invalid or not found origin,
and thus the listing failed.

Fixes #4712.
---
 swh/lister/hackage/tests/test_lister.py   |  4 ++--
 swh/lister/pattern.py                     | 15 ++++++++----
 swh/lister/save_bulk/tests/test_lister.py | 28 +++++++++++++++++++++++
 3 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/swh/lister/hackage/tests/test_lister.py b/swh/lister/hackage/tests/test_lister.py
index f44e9add..48224f0f 100644
--- a/swh/lister/hackage/tests/test_lister.py
+++ b/swh/lister/hackage/tests/test_lister.py
@@ -133,7 +133,7 @@ def test_hackage_lister_incremental(swh_scheduler, requests_mock, datadir):
     lister = HackageLister(scheduler=swh_scheduler)
     # force lister.last_listing_date to not being 'now'
     lister.state.last_listing_date = iso8601.parse_date("2022-08-26T02:27:45.073759Z")
-    lister.set_state_in_scheduler(force=True)
+    lister.set_state_in_scheduler(force_state=True)
     assert lister.get_state_from_scheduler() == HackageListerState(
         last_listing_date=iso8601.parse_date("2022-08-26T02:27:45.073759Z")
     )
@@ -157,7 +157,7 @@ def test_hackage_lister_incremental(swh_scheduler, requests_mock, datadir):
     lister.state.last_listing_date = iso8601.parse_date(
         "2022-09-30T08:00:34.348551203Z"
     )
-    lister.set_state_in_scheduler(force=True)
+    lister.set_state_in_scheduler(force_state=True)
     assert lister.get_state_from_scheduler() == HackageListerState(
         last_listing_date=iso8601.parse_date("2022-09-30T08:00:34.348551203Z")
     )
diff --git a/swh/lister/pattern.py b/swh/lister/pattern.py
index 61ed8d7c..ec697e80 100644
--- a/swh/lister/pattern.py
+++ b/swh/lister/pattern.py
@@ -251,7 +251,7 @@ class Lister(Generic[StateType, PageType]):
                     break
         finally:
             self.finalize()
-            self.set_state_in_scheduler()
+            self.set_state_in_scheduler(with_listing_finished_date=True)
 
         return full_stats
 
@@ -271,21 +271,26 @@ class Lister(Generic[StateType, PageType]):
         )
         return self.state_from_dict(copy.deepcopy(self.lister_obj.current_state))
 
-    def set_state_in_scheduler(self, force: bool = False) -> None:
+    def set_state_in_scheduler(
+        self, with_listing_finished_date: bool = False, force_state: bool = False
+    ) -> None:
         """Update the state in the scheduler backend from the state of the current
         instance.
 
         Args:
-            force: Update lister state even when lister has ``updated`` attribute
+            with_listing_finished_date: Update the ``last_listing_finished_at`` column
+                value for the lister in scheduler database if set to const:`True`.
+            force_state: Update lister state even when lister has ``updated`` attribute
                 set to :const:`False`, this is useful for tests
 
         Raises:
           swh.scheduler.exc.StaleData: in case of a race condition between
             concurrent listers (from :meth:`swh.scheduler.Scheduler.update_lister`).
         """
-        if self.updated or force:
+        if self.updated or force_state:
             self.lister_obj.current_state = self.state_to_dict(self.state)
-        self.lister_obj.last_listing_finished_at = utcnow()
+        if with_listing_finished_date:
+            self.lister_obj.last_listing_finished_at = utcnow()
         self.lister_obj = self.scheduler.update_lister(self.lister_obj)
 
     # State management to/from the scheduler
diff --git a/swh/lister/save_bulk/tests/test_lister.py b/swh/lister/save_bulk/tests/test_lister.py
index b3365965..5b3a8e6d 100644
--- a/swh/lister/save_bulk/tests/test_lister.py
+++ b/swh/lister/save_bulk/tests/test_lister.py
@@ -133,6 +133,9 @@ def test_bulk_lister_not_found_origins(swh_scheduler, requests_mock, mocker):
         scheduler=swh_scheduler,
         per_page=PER_PAGE,
     )
+
+    set_state_in_scheduler = mocker.spy(lister_bulk, "set_state_in_scheduler")
+
     stats = lister_bulk.run()
 
     assert stats == ListerStats(pages=len(SUBMITTED_ORIGINS) // PER_PAGE, origins=0)
@@ -162,6 +165,12 @@ def test_bulk_lister_not_found_origins(swh_scheduler, requests_mock, mocker):
         )
     )
 
+    # check scheduler state is updated at each not found origin
+    # plus at the end of the listing process to set the termination date
+    expected_calls = [mocker.call()] * len(SUBMITTED_ORIGINS)
+    expected_calls.append(mocker.call(with_listing_finished_date=True))
+    assert set_state_in_scheduler.mock_calls == expected_calls
+
 
 def test_bulk_lister_connection_errors(swh_scheduler, requests_mock, mocker):
     requests_mock.head(
@@ -178,6 +187,9 @@ def test_bulk_lister_connection_errors(swh_scheduler, requests_mock, mocker):
         scheduler=swh_scheduler,
         per_page=PER_PAGE,
     )
+
+    set_state_in_scheduler = mocker.spy(lister_bulk, "set_state_in_scheduler")
+
     stats = lister_bulk.run()
 
     assert stats == ListerStats(pages=len(SUBMITTED_ORIGINS) // PER_PAGE, origins=0)
@@ -207,6 +219,12 @@ def test_bulk_lister_connection_errors(swh_scheduler, requests_mock, mocker):
         )
     )
 
+    # check scheduler state is updated at each origin connection error
+    # plus at the end of the listing process to set the termination date
+    expected_calls = [mocker.call()] * len(SUBMITTED_ORIGINS)
+    expected_calls.append(mocker.call(with_listing_finished_date=True))
+    assert set_state_in_scheduler.mock_calls == expected_calls
+
 
 def test_bulk_lister_invalid_origins(swh_scheduler, requests_mock, mocker):
     requests_mock.head(re.compile(".*"), status_code=200)
@@ -236,6 +254,9 @@ def test_bulk_lister_invalid_origins(swh_scheduler, requests_mock, mocker):
         scheduler=swh_scheduler,
         per_page=PER_PAGE,
     )
+
+    set_state_in_scheduler = mocker.spy(lister_bulk, "set_state_in_scheduler")
+
     stats = lister_bulk.run()
 
     assert stats == ListerStats(pages=len(SUBMITTED_ORIGINS) // PER_PAGE, origins=1)
@@ -264,3 +285,10 @@ def test_bulk_lister_invalid_origins(swh_scheduler, requests_mock, mocker):
             key=attrgetter("origin_url"),
         )
     )
+
+    # check scheduler state is updated at each invalid origin
+    # plus at the end of the listing process to set the termination date
+    expected_calls = [mocker.call()] * (len(SUBMITTED_ORIGINS) - 1)
+    expected_calls.append(mocker.call(with_listing_finished_date=True))
+
+    assert set_state_in_scheduler.mock_calls == expected_calls
-- 
GitLab