From 7609ebf7e17488bad2022961609527ed07d91f9c Mon Sep 17 00:00:00 2001
From: Antoine Lambert <anlambert@softwareheritage.org>
Date: Wed, 9 Oct 2024 18:32:23 +0200
Subject: [PATCH] pattern: Store termination date to scheduler database at end
 of listing

This enables tracking the last lister execution date, which will be used to
schedule first visits with high priority for listed origins.

Related to swh/devel/swh-scheduler#4687.
---
 requirements-swh.txt                    |  2 +-
 swh/lister/hackage/tests/test_lister.py |  6 +++---
 swh/lister/pattern.py                   | 22 +++++++++++++++-------
 swh/lister/save_bulk/lister.py          |  1 +
 swh/lister/tests/test_pattern.py        | 17 +++++++++++++++--
 5 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/requirements-swh.txt b/requirements-swh.txt
index 85d12d42..211d36cc 100644
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,2 +1,2 @@
 swh.core[db] >= 3.4.0
-swh.scheduler >= 2.4.0
+swh.scheduler >= 2.5.0
diff --git a/swh/lister/hackage/tests/test_lister.py b/swh/lister/hackage/tests/test_lister.py
index 80d4c494..f44e9add 100644
--- a/swh/lister/hackage/tests/test_lister.py
+++ b/swh/lister/hackage/tests/test_lister.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2022  The Software Heritage developers
+# Copyright (C) 2022-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -133,7 +133,7 @@ def test_hackage_lister_incremental(swh_scheduler, requests_mock, datadir):
     lister = HackageLister(scheduler=swh_scheduler)
     # force lister.last_listing_date to not being 'now'
     lister.state.last_listing_date = iso8601.parse_date("2022-08-26T02:27:45.073759Z")
-    lister.set_state_in_scheduler()
+    lister.set_state_in_scheduler(force=True)
     assert lister.get_state_from_scheduler() == HackageListerState(
         last_listing_date=iso8601.parse_date("2022-08-26T02:27:45.073759Z")
     )
@@ -157,7 +157,7 @@ def test_hackage_lister_incremental(swh_scheduler, requests_mock, datadir):
     lister.state.last_listing_date = iso8601.parse_date(
         "2022-09-30T08:00:34.348551203Z"
     )
-    lister.set_state_in_scheduler()
+    lister.set_state_in_scheduler(force=True)
     assert lister.get_state_from_scheduler() == HackageListerState(
         last_listing_date=iso8601.parse_date("2022-09-30T08:00:34.348551203Z")
     )
diff --git a/swh/lister/pattern.py b/swh/lister/pattern.py
index cc9ded96..1b149d55 100644
--- a/swh/lister/pattern.py
+++ b/swh/lister/pattern.py
@@ -1,10 +1,11 @@
-# Copyright (C) 2020-2023  The Software Heritage developers
+# Copyright (C) 2020-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from __future__ import annotations
 
+import copy
 from dataclasses import dataclass
 import logging
 from typing import Any, Dict, Generic, Iterable, Iterator, List, Optional, Set, TypeVar
@@ -20,6 +21,7 @@ from swh.core.retry import http_retry
 from swh.core.utils import grouper
 from swh.scheduler import get_scheduler, model
 from swh.scheduler.interface import SchedulerInterface
+from swh.scheduler.utils import utcnow
 
 from . import USER_AGENT_TEMPLATE
 from .utils import is_valid_origin_url
@@ -247,8 +249,7 @@ class Lister(Generic[StateType, PageType]):
                     break
         finally:
             self.finalize()
-            if self.updated:
-                self.set_state_in_scheduler()
+            self.set_state_in_scheduler()
 
         return full_stats
 
@@ -262,19 +263,26 @@ class Lister(Generic[StateType, PageType]):
           the state retrieved from the scheduler backend
         """
         self.lister_obj = self.scheduler.get_or_create_lister(
-            name=self.LISTER_NAME, instance_name=self.instance
+            name=self.LISTER_NAME,
+            instance_name=self.instance,
         )
-        return self.state_from_dict(self.lister_obj.current_state)
+        return self.state_from_dict(copy.deepcopy(self.lister_obj.current_state))
 
-    def set_state_in_scheduler(self) -> None:
+    def set_state_in_scheduler(self, force: bool = False) -> None:
         """Update the state in the scheduler backend from the state of the current
         instance.
 
+        Args:
+            force: Update lister state even when lister has ``updated`` attribute
+                set to :const:`False`, this is useful for tests
+
         Raises:
           swh.scheduler.exc.StaleData: in case of a race condition between
             concurrent listers (from :meth:`swh.scheduler.Scheduler.update_lister`).
         """
-        self.lister_obj.current_state = self.state_to_dict(self.state)
+        if self.updated or force:
+            self.lister_obj.current_state = self.state_to_dict(self.state)
+        self.lister_obj.last_listing_finished_at = utcnow()
         self.lister_obj = self.scheduler.update_lister(self.lister_obj)
 
     # State management to/from the scheduler
diff --git a/swh/lister/save_bulk/lister.py b/swh/lister/save_bulk/lister.py
index 4b0e2f53..65b3fbe4 100644
--- a/swh/lister/save_bulk/lister.py
+++ b/swh/lister/save_bulk/lister.py
@@ -413,4 +413,5 @@ class SaveBulkLister(Lister[SaveBulkListerState, SaveBulkListerPage]):
                 # update scheduler state at each rejected origin to get feedback
                 # using Web API before end of listing
                 self.state.rejected_origins = list(self.rejected_origins)
+                self.updated = True
                 self.set_state_in_scheduler()
diff --git a/swh/lister/tests/test_pattern.py b/swh/lister/tests/test_pattern.py
index 88fd2b3e..9abb8f70 100644
--- a/swh/lister/tests/test_pattern.py
+++ b/swh/lister/tests/test_pattern.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020-2021  The Software Heritage developers
+# Copyright (C) 2020-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -151,21 +151,34 @@ def test_run(swh_scheduler):
 
     update_date = lister.lister_obj.updated
 
+    assert lister.lister_obj.last_listing_finished_at is None
+
     run_result = lister.run()
 
     assert run_result.pages == 2
     assert run_result.origins == 20
 
     stored_lister = swh_scheduler.get_or_create_lister(
-        name="test-pattern-lister", instance_name="example.com"
+        name=lister.lister_obj.name, instance_name=lister.lister_obj.instance_name
     )
 
     # Check that the finalize operation happened
     assert stored_lister.updated > update_date
     assert stored_lister.current_state["updated"] == "yes"
+    assert stored_lister.last_listing_finished_at is not None
+
+    last_listing_finished_at = stored_lister.last_listing_finished_at
 
     check_listed_origins(swh_scheduler, lister, stored_lister)
 
+    lister.run()
+
+    stored_lister = swh_scheduler.get_or_create_lister(
+        name=lister.lister_obj.name, instance_name=lister.lister_obj.instance_name
+    )
+
+    assert stored_lister.last_listing_finished_at > last_listing_finished_at
+
 
 class InstantiableStatelessLister(pattern.StatelessLister[PageType]):
     LISTER_NAME = "test-stateless-lister"
-- 
GitLab