From 7609ebf7e17488bad2022961609527ed07d91f9c Mon Sep 17 00:00:00 2001 From: Antoine Lambert <anlambert@softwareheritage.org> Date: Wed, 9 Oct 2024 18:32:23 +0200 Subject: [PATCH] pattern: Store termination date to scheduler database at end of listing It enables to track last lister execution date and will be used to schedule first visits with high priority for listed origins. Related to swh/devel/swh-scheduler#4687. --- requirements-swh.txt | 2 +- swh/lister/hackage/tests/test_lister.py | 6 +++--- swh/lister/pattern.py | 22 +++++++++++++++------- swh/lister/save_bulk/lister.py | 1 + swh/lister/tests/test_pattern.py | 17 +++++++++++++++-- 5 files changed, 35 insertions(+), 13 deletions(-) diff --git a/requirements-swh.txt b/requirements-swh.txt index 85d12d42..211d36cc 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,2 +1,2 @@ swh.core[db] >= 3.4.0 -swh.scheduler >= 2.4.0 +swh.scheduler >= 2.5.0 diff --git a/swh/lister/hackage/tests/test_lister.py b/swh/lister/hackage/tests/test_lister.py index 80d4c494..f44e9add 100644 --- a/swh/lister/hackage/tests/test_lister.py +++ b/swh/lister/hackage/tests/test_lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -133,7 +133,7 @@ def test_hackage_lister_incremental(swh_scheduler, requests_mock, datadir): lister = HackageLister(scheduler=swh_scheduler) # force lister.last_listing_date to not being 'now' lister.state.last_listing_date = iso8601.parse_date("2022-08-26T02:27:45.073759Z") - lister.set_state_in_scheduler() + lister.set_state_in_scheduler(force=True) assert lister.get_state_from_scheduler() == HackageListerState( last_listing_date=iso8601.parse_date("2022-08-26T02:27:45.073759Z") ) @@ -157,7 +157,7 @@ def test_hackage_lister_incremental(swh_scheduler, requests_mock, datadir): lister.state.last_listing_date = iso8601.parse_date( "2022-09-30T08:00:34.348551203Z" ) - lister.set_state_in_scheduler() + lister.set_state_in_scheduler(force=True) assert lister.get_state_from_scheduler() == HackageListerState( last_listing_date=iso8601.parse_date("2022-09-30T08:00:34.348551203Z") ) diff --git a/swh/lister/pattern.py b/swh/lister/pattern.py index cc9ded96..1b149d55 100644 --- a/swh/lister/pattern.py +++ b/swh/lister/pattern.py @@ -1,10 +1,11 @@ -# Copyright (C) 2020-2023 The Software Heritage developers +# Copyright (C) 2020-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from __future__ import annotations +import copy from dataclasses import dataclass import logging from typing import Any, Dict, Generic, Iterable, Iterator, List, Optional, Set, TypeVar @@ -20,6 +21,7 @@ from swh.core.retry import http_retry from swh.core.utils import grouper from swh.scheduler import get_scheduler, model from swh.scheduler.interface import SchedulerInterface +from swh.scheduler.utils import utcnow from . import USER_AGENT_TEMPLATE from .utils import is_valid_origin_url @@ -247,8 +249,7 @@ class Lister(Generic[StateType, PageType]): break finally: self.finalize() - if self.updated: - self.set_state_in_scheduler() + self.set_state_in_scheduler() return full_stats @@ -262,19 +263,26 @@ class Lister(Generic[StateType, PageType]): the state retrieved from the scheduler backend """ self.lister_obj = self.scheduler.get_or_create_lister( - name=self.LISTER_NAME, instance_name=self.instance + name=self.LISTER_NAME, + instance_name=self.instance, ) - return self.state_from_dict(self.lister_obj.current_state) + return self.state_from_dict(copy.deepcopy(self.lister_obj.current_state)) - def set_state_in_scheduler(self) -> None: + def set_state_in_scheduler(self, force: bool = False) -> None: """Update the state in the scheduler backend from the state of the current instance. + Args: + force: Update lister state even when lister has ``updated`` attribute + set to :const:`False`, this is useful for tests + Raises: swh.scheduler.exc.StaleData: in case of a race condition between concurrent listers (from :meth:`swh.scheduler.Scheduler.update_lister`). """ - self.lister_obj.current_state = self.state_to_dict(self.state) + if self.updated or force: + self.lister_obj.current_state = self.state_to_dict(self.state) + self.lister_obj.last_listing_finished_at = utcnow() self.lister_obj = self.scheduler.update_lister(self.lister_obj) # State management to/from the scheduler diff --git a/swh/lister/save_bulk/lister.py b/swh/lister/save_bulk/lister.py index 4b0e2f53..65b3fbe4 100644 --- a/swh/lister/save_bulk/lister.py +++ b/swh/lister/save_bulk/lister.py @@ -413,4 +413,5 @@ class SaveBulkLister(Lister[SaveBulkListerState, SaveBulkListerPage]): # update scheduler state at each rejected origin to get feedback # using Web API before end of listing self.state.rejected_origins = list(self.rejected_origins) + self.updated = True self.set_state_in_scheduler() diff --git a/swh/lister/tests/test_pattern.py b/swh/lister/tests/test_pattern.py index 88fd2b3e..9abb8f70 100644 --- a/swh/lister/tests/test_pattern.py +++ b/swh/lister/tests/test_pattern.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2021 The Software Heritage developers +# Copyright (C) 2020-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -151,21 +151,34 @@ def test_run(swh_scheduler): update_date = lister.lister_obj.updated + assert lister.lister_obj.last_listing_finished_at is None + run_result = lister.run() assert run_result.pages == 2 assert run_result.origins == 20 stored_lister = swh_scheduler.get_or_create_lister( - name="test-pattern-lister", instance_name="example.com" + name=lister.lister_obj.name, instance_name=lister.lister_obj.instance_name ) # Check that the finalize operation happened assert stored_lister.updated > update_date assert stored_lister.current_state["updated"] == "yes" + assert stored_lister.last_listing_finished_at is not None + + last_listing_finished_at = stored_lister.last_listing_finished_at check_listed_origins(swh_scheduler, lister, stored_lister) + lister.run() + + stored_lister = swh_scheduler.get_or_create_lister( + name=lister.lister_obj.name, instance_name=lister.lister_obj.instance_name + ) + + assert stored_lister.last_listing_finished_at > last_listing_finished_at + class InstantiableStatelessLister(pattern.StatelessLister[PageType]): LISTER_NAME = "test-stateless-lister" -- GitLab