Skip to content
Snippets Groups Projects
Verified Commit 01ffb31f authored by Antoine R. Dumont's avatar Antoine R. Dumont
Browse files

Schedule save code now as recurring origins to ingest when successful

So that users only need to submit a save code now request once, an origin whose
first ingestion succeeds is also marked as a recurring origin to crawl.

Implementation-wise, this is handled by the scheduled routine that already refreshes
the save code now statuses reported in the save code now UI.

Related to T1524
parent 8b4e77ca
No related branches found
Tags v0.0.313
No related merge requests found
pytest_plugins = ["swh.auth.pytest_plugin"]
pytest_plugins = ["swh.auth.pytest_plugin", "swh.scheduler.pytest_plugin"]
......@@ -9,6 +9,7 @@ pytest-mock
requests-mock != 1.9.0, != 1.9.1
swh.core[http] >= 0.0.95
swh.loader.git >= 0.8.0
swh-scheduler[testing] >= 0.5.0
swh.storage >= 0.1.1
types-docutils
types-pyyaml
......
......@@ -3,16 +3,57 @@
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
from typing import Set, Tuple

from django.core.management.base import BaseCommand

from swh.scheduler.model import ListedOrigin
from swh.web.common.models import VISIT_STATUS_FULL, VISIT_STATUS_PARTIAL
from swh.web.common.origin_save import refresh_save_origin_request_statuses
from swh.web.config import get_config
from swh.web.config import scheduler as get_scheduler
class Command(BaseCommand):
    """Management command refreshing the save code now request statuses."""

    help = "Refresh save code now origin request statuses periodically"

    def handle(self, *args, **options):
        """Refresh origin save code now requests.

        For the origin visit types, svn, git, hg, this also installs the origins as
        recurring origins to visit.

        """
        refreshed_statuses = refresh_save_origin_request_statuses()
        scheduler = get_scheduler()

        # then schedule the origins with meaningful status and type to be ingested
        # regularly
        lister = scheduler.get_or_create_lister(
            name="save-code-now", instance_name=get_config()["instance_name"]
        )

        # (visit type, origin url) pairs already queued within this batch
        origins: Set[Tuple[str, str]] = set()
        listed_origins = []
        for status in refreshed_statuses:
            visit_type = status["visit_type"]
            # only deal with git, svn, hg visit types
            if visit_type == "archives":
                continue
            # only keep satisfying visit statuses
            if status["visit_status"] not in (VISIT_STATUS_PARTIAL, VISIT_STATUS_FULL):
                continue
            origin = status["origin_url"]
            # drop duplicates within the same batch
            if (visit_type, origin) in origins:
                continue
            origins.add((visit_type, origin))
            listed_origins.append(
                ListedOrigin(lister_id=lister.id, visit_type=visit_type, url=origin)
            )

        # nothing is recorded when no status qualified for recurring visits
        if listed_origins:
            scheduler.record_listed_origins(listed_origins)

        if len(refreshed_statuses) > 0:
            msg = f"Successfully updated {len(refreshed_statuses)} save request(s)."
......
......@@ -131,9 +131,10 @@ DEFAULT_CONFIG = {
"metadata_search_backend": ("string", "swh-indexer-storage"), # or "swh-search"
"counters_backend": ("string", "swh-storage"), # or "swh-counters"
"staging_server_names": ("list", STAGING_SERVER_NAMES),
"instance_name": ("str", "archive-test.softwareheritage.org"),
}
# Module-level configuration mapping, initially empty.
swhweb_config: Dict[str, Any] = {}
def get_config(config_file="web/web"):
......
......@@ -3,33 +3,175 @@
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timedelta, timezone
from io import StringIO
import pytest
from django.core.management import call_command
from swh.core.api.classes import stream_results
from swh.web.common.models import (
SAVE_REQUEST_ACCEPTED,
SAVE_TASK_FAILED,
SAVE_TASK_SCHEDULED,
SAVE_TASK_SUCCEEDED,
VISIT_STATUS_FAILED,
VISIT_STATUS_FULL,
VISIT_STATUS_PARTIAL,
)
from swh.web.common.typing import SaveOriginRequestInfo
from swh.web.config import get_config
MODULE_FQDN = "swh.web.common.management.commands"
COMMAND_NAME = "refresh_savecodenow_statuses"
AUTHORIZED_ORIGIN_URL = "https://scm.ourproject.org/anonscm/%s"
@pytest.fixture
def mock_refresh(mocker):
    """Patch the status refresh routine looked up by the command module.

    Returns the mock standing in for `refresh_save_origin_request_statuses`.

    """
    target = f"{MODULE_FQDN}.{COMMAND_NAME}.refresh_save_origin_request_statuses"
    refresh_mock = mocker.patch(target)
    return refresh_mock
@pytest.fixture
def mock_scheduler(mocker, swh_scheduler):
    """Patch the command module's `get_scheduler` so it hands back `swh_scheduler`.

    Returns the mock replacing `get_scheduler`, so tests can check it was called.

    """
    target = f"{MODULE_FQDN}.{COMMAND_NAME}.get_scheduler"
    # configure the return value at patch time rather than afterwards
    return mocker.patch(target, return_value=swh_scheduler)
@pytest.mark.parametrize("nb_results", [0, 10, 20])
def test_command_refresh__with_statuses_refreshed(
    mock_scheduler, mock_refresh, nb_results
):
    """Refresh status command reports non-terminal statuses updates.

    The refreshed entries all have the 'archives' visit type, so the command output is
    the only thing checked here besides the mocks being called.

    """
    # fake returned refreshed status for 'archives' visit type
    mock_refresh.return_value = [{"visit_type": "archives"}] * nb_results

    out = StringIO()
    call_command(COMMAND_NAME, stdout=out)

    actual_output = out.getvalue()
    if nb_results > 0:
        assert f"updated {nb_results}" in actual_output
    else:
        assert "Nothing" in actual_output

    assert mock_scheduler.called
    assert mock_refresh.called
@pytest.fixture
def fake_refreshed_data():
    """Build the list of refreshed save request entries used by the command tests.

    The entries mix visit types and visit statuses, and end with three identical 'git'
    entries sharing a single origin url.

    """
    shared_origin_url = AUTHORIZED_ORIGIN_URL % "specific-origin"

    distinct_entries = [
        # ignored from recurring task scheduling
        {
            "visit_type": "archives",
            "visit_status": VISIT_STATUS_FULL,
            "task_status": SAVE_TASK_SUCCEEDED,
        },
        # scheduled as recurring task
        {
            "visit_type": "hg",
            "visit_status": VISIT_STATUS_PARTIAL,
            "task_status": SAVE_TASK_SUCCEEDED,
        },
        # scheduled as recurring task
        {
            "visit_type": "svn",
            "visit_status": VISIT_STATUS_PARTIAL,
            "task_status": SAVE_TASK_SCHEDULED,
        },
        # ignored from recurring task scheduling
        {
            "visit_type": "svn",
            "visit_status": VISIT_STATUS_FAILED,
            "task_status": SAVE_TASK_FAILED,
        },
        # ignored from recurring task scheduling
        {
            "visit_type": "hg",
            "visit_status": "created",
            "task_status": SAVE_TASK_SCHEDULED,
        },
    ]
    # only 1 of the origin duplicates will be scheduled as recurring task
    duplicated_entries = [
        {
            "visit_type": "git",
            "visit_status": VISIT_STATUS_FULL,
            "task_status": SAVE_TASK_SUCCEEDED,
            "origin": shared_origin_url,
        }
    ] * 3
    entries = distinct_entries + duplicated_entries

    # dates start len(entries) days in the past and advance one day per entry
    start_date = datetime.now(tz=timezone.utc) - timedelta(days=len(entries))

    refreshed = []
    for index, meta in enumerate(entries):
        if "origin" in meta:
            origin_url = meta["origin"]
        else:
            origin_url = AUTHORIZED_ORIGIN_URL % index
        refreshed.append(
            SaveOriginRequestInfo(
                visit_type=meta["visit_type"],
                visit_status=meta["visit_status"],
                origin_url=origin_url,
                save_request_date=start_date + timedelta(days=index - 1),
                save_request_status=SAVE_REQUEST_ACCEPTED,
                visit_date=start_date + timedelta(days=index),
                save_task_status=meta["task_status"],
                id=index,
                loading_task_id=index,
            )
        )
    return refreshed
def test_command_refresh__with_recurrent_tasks_scheduling(
    mock_scheduler, mock_refresh, fake_refreshed_data, swh_scheduler
):
    """Refresh status command reports updates of statuses. The successful ones whose
    visit type is not 'archives' are also scheduled as recurring origins.

    """
    mock_refresh.return_value = fake_refreshed_data

    # only visit types (git, hg, svn) with status (full, partial) are taken into
    # account for scheduling, and a duplicated (visit type, origin) pair counts once,
    # so only 3 entries of the fake data set match.
    expected_origins = {
        (entry["visit_type"], entry["origin_url"])
        for entry in fake_refreshed_data
        if entry["visit_type"] != "archives"
        and entry["visit_status"] in ("partial", "full")
    }
    expected_nb_scheduled = len(expected_origins)
    assert expected_nb_scheduled == 3

    out = StringIO()
    call_command(COMMAND_NAME, stdout=out)

    actual_output = out.getvalue()
    assert f"Successfully updated {len(fake_refreshed_data)}" in actual_output

    # the command records its origins under the "save-code-now" lister
    lister = swh_scheduler.get_or_create_lister(
        name="save-code-now", instance_name=get_config()["instance_name"]
    )

    result = list(stream_results(swh_scheduler.get_listed_origins, lister.id))
    assert len(result) == expected_nb_scheduled

    assert mock_scheduler.called
    assert mock_refresh.called
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment