From a62003397d6eb1d814aebf9816e5e0495337b2bf Mon Sep 17 00:00:00 2001
From: Vincent SELLIER <vincent.sellier@softwareheritage.org>
Date: Tue, 12 Jan 2021 12:13:07 +0100
Subject: [PATCH] Add a new origin visit info model object and related backend
 api

Upsert and Read methods

Related to T2443
---
 sql/updates/19.sql                     | 18 +++++++
 swh/scheduler/backend.py               | 58 ++++++++++++++++++++++-
 swh/scheduler/interface.py             | 16 ++++++-
 swh/scheduler/model.py                 | 40 +++++++++++++++-
 swh/scheduler/sql/30-schema.sql        | 18 ++++++-
 swh/scheduler/tests/test_api_client.py |  2 +
 swh/scheduler/tests/test_scheduler.py  | 65 +++++++++++++++++++++++++-
 7 files changed, 211 insertions(+), 6 deletions(-)
 create mode 100644 sql/updates/19.sql

diff --git a/sql/updates/19.sql b/sql/updates/19.sql
new file mode 100644
index 00000000..bc8f7e2d
--- /dev/null
+++ b/sql/updates/19.sql
@@ -0,0 +1,18 @@
+insert into dbversion (version, release, description)
+       values (19, now(), 'Work In Progress');
+
+create table origin_visit_stats (
+  url text not null,
+  visit_type text not null,
+  last_eventful timestamptz,
+  last_uneventful timestamptz,
+  last_failed timestamptz,
+  -- one stats row per (origin URL, visit type) pair
+  primary key (url, visit_type)
+);
+
+comment on column origin_visit_stats.url is 'Origin URL';
+comment on column origin_visit_stats.visit_type is 'Type of the visit for the given url';
+comment on column origin_visit_stats.last_eventful is 'Date of the last eventful event';
+comment on column origin_visit_stats.last_uneventful is 'Date of the last uneventful event';
+comment on column origin_visit_stats.last_failed is 'Date of the last failed event';
diff --git a/swh/scheduler/backend.py b/swh/scheduler/backend.py
index d5b70583..c873280f 100644
--- a/swh/scheduler/backend.py
+++ b/swh/scheduler/backend.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2020  The Software Heritage developers
+# Copyright (C) 2015-2021  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -21,6 +21,7 @@ from .model import (
     ListedOrigin,
     ListedOriginPageToken,
     Lister,
+    OriginVisitStats,
     PaginatedListedOriginList,
 )
 
@@ -757,3 +758,58 @@ class SchedulerBackend:
     def get_priority_ratios(self, db=None, cur=None):
         cur.execute("select id, ratio from priority_ratio")
         return {row["id"]: row["ratio"] for row in cur.fetchall()}
+
+    @db_transaction()
+    def origin_visit_stats_upsert(
+        self, visit_stats: OriginVisitStats, db=None, cur=None
+    ) -> None:
+        """Insert the stats row of (url, visit_type), or update it on conflict;
+        a date passed as None keeps the value already stored (see coalesce)."""
+        query = """
+            INSERT into origin_visit_stats AS ovi (
+                    url,
+                    visit_type,
+                    last_eventful,
+                    last_uneventful,
+                    last_failed
+                )
+            VALUES (%s, %s, %s, %s, %s) ON CONFLICT (url, visit_type) DO
+            UPDATE
+            SET last_eventful = coalesce(
+                    excluded.last_eventful,
+                    ovi.last_eventful
+                ),
+                last_uneventful = coalesce(
+                    excluded.last_uneventful,
+                    ovi.last_uneventful
+                ),
+                last_failed = coalesce(
+                    excluded.last_failed,
+                    ovi.last_failed
+                )
+        """
+
+        params = (
+            visit_stats.url,
+            visit_stats.visit_type,
+            visit_stats.last_eventful,
+            visit_stats.last_uneventful,
+            visit_stats.last_failed,
+        )
+        cur.execute(query, params)
+
+    @db_transaction()
+    def origin_visit_stats_get(
+        self, url: str, visit_type: str, db=None, cur=None
+    ) -> Optional[OriginVisitStats]:
+        """Return the visit stats stored for the (url, visit_type) pair, or None
+        if the query matches no row."""
+        query = format_query(
+            "SELECT {keys} FROM origin_visit_stats WHERE url=%s AND visit_type=%s",
+            OriginVisitStats.select_columns(),
+        )
+        cur.execute(query, (url, visit_type))
+        found = cur.fetchone()
+
+        # No matching row: report it as None rather than building a model.
+        return OriginVisitStats(**found) if found else None
diff --git a/swh/scheduler/interface.py b/swh/scheduler/interface.py
index 7b7be7e0..09a2567d 100644
--- a/swh/scheduler/interface.py
+++ b/swh/scheduler/interface.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2020  The Software Heritage developers
+# Copyright (C) 2015-2021  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -13,6 +13,7 @@ from swh.scheduler.model import (
     ListedOrigin,
     ListedOriginPageToken,
     Lister,
+    OriginVisitStats,
     PaginatedListedOriginList,
 )
 
@@ -322,3 +323,16 @@ class SchedulerInterface(Protocol):
     @remote_api_endpoint("priority_ratios/get")
     def get_priority_ratios(self):
         ...
+
+    @remote_api_endpoint("visit_stats/upsert")
+    def origin_visit_stats_upsert(self, visit_stats: OriginVisitStats) -> None:
+        """Create the stats of a (url, visit_type) pair, or update the existing ones;
+        the SQL backend keeps the stored date wherever None is given."""
+        ...
+
+    @remote_api_endpoint("visit_stats/get")
+    def origin_visit_stats_get(
+        self, url: str, visit_type: str
+    ) -> Optional[OriginVisitStats]:
+        """Get the stats of an origin for a given visit type, or None if not found"""
+        ...
diff --git a/swh/scheduler/model.py b/swh/scheduler/model.py
index 0275a432..f404ff98 100644
--- a/swh/scheduler/model.py
+++ b/swh/scheduler/model.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020  The Software Heritage developers
+# Copyright (C) 2020-2021  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -12,6 +12,12 @@ import attr.converters
 from attrs_strict import type_validator
 
 
+def check_timestamptz(value) -> None:
+    """Raise ValueError if value is a timezone-naive date; None is accepted."""
+    if value is not None and value.tzinfo is None:
+        raise ValueError("date must be a timezone-aware datetime.")
+
+
 @attr.s
 class BaseSchedulerModel:
     """Base class for database-backed objects.
@@ -195,3 +201,35 @@ class PaginatedListedOriginList(BaseSchedulerModel):
         converter=convert_listed_origin_page_token,
         default=None,
     )
+
+
+@attr.s(frozen=True, slots=True)
+class OriginVisitStats(BaseSchedulerModel):
+    """Aggregated view of an origin's visits for one visit type: the dates of the
+    last eventful, uneventful and failed visits (optional, timezone-aware)."""
+
+    url = attr.ib(
+        type=str, validator=[type_validator()], metadata={"primary_key": True}
+    )
+    visit_type = attr.ib(
+        type=str, validator=[type_validator()], metadata={"primary_key": True}
+    )
+    last_eventful = attr.ib(
+        type=Optional[datetime.datetime], validator=type_validator()
+    )
+    last_uneventful = attr.ib(
+        type=Optional[datetime.datetime], validator=type_validator()
+    )
+    last_failed = attr.ib(type=Optional[datetime.datetime], validator=type_validator())
+    # Each date, when set, must carry a timezone (see check_timestamptz).
+    @last_eventful.validator
+    def check_last_eventful(self, attribute, value):
+        check_timestamptz(value)
+
+    @last_uneventful.validator
+    def check_last_uneventful(self, attribute, value):
+        check_timestamptz(value)
+
+    @last_failed.validator
+    def check_last_failed(self, attribute, value):
+        check_timestamptz(value)
diff --git a/swh/scheduler/sql/30-schema.sql b/swh/scheduler/sql/30-schema.sql
index 30c53239..949912c0 100644
--- a/swh/scheduler/sql/30-schema.sql
+++ b/swh/scheduler/sql/30-schema.sql
@@ -11,7 +11,7 @@ comment on column dbversion.release is 'Version deployment timestamp';
 comment on column dbversion.description is 'Version description';
 
 insert into dbversion (version, release, description)
-       values (18, now(), 'Work In Progress');
+       values (19, now(), 'Work In Progress');
 
 create table task_type (
   type text primary key,
@@ -164,3 +164,19 @@ comment on column listed_origins.last_seen is 'Time at which the origin was last
 comment on column listed_origins.last_update is 'Time of the last update to the origin recorded by the remote';
 
 comment on column listed_origins.last_scheduled is 'Time when this origin was scheduled to be visited last';
+
+create table origin_visit_stats (
+  url text not null,
+  visit_type text not null,
+  last_eventful timestamptz,
+  last_uneventful timestamptz,
+  last_failed timestamptz,
+  -- one stats row per (origin URL, visit type) pair
+  primary key (url, visit_type)
+);
+
+comment on column origin_visit_stats.url is 'Origin URL';
+comment on column origin_visit_stats.visit_type is 'Type of the visit for the given url';
+comment on column origin_visit_stats.last_eventful is 'Date of the last eventful event';
+comment on column origin_visit_stats.last_uneventful is 'Date of the last uneventful event';
+comment on column origin_visit_stats.last_failed is 'Date of the last failed event';
diff --git a/swh/scheduler/tests/test_api_client.py b/swh/scheduler/tests/test_api_client.py
index 2cf1c309..9792093e 100644
--- a/swh/scheduler/tests/test_api_client.py
+++ b/swh/scheduler/tests/test_api_client.py
@@ -65,6 +65,8 @@ def test_site_map(flask_app_client):
             "task_type/create",
             "task_type/get",
             "task_type/get_all",
+            "visit_stats/get",
+            "visit_stats/upsert",
         )
     )
     assert rules == expected_rules
diff --git a/swh/scheduler/tests/test_scheduler.py b/swh/scheduler/tests/test_scheduler.py
index 38717bbb..17391c41 100644
--- a/swh/scheduler/tests/test_scheduler.py
+++ b/swh/scheduler/tests/test_scheduler.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2019  The Software Heritage developers
+# Copyright (C) 2017-2021  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -16,7 +16,7 @@ import pytest
 
 from swh.scheduler.exc import StaleData, UnknownPolicy
 from swh.scheduler.interface import SchedulerInterface
-from swh.scheduler.model import ListedOrigin, ListedOriginPageToken
+from swh.scheduler.model import ListedOrigin, ListedOriginPageToken, OriginVisitStats
 from swh.scheduler.utils import utcnow
 
 from .common import LISTERS, TASK_TYPES, TEMPLATES, tasks_from_template
@@ -762,3 +762,64 @@ class TestScheduler:
     def _create_task_types(self, scheduler):
         for tt in TASK_TYPES.values():
             scheduler.create_task_type(tt)
+
+    def test_origin_visit_stats_upsert(self, swh_scheduler) -> None:
+        eventful_date = utcnow()
+        url = "https://github.com/test"
+
+        visit_stats = OriginVisitStats(
+            url=url,
+            visit_type="git",
+            last_eventful=eventful_date,
+            last_uneventful=None,
+            last_failed=None,
+        )
+        swh_scheduler.origin_visit_stats_upsert(visit_stats)
+        swh_scheduler.origin_visit_stats_upsert(visit_stats)
+        # Upserting the same stats twice must not fail nor alter the stored row.
+        assert swh_scheduler.origin_visit_stats_get(url, "git") == visit_stats
+        assert swh_scheduler.origin_visit_stats_get(url, "svn") is None
+        # Only last_uneventful is set here: the stored last_eventful must be kept.
+        uneventful_date = utcnow()
+        visit_stats = OriginVisitStats(
+            url=url,
+            visit_type="git",
+            last_eventful=None,
+            last_uneventful=uneventful_date,
+            last_failed=None,
+        )
+        swh_scheduler.origin_visit_stats_upsert(visit_stats)
+
+        uneventful_visit = swh_scheduler.origin_visit_stats_get(url, "git")
+
+        expected_visit_stats = OriginVisitStats(
+            url=url,
+            visit_type="git",
+            last_eventful=eventful_date,
+            last_uneventful=uneventful_date,
+            last_failed=None,
+        )
+
+        assert uneventful_visit == expected_visit_stats
+        # Only last_failed is set here: both previously stored dates must be kept.
+        failed_date = utcnow()
+        visit_stats = OriginVisitStats(
+            url=url,
+            visit_type="git",
+            last_eventful=None,
+            last_uneventful=None,
+            last_failed=failed_date,
+        )
+        swh_scheduler.origin_visit_stats_upsert(visit_stats)
+
+        failed_visit = swh_scheduler.origin_visit_stats_get(url, "git")
+
+        expected_visit_stats = OriginVisitStats(
+            url=url,
+            visit_type="git",
+            last_eventful=eventful_date,
+            last_uneventful=uneventful_date,
+            last_failed=failed_date,
+        )
+
+        assert failed_visit == expected_visit_stats
-- 
GitLab