From 566db2ac43110f8a820a5954d0eb661dda4ca957 Mon Sep 17 00:00:00 2001
From: David Douard <david.douard@sdfa3.org>
Date: Thu, 12 Oct 2023 11:32:16 +0200
Subject: [PATCH] Add support for 2 check config flags in the check_config
 table

These flags allow to configure a checking session including only one of
the 2 possible checks (hash computation and reference validation).
---
 swh/scrubber/db.py              | 55 +++++++++++++++++++++++++++++--
 swh/scrubber/sql/30-schema.sql  |  4 +++
 swh/scrubber/sql/60-indexes.sql |  2 +-
 swh/scrubber/sql/upgrades/7.sql | 10 ++++++
 swh/scrubber/storage_checker.py | 36 +++++++++++++-------
 swh/scrubber/tests/test_db.py   | 58 ++++++++++++++++++++++++++++++---
 6 files changed, 144 insertions(+), 21 deletions(-)

diff --git a/swh/scrubber/db.py b/swh/scrubber/db.py
index c77e417..3c472b5 100644
--- a/swh/scrubber/db.py
+++ b/swh/scrubber/db.py
@@ -40,6 +40,8 @@ class ConfigEntry:
     datastore: Datastore
     object_type: ObjectType
     nb_partitions: int
+    check_hashes: bool
+    check_references: bool
 
 
 @dataclasses.dataclass(frozen=True)
@@ -136,6 +138,8 @@ class ScrubberDb(BaseDb):
         datastore: Datastore,
         object_type: ObjectType,
         nb_partitions: int,
+        check_hashes: bool = True,
+        check_references: bool = True,
     ) -> int:
         """Creates a configuration entry (and potentially a datastore);
 
@@ -143,6 +147,10 @@ class ScrubberDb(BaseDb):
         already exists.
         """
 
+        if not (check_hashes or check_references):
+            raise ValueError(
+                "At least one of the 2 check_hashes and check_references flags must be set"
+            )
         datastore_id = self.datastore_get_or_add(datastore)
         if not name:
             name = (
@@ -154,13 +162,19 @@ class ScrubberDb(BaseDb):
             "datastore_id": datastore_id,
             "object_type": object_type.name.lower(),
             "nb_partitions": nb_partitions,
+            "check_hashes": check_hashes,
+            "check_references": check_references,
         }
         with self.transaction() as cur:
             cur.execute(
                 """
                 WITH inserted AS (
-                    INSERT INTO check_config (name, datastore, object_type, nb_partitions)
-                    VALUES (%(name)s, %(datastore_id)s, %(object_type)s, %(nb_partitions)s)
+                    INSERT INTO check_config
+                      (name, datastore, object_type, nb_partitions,
+                       check_hashes, check_references)
+                    VALUES
+                      (%(name)s, %(datastore_id)s, %(object_type)s, %(nb_partitions)s,
+                       %(check_hashes)s, %(check_references)s)
                     RETURNING id
                 )
                 SELECT id
@@ -181,6 +195,7 @@ class ScrubberDb(BaseDb):
                 """
                     SELECT
                       cc.name, cc.object_type, cc.nb_partitions,
+                      cc.check_hashes, cc.check_references,
                       ds.package, ds.class, ds.instance
                     FROM check_config AS cc
                     INNER JOIN datastore As ds ON (cc.datastore=ds.id)
@@ -193,7 +208,16 @@ class ScrubberDb(BaseDb):
             res = cur.fetchone()
             if res is None:
                 raise ValueError(f"No config with id {config_id}")
-            (name, object_type, nb_partitions, ds_package, ds_class, ds_instance) = res
+            (
+                name,
+                object_type,
+                nb_partitions,
+                chk_hashes,
+                chk_refs,
+                ds_package,
+                ds_class,
+                ds_instance,
+            ) = res
             return ConfigEntry(
                 name=name,
                 datastore=Datastore(
@@ -201,6 +225,8 @@ class ScrubberDb(BaseDb):
                 ),
                 object_type=getattr(ObjectType, object_type.upper()),
                 nb_partitions=nb_partitions,
+                check_hashes=chk_hashes,
+                check_references=chk_refs,
             )
 
     def config_get_by_name(
@@ -233,6 +259,7 @@ class ScrubberDb(BaseDb):
                 """
                     SELECT
                       cc.id, cc.name, cc.object_type, cc.nb_partitions,
+                      cc.check_hashes, cc.check_references,
                       ds.package, ds.class, ds.instance
                     FROM check_config AS cc
                     INNER JOIN datastore AS ds ON (cc.datastore=ds.id)
@@ -245,6 +272,8 @@ class ScrubberDb(BaseDb):
                     name,
                     object_type,
                     nb_partitions,
+                    chk_hashes,
+                    chk_refs,
                     ds_package,
                     ds_class,
                     ds_instance,
@@ -258,6 +287,8 @@ class ScrubberDb(BaseDb):
                         ),
                         object_type=object_type,
                         nb_partitions=nb_partitions,
+                        check_hashes=chk_hashes,
+                        check_references=chk_refs,
                     ),
                 )
 
@@ -508,6 +539,8 @@ class ScrubberDb(BaseDb):
                 cc_object_type,
                 cc_nb_partitions,
                 cc_name,
+                cc_chk_hashes,
+                cc_chk_refs,
                 ds_package,
                 ds_class,
                 ds_instance,
@@ -523,6 +556,8 @@ class ScrubberDb(BaseDb):
                     ),
                     object_type=cc_object_type,
                     nb_partitions=cc_nb_partitions,
+                    check_hashes=cc_chk_hashes,
+                    check_references=cc_chk_refs,
                 ),
             )
 
@@ -534,6 +569,7 @@ class ScrubberDb(BaseDb):
                 SELECT
                     co.id, co.first_occurrence, co.object,
                     cc.object_type, cc.nb_partitions, cc.name,
+                    cc.check_hashes, cc.check_references,
                     ds.package, ds.class, ds.instance
                 FROM corrupt_object AS co
                 INNER JOIN check_config AS cc ON (cc.id=co.config_id)
@@ -562,6 +598,7 @@ class ScrubberDb(BaseDb):
                 SELECT
                     co.id, co.first_occurrence, co.object,
                     cc.object_type, cc.nb_partitions, cc.name,
+                    cc.check_hashes, cc.check_references,
                     ds.package, ds.class, ds.instance
                 FROM corrupt_object AS co
                 INNER JOIN check_config AS cc ON (cc.id=co.config_id)
@@ -599,6 +636,7 @@ class ScrubberDb(BaseDb):
             SELECT
                 co.id, co.first_occurrence, co.object,
                 cc.object_type, cc.nb_partitions, cc.name,
+                cc.check_hashes, cc.check_references,
                 ds.package, ds.class, ds.instance
             FROM corrupt_object AS co
             INNER JOIN check_config AS cc ON (cc.id=co.config_id)
@@ -642,6 +680,7 @@ class ScrubberDb(BaseDb):
             SELECT
                 co.id, co.first_occurrence, co.object,
                 cc.object_type, cc.nb_partitions, cc.name,
+                cc.check_hashes, cc.check_references,
                 ds.package, ds.class, ds.instance
             FROM corrupt_object AS co
             INNER JOIN check_config AS cc ON (cc.id=co.config_id)
@@ -719,6 +758,7 @@ class ScrubberDb(BaseDb):
                 SELECT
                     mo.id, mo.first_occurrence,
                     cc.name, cc.object_type, cc.nb_partitions,
+                    cc.check_hashes, cc.check_references,
                     ds.package, ds.class, ds.instance
                 FROM missing_object AS mo
                 INNER JOIN check_config AS cc ON (cc.id=mo.config_id)
@@ -733,6 +773,8 @@ class ScrubberDb(BaseDb):
                     cc_name,
                     cc_object_type,
                     cc_nb_partitions,
+                    cc_chk_hashes,
+                    cc_chk_refs,
                     ds_package,
                     ds_class,
                     ds_instance,
@@ -744,6 +786,8 @@ class ScrubberDb(BaseDb):
                         name=cc_name,
                         object_type=cc_object_type,
                         nb_partitions=cc_nb_partitions,
+                        check_hashes=cc_chk_hashes,
+                        check_references=cc_chk_refs,
                         datastore=Datastore(
                             package=ds_package, cls=ds_class, instance=ds_instance
                         ),
@@ -760,6 +804,7 @@ class ScrubberDb(BaseDb):
                 SELECT
                     mor.reference_id, mor.first_occurrence,
                     cc.name, cc.object_type, cc.nb_partitions,
+                    cc.check_hashes, cc.check_references,
                     ds.package, ds.class, ds.instance
                 FROM missing_object_reference AS mor
                 INNER JOIN check_config AS cc ON (cc.id=mor.config_id)
@@ -776,6 +821,8 @@ class ScrubberDb(BaseDb):
                     cc_name,
                     cc_object_type,
                     cc_nb_partitions,
+                    cc_chk_hashes,
+                    cc_chk_refs,
                     ds_package,
                     ds_class,
                     ds_instance,
@@ -788,6 +835,8 @@ class ScrubberDb(BaseDb):
                         name=cc_name,
                         object_type=cc_object_type,
                         nb_partitions=cc_nb_partitions,
+                        check_hashes=cc_chk_hashes,
+                        check_references=cc_chk_refs,
                         datastore=Datastore(
                             package=ds_package, cls=ds_class, instance=ds_instance
                         ),
diff --git a/swh/scrubber/sql/30-schema.sql b/swh/scrubber/sql/30-schema.sql
index c6d6001..3f1e7f7 100644
--- a/swh/scrubber/sql/30-schema.sql
+++ b/swh/scrubber/sql/30-schema.sql
@@ -29,6 +29,8 @@ create table check_config
   datastore             int not null,
   object_type           object_type not null,
   nb_partitions         bigint not null,
+  check_hashes			boolean not null default TRUE,
+  check_references		boolean not null default TRUE,
   name                  text,
   comment               text
 );
@@ -37,6 +39,8 @@ comment on table check_config is 'Configuration of a checker for a given object
 comment on column check_config.datastore is 'The datastore this checker config is about.';
 comment on column check_config.object_type is 'The type of checked objects.';
 comment on column check_config.nb_partitions is 'Number of partitions the set of objects is split into.';
+comment on column check_config.check_hashes is 'Flag: Check the hash of each object.';
+comment on column check_config.check_hashes is 'Flag: Check the references of each object.';
 
 -------------------------------------
 -- Checkpointing/progress tracking
diff --git a/swh/scrubber/sql/60-indexes.sql b/swh/scrubber/sql/60-indexes.sql
index 2ddca9f..4020c21 100644
--- a/swh/scrubber/sql/60-indexes.sql
+++ b/swh/scrubber/sql/60-indexes.sql
@@ -14,7 +14,7 @@ create unique index datastore_package_class_instance on datastore(package, class
 -------------------------------------
 
 create unique index check_config_pkey on check_config(id);
-create unique index check_config_unicity_idx on check_config(datastore, object_type, nb_partitions);
+create unique index check_config_unicity_idx on check_config(datastore, object_type, nb_partitions, check_hashes, check_references);
 alter table check_config add primary key using index check_config_pkey;
 
 -------------------------------------
diff --git a/swh/scrubber/sql/upgrades/7.sql b/swh/scrubber/sql/upgrades/7.sql
index 5aa4e11..adf86b3 100644
--- a/swh/scrubber/sql/upgrades/7.sql
+++ b/swh/scrubber/sql/upgrades/7.sql
@@ -4,6 +4,16 @@
 -- description: Replace datastore column by a config_id one in missing_object,
 --              corrupt_object and missing_object_reference
 
+
+drop index check_config_unicity_idx;
+
+alter table check_config
+  add column   check_hashes boolean not null default TRUE,
+  add column   check_references boolean not null default TRUE;
+
+create unique index check_config_unicity_idx
+    on check_config(datastore, object_type, nb_partitions, check_hashes, check_references);
+
 --- First, we look if there are datastores used by several check_config entries
 --- (for a given object type); If there are, we cannot automatically upgrade
 --- the DB, since we cannot choose the config_id to use in missing_object,
diff --git a/swh/scrubber/storage_checker.py b/swh/scrubber/storage_checker.py
index 33b7fc3..e52da6f 100644
--- a/swh/scrubber/storage_checker.py
+++ b/swh/scrubber/storage_checker.py
@@ -144,6 +144,14 @@ class StorageChecker:
     def object_type(self) -> swhids.ObjectType:
         return self.config.object_type
 
+    @property
+    def check_hashes(self) -> bool:
+        return self.config.check_hashes
+
+    @property
+    def check_references(self) -> bool:
+        return self.config.check_references
+
     @property
     def datastore(self) -> Datastore:
         """Returns a :class:`Datastore` instance representing the swh-storage instance
@@ -237,18 +245,22 @@ class StorageChecker:
             else:
                 assert False, f"Unexpected object type: {object_type}"
 
-            with self.statsd.timed(
-                "batch_duration_seconds", tags={"operation": "check_hashes"}
-            ):
-                logger.debug("Checking %s %s object hashes", len(objects), object_type)
-                self.check_object_hashes(objects)
-            with self.statsd.timed(
-                "batch_duration_seconds", tags={"operation": "check_references"}
-            ):
-                logger.debug(
-                    "Checking %s %s object references", len(objects), object_type
-                )
-                self.check_object_references(objects)
+            if self.check_hashes:
+                with self.statsd.timed(
+                    "batch_duration_seconds", tags={"operation": "check_hashes"}
+                ):
+                    logger.debug(
+                        "Checking %s %s object hashes", len(objects), object_type
+                    )
+                    self.check_object_hashes(objects)
+            if self.check_references:
+                with self.statsd.timed(
+                    "batch_duration_seconds", tags={"operation": "check_references"}
+                ):
+                    logger.debug(
+                        "Checking %s %s object references", len(objects), object_type
+                    )
+                    self.check_object_references(objects)
 
             page_token = page.next_page_token
             if page_token is None:
diff --git a/swh/scrubber/tests/test_db.py b/swh/scrubber/tests/test_db.py
index 4af7430..ded4d11 100644
--- a/swh/scrubber/tests/test_db.py
+++ b/swh/scrubber/tests/test_db.py
@@ -26,6 +26,7 @@ def test_config_add(datastore: Datastore, scrubber_db: ScrubberDb, config_id: in
     cfg_snp2 = scrubber_db.config_add("cfg snp 2", datastore, ObjectType.SNAPSHOT, 43)
     assert cfg_snp2 == 3
 
+    # if not given, a config name is computed
     cfg_snp3 = scrubber_db.config_add(None, datastore, ObjectType.SNAPSHOT, 44)
     assert cfg_snp3 == 4
     assert (
@@ -39,15 +40,62 @@ def test_config_add(datastore: Datastore, scrubber_db: ScrubberDb, config_id: in
         scrubber_db.config_add("cfg4", datastore, OBJECT_TYPE, NB_PARTITIONS)
 
 
+def test_config_add_flags(
+    datastore: Datastore, scrubber_db: ScrubberDb, config_id: int
+):
+    id_cfg2 = scrubber_db.config_add("cfg snp", datastore, ObjectType.SNAPSHOT, 42)
+    assert id_cfg2 == 2
+    id_cfg3 = scrubber_db.config_add(
+        "cfg3", datastore, ObjectType.SNAPSHOT, 43, check_hashes=False
+    )
+    assert id_cfg3 == 3
+    id_cfg4 = scrubber_db.config_add(
+        "cfg4", datastore, ObjectType.SNAPSHOT, 43, check_references=False
+    )
+    assert id_cfg4 == 4
+
+    # but cannot add another config entry with the same name, ds, objtype and part
+    # number but different flags
+    with pytest.raises(UniqueViolation):
+        scrubber_db.config_add(
+            "cfg4", datastore, ObjectType.SNAPSHOT, 43, check_hashes=False
+        )
+
+    with pytest.raises(ValueError):
+        scrubber_db.config_add(
+            "cfg4",
+            datastore,
+            OBJECT_TYPE,
+            NB_PARTITIONS,
+            check_hashes=False,
+            check_references=False,
+        )
+
+
 def test_config_get(datastore: Datastore, scrubber_db: ScrubberDb, config_id: int):
-    cfg2 = scrubber_db.config_add("cfg2", datastore, ObjectType.SNAPSHOT, 42)
-    cfg3 = scrubber_db.config_add("cfg3", datastore, ObjectType.SNAPSHOT, 43)
+    id_cfg2 = scrubber_db.config_add("cfg2", datastore, ObjectType.SNAPSHOT, 42)
+    id_cfg3 = scrubber_db.config_add(
+        "cfg3", datastore, ObjectType.SNAPSHOT, 43, False, True
+    )
+    id_cfg4 = scrubber_db.config_add(
+        "cfg4", datastore, ObjectType.SNAPSHOT, 43, True, False
+    )
 
-    assert scrubber_db.config_get(cfg2)
-    assert scrubber_db.config_get(cfg3)
+    cfg2 = scrubber_db.config_get(id_cfg2)
+    assert cfg2
+    assert cfg2.check_hashes is True
+    assert cfg2.check_references is True
+    cfg3 = scrubber_db.config_get(id_cfg3)
+    assert cfg3
+    assert cfg3.check_hashes is False
+    assert cfg3.check_references is True
+    cfg4 = scrubber_db.config_get(id_cfg4)
+    assert cfg4
+    assert cfg4.check_hashes is True
+    assert cfg4.check_references is False
 
     with pytest.raises(ValueError):
-        scrubber_db.config_get(cfg3 + 1)
+        scrubber_db.config_get(id_cfg4 + 1)
 
 
 @pytest.fixture
-- 
GitLab