From 566db2ac43110f8a820a5954d0eb661dda4ca957 Mon Sep 17 00:00:00 2001 From: David Douard <david.douard@sdfa3.org> Date: Thu, 12 Oct 2023 11:32:16 +0200 Subject: [PATCH] Add support for 2 check config flags in the check_config table These flags allow to configure a checking session including only one of the 2 possible checks (hash computation and reference validation). --- swh/scrubber/db.py | 55 +++++++++++++++++++++++++++++-- swh/scrubber/sql/30-schema.sql | 4 +++ swh/scrubber/sql/60-indexes.sql | 2 +- swh/scrubber/sql/upgrades/7.sql | 10 ++++++ swh/scrubber/storage_checker.py | 36 +++++++++++++------- swh/scrubber/tests/test_db.py | 58 ++++++++++++++++++++++++++++++--- 6 files changed, 144 insertions(+), 21 deletions(-) diff --git a/swh/scrubber/db.py b/swh/scrubber/db.py index c77e417..3c472b5 100644 --- a/swh/scrubber/db.py +++ b/swh/scrubber/db.py @@ -40,6 +40,8 @@ class ConfigEntry: datastore: Datastore object_type: ObjectType nb_partitions: int + check_hashes: bool + check_references: bool @dataclasses.dataclass(frozen=True) @@ -136,6 +138,8 @@ class ScrubberDb(BaseDb): datastore: Datastore, object_type: ObjectType, nb_partitions: int, + check_hashes: bool = True, + check_references: bool = True, ) -> int: """Creates a configuration entry (and potentially a datastore); @@ -143,6 +147,10 @@ class ScrubberDb(BaseDb): already exists. """ + if not (check_hashes or check_references): + raise ValueError( + "At least one of the 2 check_hashes and check_references flags must be set" + ) datastore_id = self.datastore_get_or_add(datastore) if not name: name = ( @@ -154,13 +162,19 @@ class ScrubberDb(BaseDb): "datastore_id": datastore_id, "object_type": object_type.name.lower(), "nb_partitions": nb_partitions, + "check_hashes": check_hashes, + "check_references": check_references, } with self.transaction() as cur: cur.execute( """ WITH inserted AS ( - INSERT INTO check_config (name, datastore, object_type, nb_partitions) - VALUES (%(name)s, %(datastore_id)s, %(object_type)s, %(nb_partitions)s) + INSERT INTO check_config + (name, datastore, object_type, nb_partitions, + check_hashes, check_references) + VALUES + (%(name)s, %(datastore_id)s, %(object_type)s, %(nb_partitions)s, + %(check_hashes)s, %(check_references)s) RETURNING id ) SELECT id @@ -181,6 +195,7 @@ class ScrubberDb(BaseDb): """ SELECT cc.name, cc.object_type, cc.nb_partitions, + cc.check_hashes, cc.check_references, ds.package, ds.class, ds.instance FROM check_config AS cc INNER JOIN datastore As ds ON (cc.datastore=ds.id) @@ -193,7 +208,16 @@ class ScrubberDb(BaseDb): res = cur.fetchone() if res is None: raise ValueError(f"No config with id {config_id}") - (name, object_type, nb_partitions, ds_package, ds_class, ds_instance) = res + ( + name, + object_type, + nb_partitions, + chk_hashes, + chk_refs, + ds_package, + ds_class, + ds_instance, + ) = res return ConfigEntry( name=name, datastore=Datastore( @@ -201,6 +225,8 @@ class ScrubberDb(BaseDb): ), object_type=getattr(ObjectType, object_type.upper()), nb_partitions=nb_partitions, + check_hashes=chk_hashes, + check_references=chk_refs, ) def config_get_by_name( @@ -233,6 +259,7 @@ class ScrubberDb(BaseDb): """ SELECT cc.id, cc.name, cc.object_type, cc.nb_partitions, + cc.check_hashes, cc.check_references, ds.package, ds.class, ds.instance FROM check_config AS cc INNER JOIN datastore AS ds ON (cc.datastore=ds.id) @@ -245,6 +272,8 @@ class ScrubberDb(BaseDb): name, object_type, nb_partitions, + chk_hashes, + chk_refs, ds_package, ds_class, ds_instance, @@ -258,6 +287,8 @@ class ScrubberDb(BaseDb): ), object_type=object_type, nb_partitions=nb_partitions, + check_hashes=chk_hashes, + check_references=chk_refs, ), ) @@ -508,6 +539,8 @@ class ScrubberDb(BaseDb): cc_object_type, cc_nb_partitions, cc_name, + cc_chk_hashes, + cc_chk_refs, ds_package, ds_class, ds_instance, @@ -523,6 +556,8 @@ class ScrubberDb(BaseDb): ), object_type=cc_object_type, nb_partitions=cc_nb_partitions, + check_hashes=cc_chk_hashes, + check_references=cc_chk_refs, ), ) @@ -534,6 +569,7 @@ class ScrubberDb(BaseDb): SELECT co.id, co.first_occurrence, co.object, cc.object_type, cc.nb_partitions, cc.name, + cc.check_hashes, cc.check_references, ds.package, ds.class, ds.instance FROM corrupt_object AS co INNER JOIN check_config AS cc ON (cc.id=co.config_id) @@ -562,6 +598,7 @@ class ScrubberDb(BaseDb): SELECT co.id, co.first_occurrence, co.object, cc.object_type, cc.nb_partitions, cc.name, + cc.check_hashes, cc.check_references, ds.package, ds.class, ds.instance FROM corrupt_object AS co INNER JOIN check_config AS cc ON (cc.id=co.config_id) @@ -599,6 +636,7 @@ class ScrubberDb(BaseDb): SELECT co.id, co.first_occurrence, co.object, cc.object_type, cc.nb_partitions, cc.name, + cc.check_hashes, cc.check_references, ds.package, ds.class, ds.instance FROM corrupt_object AS co INNER JOIN check_config AS cc ON (cc.id=co.config_id) @@ -642,6 +680,7 @@ class ScrubberDb(BaseDb): SELECT co.id, co.first_occurrence, co.object, cc.object_type, cc.nb_partitions, cc.name, + cc.check_hashes, cc.check_references, ds.package, ds.class, ds.instance FROM corrupt_object AS co INNER JOIN check_config AS cc ON (cc.id=co.config_id) @@ -719,6 +758,7 @@ class ScrubberDb(BaseDb): SELECT mo.id, mo.first_occurrence, cc.name, cc.object_type, cc.nb_partitions, + cc.check_hashes, cc.check_references, ds.package, ds.class, ds.instance FROM missing_object AS mo INNER JOIN check_config AS cc ON (cc.id=mo.config_id) @@ -733,6 +773,8 @@ class ScrubberDb(BaseDb): cc_name, cc_object_type, cc_nb_partitions, + cc_chk_hashes, + cc_chk_refs, ds_package, ds_class, ds_instance, @@ -744,6 +786,8 @@ class ScrubberDb(BaseDb): name=cc_name, object_type=cc_object_type, nb_partitions=cc_nb_partitions, + check_hashes=cc_chk_hashes, + check_references=cc_chk_refs, datastore=Datastore( package=ds_package, cls=ds_class, instance=ds_instance ), @@ -760,6 +804,7 @@ class ScrubberDb(BaseDb): SELECT mor.reference_id, mor.first_occurrence, cc.name, cc.object_type, cc.nb_partitions, + cc.check_hashes, cc.check_references, ds.package, ds.class, ds.instance FROM missing_object_reference AS mor INNER JOIN check_config AS cc ON (cc.id=mor.config_id) @@ -776,6 +821,8 @@ class ScrubberDb(BaseDb): cc_name, cc_object_type, cc_nb_partitions, + cc_chk_hashes, + cc_chk_refs, ds_package, ds_class, ds_instance, @@ -788,6 +835,8 @@ class ScrubberDb(BaseDb): name=cc_name, object_type=cc_object_type, nb_partitions=cc_nb_partitions, + check_hashes=cc_chk_hashes, + check_references=cc_chk_refs, datastore=Datastore( package=ds_package, cls=ds_class, instance=ds_instance ), diff --git a/swh/scrubber/sql/30-schema.sql b/swh/scrubber/sql/30-schema.sql index c6d6001..3f1e7f7 100644 --- a/swh/scrubber/sql/30-schema.sql +++ b/swh/scrubber/sql/30-schema.sql @@ -29,6 +29,8 @@ create table check_config datastore int not null, object_type object_type not null, nb_partitions bigint not null, + check_hashes boolean not null default TRUE, + check_references boolean not null default TRUE, name text, comment text ); @@ -37,6 +39,8 @@ comment on table check_config is 'Configuration of a checker for a given object comment on column check_config.datastore is 'The datastore this checker config is about.'; comment on column check_config.object_type is 'The type of checked objects.'; comment on column check_config.nb_partitions is 'Number of partitions the set of objects is split into.'; +comment on column check_config.check_hashes is 'Flag: Check the hash of each object.'; +comment on column check_config.check_hashes is 'Flag: Check the references of each object.'; ------------------------------------- -- Checkpointing/progress tracking diff --git a/swh/scrubber/sql/60-indexes.sql b/swh/scrubber/sql/60-indexes.sql index 2ddca9f..4020c21 100644 --- a/swh/scrubber/sql/60-indexes.sql +++ b/swh/scrubber/sql/60-indexes.sql @@ -14,7 +14,7 @@ create unique index datastore_package_class_instance on datastore(package, class ------------------------------------- create unique index check_config_pkey on check_config(id); -create unique index check_config_unicity_idx on check_config(datastore, object_type, nb_partitions); +create unique index check_config_unicity_idx on check_config(datastore, object_type, nb_partitions, check_hashes, check_references); alter table check_config add primary key using index check_config_pkey; ------------------------------------- diff --git a/swh/scrubber/sql/upgrades/7.sql b/swh/scrubber/sql/upgrades/7.sql index 5aa4e11..adf86b3 100644 --- a/swh/scrubber/sql/upgrades/7.sql +++ b/swh/scrubber/sql/upgrades/7.sql @@ -4,6 +4,16 @@ -- description: Replace datastore column by a config_id one in missing_object, -- corrupt_object and missing_object_reference + +drop index check_config_unicity_idx; + +alter table check_config + add column check_hashes boolean not null default TRUE, + add column check_references boolean not null default TRUE; + +create unique index check_config_unicity_idx + on check_config(datastore, object_type, nb_partitions, check_hashes, check_references); + --- First, we look if there are datastores used by several check_config entries --- (for a given object type); If there are, we cannot automatically upgrade --- the DB, since we cannot choose the config_id to use in missing_object, diff --git a/swh/scrubber/storage_checker.py b/swh/scrubber/storage_checker.py index 33b7fc3..e52da6f 100644 --- a/swh/scrubber/storage_checker.py +++ b/swh/scrubber/storage_checker.py @@ -144,6 +144,14 @@ class StorageChecker: def object_type(self) -> swhids.ObjectType: return self.config.object_type + @property + def check_hashes(self) -> bool: + return self.config.check_hashes + + @property + def check_references(self) -> bool: + return self.config.check_references + @property def datastore(self) -> Datastore: """Returns a :class:`Datastore` instance representing the swh-storage instance @@ -237,18 +245,22 @@ class StorageChecker: else: assert False, f"Unexpected object type: {object_type}" - with self.statsd.timed( - "batch_duration_seconds", tags={"operation": "check_hashes"} - ): - logger.debug("Checking %s %s object hashes", len(objects), object_type) - self.check_object_hashes(objects) - with self.statsd.timed( - "batch_duration_seconds", tags={"operation": "check_references"} - ): - logger.debug( - "Checking %s %s object references", len(objects), object_type - ) - self.check_object_references(objects) + if self.check_hashes: + with self.statsd.timed( + "batch_duration_seconds", tags={"operation": "check_hashes"} + ): + logger.debug( + "Checking %s %s object hashes", len(objects), object_type + ) + self.check_object_hashes(objects) + if self.check_references: + with self.statsd.timed( + "batch_duration_seconds", tags={"operation": "check_references"} + ): + logger.debug( + "Checking %s %s object references", len(objects), object_type + ) + self.check_object_references(objects) page_token = page.next_page_token if page_token is None: diff --git a/swh/scrubber/tests/test_db.py b/swh/scrubber/tests/test_db.py index 4af7430..ded4d11 100644 --- a/swh/scrubber/tests/test_db.py +++ b/swh/scrubber/tests/test_db.py @@ -26,6 +26,7 @@ def test_config_add(datastore: Datastore, scrubber_db: ScrubberDb, config_id: in cfg_snp2 = scrubber_db.config_add("cfg snp 2", datastore, ObjectType.SNAPSHOT, 43) assert cfg_snp2 == 3 + # if not given, a config name is computed cfg_snp3 = scrubber_db.config_add(None, datastore, ObjectType.SNAPSHOT, 44) assert cfg_snp3 == 4 assert ( @@ -39,15 +40,62 @@ def test_config_add(datastore: Datastore, scrubber_db: ScrubberDb, config_id: in scrubber_db.config_add("cfg4", datastore, OBJECT_TYPE, NB_PARTITIONS) +def test_config_add_flags( + datastore: Datastore, scrubber_db: ScrubberDb, config_id: int +): + id_cfg2 = scrubber_db.config_add("cfg snp", datastore, ObjectType.SNAPSHOT, 42) + assert id_cfg2 == 2 + id_cfg3 = scrubber_db.config_add( + "cfg3", datastore, ObjectType.SNAPSHOT, 43, check_hashes=False + ) + assert id_cfg3 == 3 + id_cfg4 = scrubber_db.config_add( + "cfg4", datastore, ObjectType.SNAPSHOT, 43, check_references=False + ) + assert id_cfg4 == 4 + + # but cannot add another config entry with the same name, ds, objtype and part + # number but different flags + with pytest.raises(UniqueViolation): + scrubber_db.config_add( + "cfg4", datastore, ObjectType.SNAPSHOT, 43, check_hashes=False + ) + + with pytest.raises(ValueError): + scrubber_db.config_add( + "cfg4", + datastore, + OBJECT_TYPE, + NB_PARTITIONS, + check_hashes=False, + check_references=False, + ) + + def test_config_get(datastore: Datastore, scrubber_db: ScrubberDb, config_id: int): - cfg2 = scrubber_db.config_add("cfg2", datastore, ObjectType.SNAPSHOT, 42) - cfg3 = scrubber_db.config_add("cfg3", datastore, ObjectType.SNAPSHOT, 43) + id_cfg2 = scrubber_db.config_add("cfg2", datastore, ObjectType.SNAPSHOT, 42) + id_cfg3 = scrubber_db.config_add( + "cfg3", datastore, ObjectType.SNAPSHOT, 43, False, True + ) + id_cfg4 = scrubber_db.config_add( + "cfg4", datastore, ObjectType.SNAPSHOT, 43, True, False + ) - assert scrubber_db.config_get(cfg2) - assert scrubber_db.config_get(cfg3) + cfg2 = scrubber_db.config_get(id_cfg2) + assert cfg2 + assert cfg2.check_hashes is True + assert cfg2.check_references is True + cfg3 = scrubber_db.config_get(id_cfg3) + assert cfg3 + assert cfg3.check_hashes is False + assert cfg3.check_references is True + cfg4 = scrubber_db.config_get(id_cfg4) + assert cfg4 + assert cfg4.check_hashes is True + assert cfg4.check_references is False with pytest.raises(ValueError): - scrubber_db.config_get(cfg3 + 1) + scrubber_db.config_get(id_cfg4 + 1) @pytest.fixture -- GitLab