[cassandra] Validate the replayed data
We should ensure that all the data was correctly imported into Cassandra.
To do so, the scrubbers should be executed at least once to detect any inconsistency compared to Kafka.
The replayer error reporter should be deployed too, in order to track the import errors.
plan:
-
Develop cli to easily compare journal/cassandra/postgres representations -
Debug issues -
Refactor -
Let it run in a basic setup on a machine -
Package and deploy -
Deploy in staging (3 replicas for now) -
Monitor - steady workload [1]
- kafka eta in the future ;) [2]
-
Deploy in production (6 replicas) -
Analysis and fix papercuts -
RawExtrinsicMetadata comparison -
Content: check all hashes -
OriginVisit [0] (not fixed, further analysis required to know what to do) -
OriginVisitStatus~> not reproduced -
SkippedContent (null?)~> no issue so far -
Fetch content by each hash (so 4 queries per content), consider it None if any is missing. -
Adapt comparison behavior for various object types -
Add debug logging instructions and the means to turn them on on deployment -
Dump all representations on disk so we can compare what went wrong during comparison if any problem actually exists -
Add script to analyze disk representations after the fact
-
-
swh-charts: Deploy per object type -
Deploy through swh-charts in staging -
Issue with ceph volume mounting... (too many small files) -
Use local path on rancher node instead -
Fix remaining issue on journal client -
Deploy
-
-
Monitor
[0] Date of the origin is older in the journal than it is in the backend (same date between cassandra and postgresql) #4707 (comment 167740)
[1] https://grafana.softwareheritage.org/goto/0UosGx0Sk?orgId=1
[2] https://grafana.softwareheritage.org/goto/VhGLMbASz?orgId=1
[3] analyze_disk_representation.py
#!/usr/bin/env python3
"""Journal client dumped disk representation analysis.
"""
import click
import os
from os.path import join
from yaml import safe_load
# For eval_read function
import datetime # noqa
from swh.model.model import * # noqa
from swh.model.swhids import * # noqa
from typing import Any, Dict, Optional
def eval_read(path):
    """Evaluate the Python expression stored in the file at ``path``.

    Args:
        path: Location of a dumped representation file.

    Returns:
        The object obtained by evaluating the file's content, or ``None`` when
        the file does not exist.

    Raises:
        OSError: For any failure other than the file being absent (for
            instance a permission problem), so that an unreadable dump is not
            mistaken for a missing representation.

    """
    # EAFP: open directly instead of checking for existence first, so a file
    # removed between the check and the open cannot crash the script.
    try:
        with open(path, "r") as f:
            source = f.read()
    except (FileNotFoundError, NotADirectoryError):
        return None
    # SECURITY: eval() executes whatever code the file contains. Only point
    # this script at dumps coming from a trusted source.
    # The expression is resolved against this module's namespace, which is why
    # the file-level (star) imports must stay in place.
    return eval(source)
def from_path_to_rep(dir_path):
    """Load the three dumped representations found under ``dir_path``.

    Args:
        dir_path: Directory holding the dumped representation files.

    Returns:
        A ``(journal, cassandra, postgresql)`` tuple; each item is whatever
        ``eval_read`` returns for the matching file.

    """
    base_dir = str(dir_path)
    # Files are read in the same order as the returned tuple.
    file_names = (
        "journal_representation",
        "cassandra_representation",
        "postgresql_representation",
    )
    return tuple(eval_read(join(base_dir, file_name)) for file_name in file_names)
def print_reps(jn_rep, cass_rep, pg_rep):
    """Pretty-print the journal, cassandra and postgresql representations.

    Args:
        jn_rep: Journal representation, printed as is.
        cass_rep: Cassandra representation; printed through its ``to_dict()``
            method when truthy, as ``None`` otherwise.
        pg_rep: Postgresql representation; handled like ``cass_rep``.

    """
    import pprint

    def as_dict(rep):
        # Falsy representations are displayed as None.
        if rep:
            return rep.to_dict()
        return None

    print("Journal representation:")
    pprint.pprint(jn_rep)
    backend_reps = (
        ("Cassandra representation:", cass_rep),
        ("Postgresql representation:", pg_rep),
    )
    for title, backend_rep in backend_reps:
        print(title)
        pprint.pprint(as_dict(backend_rep))
def read_config(config_file: Optional[Any] = None) -> Dict:
    """Load the YAML configuration.

    The file is ``config_file`` when provided, otherwise the one named by the
    ``SWH_CONFIG_FILENAME`` environment variable.

    Args:
        config_file: Path to the configuration file, or a falsy value to rely
            on the environment variable.

    Returns:
        The result of parsing the file's content with ``safe_load``.

    Raises:
        ValueError: When neither ``config_file`` nor ``SWH_CONFIG_FILENAME``
            designates a file.

    """
    config_path = config_file or os.environ.get("SWH_CONFIG_FILENAME")
    if not config_path:
        raise ValueError("You must provide a configuration file.")
    with open(config_path) as stream:
        return safe_load(stream.read())
# Command-line entry point: loads the representations dumped under --dir-path
# and pretty-prints them.
@click.command()
@click.option(
    "--config-file",
    "-C",
    default=None,
    type=click.Path(exists=True, dir_okay=False),
    help=(
        "Configuration file. This has a higher priority than SWH_CONFIG_FILENAME "
        "environment variable if set."
    ),
)
@click.option(
    "--dir-path",
    "-d",
    required=True,
    type=click.Path(exists=True, dir_okay=True, file_okay=False),
    help="Path of objects to analyze",
)
def main(config_file, dir_path):
    # Comments are used instead of a docstring on purpose: click would display
    # a docstring as the command's --help text.
    # NOTE: config_file is accepted but not used by this command.
    # Example --dir-path value:
    # "/volume/production-check-cassandra/journal_only/origin_visit/b605c6f290ec146d537b193a03a7ea55571f177a"
    representations = from_path_to_rep(dir_path)
    print_reps(*representations)


# Run the command only when the file is executed as a script.
if __name__ == "__main__":
    main()