Commit f1f88413 authored by vlorentz, committed by vlorentz

Add diagram of the Cassandra DB schema

parent 39839d93
Merge request !1150: Add diagram of the Cassandra DB schema
Pipeline #12164 canceled
@@ -6,7 +6,10 @@ sql-autodoc:
 	make -C ../sql/ doc
 	cp ../sql/doc/sql/db-schema.svg images/
-images: sql-autodoc
+cql-autodoc:
+	python3 -c "from swh.storage.cassandra.diagram import dot_diagram; print(dot_diagram())" | dot -T svg > images/cassandra-schema.svg
+images: sql-autodoc cql-autodoc
 	make -C images/
 clean-images:
 	make -C images/ clean
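The new cql-autodoc target can also be driven from Python rather than the shell pipeline; a minimal standalone sketch, not part of this commit, assuming a Graphviz dot binary on the PATH and an existing images/ directory:

    # Standalone equivalent of the new cql-autodoc target (sketch):
    # render the Dot source produced by dot_diagram() to SVG.
    import subprocess

    from swh.storage.cassandra.diagram import dot_diagram

    svg = subprocess.run(
        ["dot", "-Tsvg"],              # Graphviz renderer, SVG output
        input=dot_diagram().encode(),  # schema diagram in Dot format
        capture_output=True,
        check=True,
    ).stdout

    with open("images/cassandra-schema.svg", "wb") as f:
        f.write(svg)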
 :orphan:
 
 .. _sql-storage:
 
-SQL storage
-===========
+Database schema
+===============
+
+Postgres DB schema
+------------------
 
 .. _swh-storage-db-schema:
+.. _swh-storage-postgresql-schema:
 
 .. figure:: images/db-schema.svg
    :width: 1024px
    :align: center
 
    Postgres DB schema of high-level Software Heritage storage (click to zoom).
+
+Cassandra DB schema
+-------------------
+
+.. _swh-storage-cassandra-schema:
+
+.. figure:: images/cassandra-schema.svg
+   :width: 1024px
+   :align: center
+
+   Cassandra DB schema of high-level Software Heritage storage (click to zoom).
@@ -61,11 +61,6 @@ Or, using :py:func:`swh.core.api.classes.stream_results` for convenience:
     for visit in visits:
         print(visit)
 
-Database schema
----------------
-
-* :ref:`sql-storage`
-
 Archive copies
 --------------
@@ -87,6 +82,7 @@ Reference Documentation
 .. toctree::
    :maxdepth: 2
 
+   db-schema
    cli
 
 .. only:: standalone_package_doc
# Copyright (C) 2024 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

"""Generates a graphical representation of the Cassandra schema using
:mod:`swh.storage.cassandra.model`.
"""

import dataclasses
from typing import Tuple, Union

from . import model


def dot_diagram() -> str:
    """Generates a diagram of the database in Graphviz Dot format"""
    import io
    import textwrap

    from .schema import HASH_ALGORITHMS

    out = io.StringIO()

    # map each table name to the row class backing it
    classes = {
        cls.TABLE: cls for cls in model.__dict__.values() if hasattr(cls, "TABLE")
    }

    out.write(
        textwrap.dedent(
            """
            digraph g {
                graph [
                    rankdir = "LR",
                    concentrate = true,
                    ratio = auto
                ];
                node [
                    fontsize = "10",
                    shape = record
                ];
                edge [
                ];

                subgraph "logical_grouping" {
                    style = rounded;
                    bgcolor = gray95;
                    color = gray;

                    subgraph cluster_content {
                        label = <<b>content</b>>;
                        content;
                        content_by_sha1;
                        content_by_sha1_git;
                        content_by_sha256;
                        content_by_blake2s256;
                    }

                    subgraph cluster_skipped_content {
                        label = <<b>skipped_content</b>>;
                        skipped_content;
                        skipped_content_by_sha1;
                        skipped_content_by_sha1_git;
                        skipped_content_by_sha256;
                        skipped_content_by_blake2s256;
                    }

                    subgraph cluster_directory {
                        label = <<b>directories</b>>;
                        directory;
                        directory_entry;
                    }

                    subgraph cluster_revision {
                        label = <<b>revisions</b>>;
                        revision;
                        revision_parent;
                    }

                    subgraph cluster_release {
                        label = <<b>releases</b>>;
                        release;
                    }

                    subgraph cluster_snapshots {
                        label = <<b>snapshots</b>>;
                        snapshot;
                        snapshot_branch;
                    }

                    subgraph cluster_origins {
                        label = <<b>origins</b>>;
                        origin;
                        origin_visit;
                        origin_visit_status;
                    }

                    subgraph cluster_metadata {
                        label = <<b>metadata</b>>;
                        metadata_authority;
                        metadata_fetcher;
                        raw_extrinsic_metadata;
                        raw_extrinsic_metadata_by_id;
                    }

                    subgraph cluster_extid {
                        label = <<b>external identifiers</b>>;
                        extid;
                        extid_by_target;
                    }
                }
            """
        )
    )

    def write_table_header(table_name: str) -> None:
        out.write(
            f'"{table_name}" [shape = plaintext, label = < '
            f'<TABLE BORDER="1" CELLBORDER="0" CELLSPACING="0">'
            # header row:
            f'<TR ><TD PORT="ltcol0"> </TD> '
            f'<TD bgcolor="grey90" border="1" COLSPAN="4"> \\N </TD> '
            f'<TD PORT="rtcol0"></TD></TR>'
        )

    def get_target_field(field_full_name: str) -> Tuple[str, int]:
        """Given a string like 'table.col', returns the table name and the index
        of the column within that table (1-indexed)"""
        (points_to_table, points_to_col) = field_full_name.split(".")
        try:
            target_cls = classes[points_to_table]
        except KeyError:
            raise Exception(f"Unknown table {points_to_table}") from None
        target_field_ids = [
            i
            for (i, field) in enumerate(dataclasses.fields(target_cls), start=1)
            if field.name == points_to_col
        ]
        try:
            (target_field_id,) = target_field_ids
        except ValueError:
            raise Exception(
                f"Expected exactly one field {target_cls.__name__}.{points_to_col}, "
                f"got: {target_field_ids}"
            ) from None
        return (points_to_table, target_field_id)

    # write main tables
    for cls in classes.values():
        write_table_header(cls.TABLE)
        for i, field in enumerate(dataclasses.fields(cls), start=1):
            if field.name in cls.PARTITION_KEY:
                assert (
                    field.name not in cls.CLUSTERING_KEY
                ), f"{field.name} is both PK and CK"
                key = "PK"
            elif field.name in cls.CLUSTERING_KEY:
                key = "CK"
            else:
                key = ""
            # TODO: use CQL types instead of Python types
            ty = field.type
            if getattr(ty, "__origin__", None) is Union:
                assert (
                    len(ty.__args__) == 2 and type(None) in ty.__args__
                ), f"{cls.__name__}.{field.name} has unsupported type: {ty}"
                # this is Optional[], unwrap it
                (ty,) = [arg for arg in ty.__args__ if arg is not type(None)]  # noqa
            col_type = ty.__name__
            out.write(
                textwrap.dedent(
                    f"""
                    <TR><TD PORT="ltcol{i}" ></TD>
                    <TD align="left" > {field.name} </TD>
                    <TD align="left" > {col_type} </TD>
                    <TD align="left" > {key} </TD>
                    <TD align="left" PORT="rtcol{i}"> </TD></TR>
                    """
                )
            )
        out.write("</TABLE>> ];\n")

    # add content_by_* and skipped_content_by_*, which don't have their own
    # Python classes
    for algo in HASH_ALGORITHMS:
        for main_table in ("content", "skipped_content"):
            write_table_header(f"{main_table}_by_{algo}")
            out.write(
                textwrap.dedent(
                    f"""
                    <TR><TD PORT="ltcol1" ></TD>
                    <TD align="left" > {algo} </TD>
                    <TD align="left" > bytes </TD>
                    <TD align="left" > PK </TD>
                    <TD align="left" PORT="rtcol1"> </TD></TR>
                    <TR><TD PORT="ltcol2" ></TD>
                    <TD align="left" > token </TD>
                    <TD align="left" > token </TD>
                    <TD align="left" > CK </TD>
                    <TD align="left" PORT="rtcol2"> </TD></TR>
                    """
                )
            )
            out.write("</TABLE>> ];\n")
            out.write(
                f'"{main_table}_by_{algo}":rtcol2 -> "{main_table}":ltcol0 '
                f"[style = solid];\n"
            )

    # write "links" between tables
    for cls_name, cls in classes.items():
        for i, field in enumerate(dataclasses.fields(cls), start=1):
            links = []  # pairs of (is_strong, target)
            for points_to in field.metadata.get("fk") or []:
                links.append((True, points_to))
            for points_to in field.metadata.get("points_to") or []:
                links.append((False, points_to))
            for is_strong, points_to in links:
                (target_table, target_field_id) = get_target_field(points_to)
                if is_strong:
                    style = "[style = solid]"
                else:
                    style = "[style = dashed]"
                out.write(
                    f'"{cls.TABLE}":rtcol{i} -> "{target_table}":ltcol{target_field_id} '
                    f"{style};\n"
                )

    out.write("}\n")

    return out.getvalue()
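As a quick usage sketch (assuming swh.storage and its Cassandra model are importable), the output can be smoke-tested by counting the two edge styles emitted above:

    # Smoke test for dot_diagram() (sketch, not part of this commit):
    # solid edges come from "fk" metadata, dashed ones from "points_to".
    from swh.storage.cassandra.diagram import dot_diagram

    dot_src = dot_diagram()
    assert dot_src.lstrip().startswith("digraph")
    print("solid edges: ", dot_src.count("[style = solid]"))
    print("dashed edges:", dot_src.count("[style = dashed]"))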
@@ -17,6 +17,13 @@ them are subtly different:
 
 Therefore, this model doesn't reuse swh.model.model, except for types
 that can be mapped to UDTs (Person and TimestampWithTimezone).
+
+Fields may have :func:`dataclasses metadata <dataclasses.field>` keys: ``fk``
+if the existence of a corresponding row in a different table is almost guaranteed
+(up to loaders not crashing and eventual consistency settling down), and
+``points_to`` if they are a Merkle-DAG link to another object (which is more
+likely to be missing).
+This is used by :func:`swh.storage.cassandra.diagram.dot_diagram`.
 """
 
 import dataclasses
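To make the convention concrete, a self-contained sketch with a toy row class (hypothetical, not one of the classes in this module), showing how the metadata is attached and read back:

    import dataclasses


    @dataclasses.dataclass
    class ExampleRow:
        # "fk": a matching row in the revision table is expected to exist,
        # so the diagram draws a solid edge for this column
        id: bytes = dataclasses.field(metadata={"fk": ["revision.id"]})


    for f in dataclasses.fields(ExampleRow):
        print(f.name, dict(f.metadata))
    # prints: id {'fk': ['revision.id']}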
@@ -155,10 +162,19 @@ class DirectoryEntryRow(BaseRow):
     PARTITION_KEY = ("directory_id",)
     CLUSTERING_KEY = ("name",)
 
-    directory_id: bytes
+    directory_id: bytes = dataclasses.field(metadata={"fk": ["directory.id"]})
     name: bytes
     """path name, relative to containing dir"""
-    target: bytes
+    target: bytes = dataclasses.field(
+        metadata={
+            "points_to": [
+                "content.sha1_git",
+                "skipped_content.sha1_git",
+                "directory.id",
+                "revision.id",
+            ]
+        }
+    )
     perms: int
     """unix-like permissions"""
     type: str
@@ -174,7 +190,7 @@ class RevisionRow(BaseRow):
     date: Optional[TimestampWithTimezone]
     committer_date: Optional[TimestampWithTimezone]
     type: str
-    directory: bytes
+    directory: bytes = dataclasses.field(metadata={"points_to": ["directory.id"]})
     """source code "root" directory"""
     message: bytes
     author: Person
@@ -195,10 +211,10 @@ class RevisionParentRow(BaseRow):
     PARTITION_KEY = ("id",)
     CLUSTERING_KEY = ("parent_rank",)
 
-    id: bytes
+    id: bytes = dataclasses.field(metadata={"fk": ["revision.id"]})
     parent_rank: int
     """parent position in merge commits, 0-based"""
-    parent_id: bytes
+    parent_id: bytes = dataclasses.field(metadata={"points_to": ["revision.id"]})
 
 
 @dataclasses.dataclass
@@ -208,7 +224,16 @@ class ReleaseRow(BaseRow):
     id: bytes
     target_type: str
-    target: bytes
+    target: bytes = dataclasses.field(
+        metadata={
+            "points_to": [
+                "content.sha1_git",
+                "skipped_content.sha1_git",
+                "directory.id",
+                "revision.id",
+            ]
+        }
+    )
     date: TimestampWithTimezone
     name: bytes
     message: bytes
@@ -238,10 +263,19 @@ class SnapshotBranchRow(BaseRow):
     PARTITION_KEY = ("snapshot_id",)
     CLUSTERING_KEY = ("name",)
 
-    snapshot_id: bytes
+    snapshot_id: bytes = dataclasses.field(metadata={"fk": ["snapshot.id"]})
     name: bytes
     target_type: Optional[str]
-    target: Optional[bytes]
+    target: Optional[bytes] = dataclasses.field(
+        metadata={
+            "points_to": [
+                "content.sha1_git",
+                "skipped_content.sha1_git",
+                "revision.id",
+                "release.id",
+            ]
+        }
+    )
 
 
 @dataclasses.dataclass
@@ -250,7 +284,7 @@ class OriginVisitRow(BaseRow):
     PARTITION_KEY = ("origin",)
     CLUSTERING_KEY = ("visit",)
 
-    origin: str
+    origin: str = dataclasses.field(metadata={"fk": ["origin.url"]})
     visit: int
     date: datetime.datetime
     type: str
@@ -262,13 +296,13 @@ class OriginVisitStatusRow(BaseRow):
     PARTITION_KEY = ("origin",)
     CLUSTERING_KEY = ("visit", "date")
 
-    origin: str
-    visit: int
+    origin: str = dataclasses.field(metadata={"fk": ["origin_visit.origin"]})
+    visit: int = dataclasses.field(metadata={"fk": ["origin_visit.visit"]})
     date: datetime.datetime
     type: str
     status: str
     metadata: str
-    snapshot: bytes
+    snapshot: bytes = dataclasses.field(metadata={"fk": ["snapshot.id"]})
 
     @classmethod
     def from_dict(cls: Type[T], d: Dict[str, Any]) -> T:
@@ -355,11 +389,15 @@ class RawExtrinsicMetadataRow(BaseRow):
     target: str
 
     # metadata source:
-    authority_type: str
-    authority_url: str
+    authority_type: str = dataclasses.field(
+        metadata={"fk": ["metadata_authority.type"]}
+    )
+    authority_url: str = dataclasses.field(metadata={"fk": ["metadata_authority.url"]})
     discovery_date: datetime.datetime
-    fetcher_name: str
-    fetcher_version: str
+    fetcher_name: str = dataclasses.field(metadata={"fk": ["metadata_fetcher.name"]})
+    fetcher_version: str = dataclasses.field(
+        metadata={"fk": ["metadata_fetcher.version"]}
+    )
 
     # metadata itself:
     format: str
@@ -385,8 +423,8 @@ class RawExtrinsicMetadataByIdRow(BaseRow):
     PARTITION_KEY = ("id",)
     CLUSTERING_KEY = ()
 
-    id: bytes
-    target: str
+    id: bytes = dataclasses.field(metadata={"fk": ["raw_extrinsic_metadata.id"]})
+    target: str = dataclasses.field(metadata={"fk": ["raw_extrinsic_metadata.target"]})
     authority_type: str
     authority_url: str
@@ -422,7 +460,7 @@ class ExtIDByTargetRow(BaseRow):
     CLUSTERING_KEY = ("target_token",)
 
     target_type: str
-    target: bytes
+    target: bytes = dataclasses.field(metadata={"fk": ["extid.target"]})
     target_token: int
     """value of token(pk) on the "primary" table"""