Commit f1f88413 authored by vlorentz, committed by vlorentz

Add diagram of the Cassandra DB schema

parent 39839d93
Merge request !1150: Add diagram of the Cassandra DB schema
Pipeline #12164 canceled
@@ -6,7 +6,10 @@ sql-autodoc:
 	make -C ../sql/ doc
 	cp ../sql/doc/sql/db-schema.svg images/
-images: sql-autodoc
+cql-autodoc:
+	python3 -c "from swh.storage.cassandra.diagram import dot_diagram; print(dot_diagram())" | dot -T svg > images/cassandra-schema.svg
+images: sql-autodoc cql-autodoc
 	make -C images/
 clean-images:
 	make -C images/ clean
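The new cql-autodoc target can also be driven from Python rather than the shell pipeline; a minimal standalone sketch, not part of this commit, assuming a Graphviz dot binary on the PATH and an existing images/ directory:

    # Standalone equivalent of the new cql-autodoc target (sketch):
    # render the Dot source produced by dot_diagram() to SVG.
    import subprocess

    from swh.storage.cassandra.diagram import dot_diagram

    svg = subprocess.run(
        ["dot", "-Tsvg"],              # Graphviz renderer, SVG output
        input=dot_diagram().encode(),  # schema diagram in Dot format
        capture_output=True,
        check=True,
    ).stdout

    with open("images/cassandra-schema.svg", "wb") as f:
        f.write(svg)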
 :orphan:
 
 .. _sql-storage:
 
-SQL storage
-===========
+Database schema
+===============
+
+Postgres DB schema
+------------------
 
 .. _swh-storage-db-schema:
+.. _swh-storage-postgresql-schema:
 
 .. figure:: images/db-schema.svg
    :width: 1024px
    :align: center
 
    Postgres DB schema of high-level Software Heritage storage (click to zoom).
+
+Cassandra DB schema
+-------------------
+
+.. _swh-storage-cassandra-schema:
+
+.. figure:: images/cassandra-schema.svg
+   :width: 1024px
+   :align: center
+
+   Cassandra DB schema of high-level Software Heritage storage (click to zoom).
@@ -61,11 +61,6 @@ Or, using :py:func:`swh.core.api.classes.stream_results` for convenience:
     for visit in visits:
         print(visit)
 
-Database schema
----------------
-
-* :ref:`sql-storage`
-
 Archive copies
 --------------
@@ -87,6 +82,7 @@ Reference Documentation
 .. toctree::
    :maxdepth: 2
 
+   db-schema
    cli
 
 .. only:: standalone_package_doc
# Copyright (C) 2024 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

"""Generates a graphical representation of the Cassandra schema using
:mod:`swh.storage.cassandra.model`.
"""

import dataclasses
from typing import Tuple, Union

from . import model


def dot_diagram() -> str:
    """Generates a diagram of the database in Graphviz Dot format"""
    import io
    import textwrap

    from .schema import HASH_ALGORITHMS

    out = io.StringIO()

    # map each table name to the row class backing it
    classes = {
        cls.TABLE: cls for cls in model.__dict__.values() if hasattr(cls, "TABLE")
    }

    out.write(
        textwrap.dedent(
            """
            digraph g {
                graph [
                    rankdir = "LR",
                    concentrate = true,
                    ratio = auto
                ];
                node [
                    fontsize = "10",
                    shape = record
                ];
                edge [
                ];

                subgraph "logical_grouping" {
                    style = rounded;
                    bgcolor = gray95;
                    color = gray;

                    subgraph cluster_content {
                        label = <<b>content</b>>;
                        content;
                        content_by_sha1;
                        content_by_sha1_git;
                        content_by_sha256;
                        content_by_blake2s256;
                    }

                    subgraph cluster_skipped_content {
                        label = <<b>skipped_content</b>>;
                        skipped_content;
                        skipped_content_by_sha1;
                        skipped_content_by_sha1_git;
                        skipped_content_by_sha256;
                        skipped_content_by_blake2s256;
                    }

                    subgraph cluster_directory {
                        label = <<b>directories</b>>;
                        directory;
                        directory_entry;
                    }

                    subgraph cluster_revision {
                        label = <<b>revisions</b>>;
                        revision;
                        revision_parent;
                    }

                    subgraph cluster_release {
                        label = <<b>releases</b>>;
                        release;
                    }

                    subgraph cluster_snapshots {
                        label = <<b>snapshots</b>>;
                        snapshot;
                        snapshot_branch;
                    }

                    subgraph cluster_origins {
                        label = <<b>origins</b>>;
                        origin;
                        origin_visit;
                        origin_visit_status;
                    }

                    subgraph cluster_metadata {
                        label = <<b>metadata</b>>;
                        metadata_authority;
                        metadata_fetcher;
                        raw_extrinsic_metadata;
                        raw_extrinsic_metadata_by_id;
                    }

                    subgraph cluster_extid {
                        label = <<b>external identifiers</b>>;
                        extid;
                        extid_by_target;
                    }
                }
            """
        )
    )

    def write_table_header(table_name: str) -> None:
        out.write(
            f'"{table_name}" [shape = plaintext, label = < '
            f'<TABLE BORDER="1" CELLBORDER="0" CELLSPACING="0">'
            # header row:
            f'<TR ><TD PORT="ltcol0"> </TD> '
            f'<TD bgcolor="grey90" border="1" COLSPAN="4"> \\N </TD> '
            f'<TD PORT="rtcol0"></TD></TR>'
        )

    def get_target_field(field_full_name: str) -> Tuple[str, int]:
        """Given a string like 'table.col', returns the table name and the index
        of the column within that table (1-indexed)"""
        (points_to_table, points_to_col) = field_full_name.split(".")
        try:
            target_cls = classes[points_to_table]
        except KeyError:
            raise Exception(f"Unknown table {points_to_table}") from None
        target_field_ids = [
            i
            for (i, field) in enumerate(dataclasses.fields(target_cls), start=1)
            if field.name == points_to_col
        ]
        try:
            (target_field_id,) = target_field_ids
        except ValueError:
            raise Exception(
                f"Expected exactly one field {target_cls.__name__}.{points_to_col}, "
                f"got: {target_field_ids}"
            ) from None
        return (points_to_table, target_field_id)

    # write main tables
    for cls in classes.values():
        write_table_header(cls.TABLE)
        for i, field in enumerate(dataclasses.fields(cls), start=1):
            if field.name in cls.PARTITION_KEY:
                assert (
                    field.name not in cls.CLUSTERING_KEY
                ), f"{field.name} is both PK and CK"
                key = "PK"
            elif field.name in cls.CLUSTERING_KEY:
                key = "CK"
            else:
                key = ""
            # TODO: use CQL types instead of Python types
            ty = field.type
            if getattr(ty, "__origin__", None) is Union:
                assert (
                    len(ty.__args__) == 2 and type(None) in ty.__args__
                ), f"{cls.__name__}.{field.name} has unsupported type: {ty}"
                # this is Optional[], unwrap it
                (ty,) = [arg for arg in ty.__args__ if arg is not type(None)]  # noqa
            col_type = ty.__name__
            out.write(
                textwrap.dedent(
                    f"""
                    <TR><TD PORT="ltcol{i}" ></TD>
                    <TD align="left" > {field.name} </TD>
                    <TD align="left" > {col_type} </TD>
                    <TD align="left" > {key} </TD>
                    <TD align="left" PORT="rtcol{i}"> </TD></TR>
                    """
                )
            )
        out.write("</TABLE>> ];\n")

    # add content_by_* and skipped_content_by_*, which don't have their own
    # Python classes
    for algo in HASH_ALGORITHMS:
        for main_table in ("content", "skipped_content"):
            write_table_header(f"{main_table}_by_{algo}")
            out.write(
                textwrap.dedent(
                    f"""
                    <TR><TD PORT="ltcol1" ></TD>
                    <TD align="left" > {algo} </TD>
                    <TD align="left" > bytes </TD>
                    <TD align="left" > PK </TD>
                    <TD align="left" PORT="rtcol1"> </TD></TR>
                    <TR><TD PORT="ltcol2" ></TD>
                    <TD align="left" > token </TD>
                    <TD align="left" > token </TD>
                    <TD align="left" > CK </TD>
                    <TD align="left" PORT="rtcol2"> </TD></TR>
                    """
                )
            )
            out.write("</TABLE>> ];\n")
            out.write(
                f'"{main_table}_by_{algo}":rtcol2 -> "{main_table}":ltcol0 '
                f"[style = solid];\n"
            )

    # write "links" between tables
    for cls_name, cls in classes.items():
        for i, field in enumerate(dataclasses.fields(cls), start=1):
            links = []  # pairs of (is_strong, target)
            for points_to in field.metadata.get("fk") or []:
                links.append((True, points_to))
            for points_to in field.metadata.get("points_to") or []:
                links.append((False, points_to))
            for is_strong, points_to in links:
                (target_table, target_field_id) = get_target_field(points_to)
                if is_strong:
                    style = "[style = solid]"
                else:
                    style = "[style = dashed]"
                out.write(
                    f'"{cls.TABLE}":rtcol{i} -> "{target_table}":ltcol{target_field_id} '
                    f"{style};\n"
                )

    out.write("}\n")

    return out.getvalue()
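As a quick usage sketch (assuming swh.storage and its Cassandra model are importable), the output can be smoke-tested by counting the two edge styles emitted above:

    # Smoke test for dot_diagram() (sketch, not part of this commit):
    # solid edges come from "fk" metadata, dashed ones from "points_to".
    from swh.storage.cassandra.diagram import dot_diagram

    dot_src = dot_diagram()
    assert dot_src.lstrip().startswith("digraph")
    print("solid edges: ", dot_src.count("[style = solid]"))
    print("dashed edges:", dot_src.count("[style = dashed]"))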
@@ -17,6 +17,13 @@ them are subtly different:
 
 Therefore, this model doesn't reuse swh.model.model, except for types
 that can be mapped to UDTs (Person and TimestampWithTimezone).
+
+Fields may have :func:`dataclasses metadata <dataclasses.field>` keys: ``fk``
+if the existence of a corresponding row in a different table is almost guaranteed
+(up to loaders not crashing and eventual consistency settling down), and
+``points_to`` if they are a Merkle-DAG link to another object (which is more
+likely to be missing).
+This is used by :func:`swh.storage.cassandra.diagram.dot_diagram`.
 """
 
 import dataclasses
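To make the convention concrete, a self-contained sketch with a toy row class (hypothetical, not one of the classes in this module), showing how the metadata is attached and read back:

    import dataclasses


    @dataclasses.dataclass
    class ExampleRow:
        # "fk": a matching row in the revision table is expected to exist,
        # so the diagram draws a solid edge for this column
        id: bytes = dataclasses.field(metadata={"fk": ["revision.id"]})


    for f in dataclasses.fields(ExampleRow):
        print(f.name, dict(f.metadata))
    # prints: id {'fk': ['revision.id']}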
@@ -155,10 +162,19 @@ class DirectoryEntryRow(BaseRow):
     PARTITION_KEY = ("directory_id",)
     CLUSTERING_KEY = ("name",)
 
-    directory_id: bytes
+    directory_id: bytes = dataclasses.field(metadata={"fk": ["directory.id"]})
     name: bytes
     """path name, relative to containing dir"""
-    target: bytes
+    target: bytes = dataclasses.field(
+        metadata={
+            "points_to": [
+                "content.sha1_git",
+                "skipped_content.sha1_git",
+                "directory.id",
+                "revision.id",
+            ]
+        }
+    )
     perms: int
     """unix-like permissions"""
     type: str
@@ -174,7 +190,7 @@ class RevisionRow(BaseRow):
     date: Optional[TimestampWithTimezone]
     committer_date: Optional[TimestampWithTimezone]
     type: str
-    directory: bytes
+    directory: bytes = dataclasses.field(metadata={"points_to": ["directory.id"]})
     """source code "root" directory"""
     message: bytes
     author: Person
@@ -195,10 +211,10 @@ class RevisionParentRow(BaseRow):
     PARTITION_KEY = ("id",)
     CLUSTERING_KEY = ("parent_rank",)
 
-    id: bytes
+    id: bytes = dataclasses.field(metadata={"fk": ["revision.id"]})
     parent_rank: int
     """parent position in merge commits, 0-based"""
-    parent_id: bytes
+    parent_id: bytes = dataclasses.field(metadata={"points_to": ["revision.id"]})
 
 
 @dataclasses.dataclass
@@ -208,7 +224,16 @@ class ReleaseRow(BaseRow):
     id: bytes
     target_type: str
-    target: bytes
+    target: bytes = dataclasses.field(
+        metadata={
+            "points_to": [
+                "content.sha1_git",
+                "skipped_content.sha1_git",
+                "directory.id",
+                "revision.id",
+            ]
+        }
+    )
     date: TimestampWithTimezone
     name: bytes
     message: bytes
@@ -238,10 +263,19 @@ class SnapshotBranchRow(BaseRow):
     PARTITION_KEY = ("snapshot_id",)
     CLUSTERING_KEY = ("name",)
 
-    snapshot_id: bytes
+    snapshot_id: bytes = dataclasses.field(metadata={"fk": ["snapshot.id"]})
     name: bytes
     target_type: Optional[str]
-    target: Optional[bytes]
+    target: Optional[bytes] = dataclasses.field(
+        metadata={
+            "points_to": [
+                "content.sha1_git",
+                "skipped_content.sha1_git",
+                "revision.id",
+                "release.id",
+            ]
+        }
+    )
 
 
 @dataclasses.dataclass
@@ -250,7 +284,7 @@ class OriginVisitRow(BaseRow):
     PARTITION_KEY = ("origin",)
     CLUSTERING_KEY = ("visit",)
 
-    origin: str
+    origin: str = dataclasses.field(metadata={"fk": ["origin.url"]})
     visit: int
     date: datetime.datetime
     type: str
@@ -262,13 +296,13 @@ class OriginVisitStatusRow(BaseRow):
     PARTITION_KEY = ("origin",)
     CLUSTERING_KEY = ("visit", "date")
 
-    origin: str
-    visit: int
+    origin: str = dataclasses.field(metadata={"fk": ["origin_visit.origin"]})
+    visit: int = dataclasses.field(metadata={"fk": ["origin_visit.visit"]})
     date: datetime.datetime
     type: str
     status: str
     metadata: str
-    snapshot: bytes
+    snapshot: bytes = dataclasses.field(metadata={"fk": ["snapshot.id"]})
 
     @classmethod
     def from_dict(cls: Type[T], d: Dict[str, Any]) -> T:
@@ -355,11 +389,15 @@ class RawExtrinsicMetadataRow(BaseRow):
     target: str
 
     # metadata source:
-    authority_type: str
-    authority_url: str
+    authority_type: str = dataclasses.field(
+        metadata={"fk": ["metadata_authority.type"]}
+    )
+    authority_url: str = dataclasses.field(metadata={"fk": ["metadata_authority.url"]})
     discovery_date: datetime.datetime
-    fetcher_name: str
-    fetcher_version: str
+    fetcher_name: str = dataclasses.field(metadata={"fk": ["metadata_fetcher.name"]})
+    fetcher_version: str = dataclasses.field(
+        metadata={"fk": ["metadata_fetcher.version"]}
+    )
 
     # metadata itself:
     format: str
@@ -385,8 +423,8 @@ class RawExtrinsicMetadataByIdRow(BaseRow):
     PARTITION_KEY = ("id",)
     CLUSTERING_KEY = ()
 
-    id: bytes
-    target: str
+    id: bytes = dataclasses.field(metadata={"fk": ["raw_extrinsic_metadata.id"]})
+    target: str = dataclasses.field(metadata={"fk": ["raw_extrinsic_metadata.target"]})
     authority_type: str
     authority_url: str
@@ -422,7 +460,7 @@ class ExtIDByTargetRow(BaseRow):
     CLUSTERING_KEY = ("target_token",)
 
     target_type: str
-    target: bytes
+    target: bytes = dataclasses.field(metadata={"fk": ["extid.target"]})
     target_token: int
     """value of token(pk) on the "primary" table"""