From f396177e17c390e8fb8e578a96e063b8d85532f3 Mon Sep 17 00:00:00 2001
From: Timothy Sample <samplet@ngyro.com>
Date: Tue, 27 Sep 2022 15:29:23 -0400
Subject: [PATCH] model: Add payload to ExtID class

---
 docs/data-model.rst                 |  9 +++++-
 swh/model/git_objects.py            | 14 +++++++++
 swh/model/model.py                  | 22 +++++++++++++++
 swh/model/tests/swh_model_data.py   | 44 ++++++++++++++++-------------
 swh/model/tests/test_identifiers.py | 11 ++++++++
 5 files changed, 80 insertions(+), 20 deletions(-)

diff --git a/docs/data-model.rst b/docs/data-model.rst
index ff81babb..d36b183b 100644
--- a/docs/data-model.rst
+++ b/docs/data-model.rst
@@ -271,13 +271,20 @@ artifacts.
 **extid**
   a relationship between an original identifier of an artifact, in its
   native/upstream environment, and a `core SWHID <persistent-identifiers>`,
-  which is specific to Software Heritage. As such, it is a triple made of:
+  which is specific to Software Heritage. As such, it includes:
 
   * the external identifier, stored as bytes whose format is opaque to the
     data model
   * a type (a simple name and a version), to identify the type of relationship
   * the "target", which is a core SWHID
 
+  An extid may also include a "payload", which is arbitrary data about the
+  relationship. For example, an extid might link a directory to the
+  cryptographic hash of the tarball that originally contained it. In this
+  case, the payload could include data useful for reconstructing the
+  original tarball from the directory. The payload data is stored separately
+  as a content object; the extid refers to it by its ``sha1_git`` hash.
+
 **raw extrinsic metadata**
   an opaque bytestring, along with its format (a simple name), an identifier
   of the object the metadata is about and in which context (similar to a
diff --git a/swh/model/git_objects.py b/swh/model/git_objects.py
index bbd7556b..942b675c 100644
--- a/swh/model/git_objects.py
+++ b/swh/model/git_objects.py
@@ -631,6 +631,8 @@ def extid_git_object(extid: model.ExtID) -> bytes:
     [extid_version $Str]
     extid $Bytes
     target $CoreSwhid
+    [payload_type $StrWithoutSpaces]
+    [payload $ContentIdentifier]
     ```
 
     $StrWithoutSpaces is an ASCII string, and may not contain spaces.
@@ -639,6 +641,10 @@ def extid_git_object(extid: model.ExtID) -> bytes:
     space after them.
 
     The extid_version line is only generated if the version is non-zero.
+
+    The payload_type and payload lines are only generated if those fields are
+    not :const:`None`. $ContentIdentifier is the object ID of a content object.
+
     """
 
     headers = [
@@ -655,4 +661,12 @@ def extid_git_object(extid: model.ExtID) -> bytes:
         ]
     )
 
+    payload_type = extid.payload_type
+    if payload_type is not None:
+        headers.append((b"payload_type", payload_type.encode("ascii")))
+
+    payload = extid.payload
+    if payload is not None:
+        headers.append((b"payload", payload))
+
     return format_git_object_from_headers("extid", headers)
diff --git a/swh/model/model.py b/swh/model/model.py
index c1a6481f..9f0307e1 100644
--- a/swh/model/model.py
+++ b/swh/model/model.py
@@ -1908,10 +1908,30 @@ class ExtID(BaseHashableModel):
     target = attr.ib(type=CoreSWHID, validator=generic_type_validator)
     extid_version = attr.ib(type=int, validator=generic_type_validator, default=0)
 
+    payload_type = attr.ib(
+        type=Optional[str], validator=generic_type_validator, default=None
+    )
+    payload = attr.ib(
+        type=Optional[Sha1Git],
+        validator=generic_type_validator,
+        default=None,
+        repr=hash_repr,
+    )
+
     id = attr.ib(
         type=Sha1Git, validator=generic_type_validator, default=b"", repr=hash_repr
     )
 
+    @payload_type.validator
+    def check_payload_type(self, attribute, value):
+        if value is not None and self.payload is None:
+            raise ValueError("'payload' must be set if 'payload_type' is.")
+
+    @payload.validator
+    def check_payload(self, attribute, value):
+        if value is not None and self.payload_type is None:
+            raise ValueError("'payload_type' must be set if 'payload' is.")
+
     @classmethod
     def from_dict(cls, d):
         return cls(
@@ -1919,6 +1939,8 @@ class ExtID(BaseHashableModel):
             extid_type=d["extid_type"],
             target=CoreSWHID.from_string(d["target"]),
             extid_version=d.get("extid_version", 0),
+            payload_type=d.get("payload_type"),
+            payload=d.get("payload"),
         )
 
     def _compute_hash_from_attributes(self) -> bytes:
diff --git a/swh/model/tests/swh_model_data.py b/swh/model/tests/swh_model_data.py
index 382b6433..4e9ec129 100644
--- a/swh/model/tests/swh_model_data.py
+++ b/swh/model/tests/swh_model_data.py
@@ -160,25 +160,6 @@ REVISIONS = [
     ),
 ]
 
-EXTIDS = [
-    ExtID(
-        extid_type="git256",
-        extid=b"\x03" * 32,
-        target=REVISIONS[0].swhid(),
-    ),
-    ExtID(
-        extid_type="hg",
-        extid=b"\x04" * 20,
-        target=REVISIONS[1].swhid(),
-    ),
-    ExtID(
-        extid_type="hg-nodeid",
-        extid=b"\x05" * 20,
-        target=REVISIONS[1].swhid(),
-        extid_version=1,
-    ),
-]
-
 RELEASES = [
     Release(
         id=hash_to_bytes("8059dc4e17fcd0e51ca3bcd6b80f4577d281fd08"),
@@ -441,6 +422,31 @@ RAW_EXTRINSIC_METADATA = [
     ),
 ]
 
+EXTIDS = [
+    ExtID(
+        extid_type="git256",
+        extid=b"\x03" * 32,
+        target=REVISIONS[0].swhid(),
+    ),
+    ExtID(
+        extid_type="hg",
+        extid=b"\x04" * 20,
+        target=REVISIONS[1].swhid(),
+    ),
+    ExtID(
+        extid_type="hg-nodeid",
+        extid=b"\x05" * 20,
+        target=REVISIONS[1].swhid(),
+        extid_version=1,
+    ),
+    ExtID(
+        extid_type="tarball-sha256",
+        extid=b"\x03" * 32,
+        target=DIRECTORIES[0].swhid(),
+        payload_type="disarchive",
+        payload=CONTENTS[0].sha1_git,
+    ),
+]
 
 TEST_OBJECTS: Dict[str, Sequence[BaseModel]] = {
     "content": CONTENTS,
diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py
index d5a0eb30..054687af 100644
--- a/swh/model/tests/test_identifiers.py
+++ b/swh/model/tests/test_identifiers.py
@@ -1343,3 +1343,14 @@ def test_extid_identifier_bwcompat():
         ExtID.from_dict({**extid_dict, "extid_version": 1}).id
         != ExtID.from_dict(extid_dict).id
     )
+
+    assert (
+        ExtID.from_dict(
+            {
+                **extid_dict,
+                "payload_type": "test",
+                "payload": bytes.fromhex("257cc5642cb1a054f08cc83f2d943e56fd3ebe99"),
+            }
+        ).id
+        != ExtID.from_dict(extid_dict).id
+    )
-- 
GitLab