From f396177e17c390e8fb8e578a96e063b8d85532f3 Mon Sep 17 00:00:00 2001 From: Timothy Sample <samplet@ngyro.com> Date: Tue, 27 Sep 2022 15:29:23 -0400 Subject: [PATCH] model: Add payload to ExtID class --- docs/data-model.rst | 9 +++++- swh/model/git_objects.py | 14 +++++++++ swh/model/model.py | 22 +++++++++++++++ swh/model/tests/swh_model_data.py | 44 ++++++++++++++++------------- swh/model/tests/test_identifiers.py | 11 ++++++++ 5 files changed, 80 insertions(+), 20 deletions(-) diff --git a/docs/data-model.rst b/docs/data-model.rst index ff81babb..d36b183b 100644 --- a/docs/data-model.rst +++ b/docs/data-model.rst @@ -271,13 +271,20 @@ artifacts. **extid** a relationship between an original identifier of an artifact, in its native/upstream environment, and a `core SWHID <persistent-identifiers>`, - which is specific to Software Heritage. As such, it is a triple made of: + which is specific to Software Heritage. As such, it includes: * the external identifier, stored as bytes whose format is opaque to the data model * a type (a simple name and a version), to identify the type of relationship * the "target", which is a core SWHID + An extid may also include a "payload", which is arbitrary data about the + relationship. For example, an extid might link a directory to the + cryptographic hash of the tarball that originally contained it. In this + case, the payload could include data useful for reconstructing the + original tarball from the directory. The payload data is stored + separately. An extid refers to it by its ``sha1_git`` hash. + **raw extrinsic metadata** an opaque bytestring, along with its format (a simple name), an identifier of the object the metadata is about and in which context (similar to a diff --git a/swh/model/git_objects.py b/swh/model/git_objects.py index bbd7556b..942b675c 100644 --- a/swh/model/git_objects.py +++ b/swh/model/git_objects.py @@ -631,6 +631,8 @@ def extid_git_object(extid: model.ExtID) -> bytes: [extid_version $Str] extid $Bytes target $CoreSwhid + [payload_type $StrWithoutSpaces] + [payload $ContentIdentifier] ``` $StrWithoutSpaces is an ASCII string, and may not contain spaces. @@ -639,6 +641,10 @@ def extid_git_object(extid: model.ExtID) -> bytes: space after them. The extid_version line is only generated if the version is non-zero. + + The payload_type and payload lines are only generated if they are not + :const:`None`. $ContentIdentifier is the object ID of a content object. + """ headers = [ @@ -655,4 +661,12 @@ def extid_git_object(extid: model.ExtID) -> bytes: ] ) + payload_type = extid.payload_type + if payload_type is not None: + headers.append((b"payload_type", payload_type.encode("ascii"))) + + payload = extid.payload + if payload is not None: + headers.append((b"payload", payload)) + return format_git_object_from_headers("extid", headers) diff --git a/swh/model/model.py b/swh/model/model.py index c1a6481f..9f0307e1 100644 --- a/swh/model/model.py +++ b/swh/model/model.py @@ -1908,10 +1908,30 @@ class ExtID(BaseHashableModel): target = attr.ib(type=CoreSWHID, validator=generic_type_validator) extid_version = attr.ib(type=int, validator=generic_type_validator, default=0) + payload_type = attr.ib( + type=Optional[str], validator=generic_type_validator, default=None + ) + payload = attr.ib( + type=Optional[Sha1Git], + validator=generic_type_validator, + default=None, + repr=hash_repr, + ) + id = attr.ib( type=Sha1Git, validator=generic_type_validator, default=b"", repr=hash_repr ) + @payload_type.validator + def check_payload_type(self, attribute, value): + if value is not None and self.payload is None: + raise ValueError("'payload' must be set if 'payload_type' is.") + + @payload.validator + def check_payload(self, attribute, value): + if value is not None and self.payload_type is None: + raise ValueError("'payload_type' must be set if 'payload' is.") + @classmethod def from_dict(cls, d): return cls( @@ -1919,6 +1939,8 @@ class ExtID(BaseHashableModel): extid_type=d["extid_type"], target=CoreSWHID.from_string(d["target"]), extid_version=d.get("extid_version", 0), + payload_type=d.get("payload_type"), + payload=d.get("payload"), ) def _compute_hash_from_attributes(self) -> bytes: diff --git a/swh/model/tests/swh_model_data.py b/swh/model/tests/swh_model_data.py index 382b6433..4e9ec129 100644 --- a/swh/model/tests/swh_model_data.py +++ b/swh/model/tests/swh_model_data.py @@ -160,25 +160,6 @@ REVISIONS = [ ), ] -EXTIDS = [ - ExtID( - extid_type="git256", - extid=b"\x03" * 32, - target=REVISIONS[0].swhid(), - ), - ExtID( - extid_type="hg", - extid=b"\x04" * 20, - target=REVISIONS[1].swhid(), - ), - ExtID( - extid_type="hg-nodeid", - extid=b"\x05" * 20, - target=REVISIONS[1].swhid(), - extid_version=1, - ), -] - RELEASES = [ Release( id=hash_to_bytes("8059dc4e17fcd0e51ca3bcd6b80f4577d281fd08"), @@ -441,6 +422,31 @@ RAW_EXTRINSIC_METADATA = [ ), ] +EXTIDS = [ + ExtID( + extid_type="git256", + extid=b"\x03" * 32, + target=REVISIONS[0].swhid(), + ), + ExtID( + extid_type="hg", + extid=b"\x04" * 20, + target=REVISIONS[1].swhid(), + ), + ExtID( + extid_type="hg-nodeid", + extid=b"\x05" * 20, + target=REVISIONS[1].swhid(), + extid_version=1, + ), + ExtID( + extid_type="tarball-sha256", + extid=b"\x03" * 32, + target=DIRECTORIES[0].swhid(), + payload_type="disarchive", + payload=CONTENTS[0].sha1_git, + ), +] TEST_OBJECTS: Dict[str, Sequence[BaseModel]] = { "content": CONTENTS, diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py index d5a0eb30..054687af 100644 --- a/swh/model/tests/test_identifiers.py +++ b/swh/model/tests/test_identifiers.py @@ -1343,3 +1343,14 @@ def test_extid_identifier_bwcompat(): ExtID.from_dict({**extid_dict, "extid_version": 1}).id != ExtID.from_dict(extid_dict).id ) + + assert ( + ExtID.from_dict( + { + **extid_dict, + "payload_type": "test", + "payload": bytes.fromhex("257cc5642cb1a054f08cc83f2d943e56fd3ebe99"), + } + ).id + != ExtID.from_dict(extid_dict).id + ) -- GitLab