From f6eab95253f13f28fe4d4652fc471e3e8a0b5565 Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <>
Date: Mon, 25 Jan 2021 12:31:12 +0100
Subject: [PATCH] identifiers: Add raw_extrinsic_metadata_identifier

This will be used to compute an intrinsic identifier for RawExtrinsicMetadata,
which can be used for deduplication and for referring to it like any other
sha1_git, instead of needing to use a tuple of its fields.
 swh/model/               |   9 ++-
 swh/model/            |  79 +++++++++++++++++++
 swh/model/tests/ | 115 ++++++++++++++++++++++++++++
 3 files changed, 202 insertions(+), 1 deletion(-)

diff --git a/swh/model/ b/swh/model/
index cec87789..908b736b 100644
--- a/swh/model/
+++ b/swh/model/
@@ -289,7 +289,14 @@ def hash_git_data(data, git_type, base_algo="sha1"):
         ValueError if the git_type is unexpected.
-    git_object_types = {"blob", "tree", "commit", "tag", "snapshot"}
+    git_object_types = {
+        "blob",
+        "tree",
+        "commit",
+        "tag",
+        "snapshot",
+        "raw_extrinsic_metadata",
+    }
     if git_type not in git_object_types:
         raise ValueError(
diff --git a/swh/model/ b/swh/model/
index a07b047e..dce259ed 100644
--- a/swh/model/
+++ b/swh/model/
@@ -724,6 +724,85 @@ def origin_identifier(origin):
     return hashlib.sha1(origin["url"].encode("utf-8")).hexdigest()
+def raw_extrinsic_metadata_identifier(metadata: Dict[str, Any]) -> str:
+    """Return the intrinsic identifier for a RawExtrinsicMetadata object.
+    A raw_extrinsic_metadata identifier is a salted sha1 (using the git
+    hashing algorithm with the ``raw_extrinsic_metadata`` object type) of
+    a manifest following the format:
+    ```
+    target $ExtendedSwhid
+    discovery_date $ISO8601
+    authority $StrWithoutSpaces $IRI
+    fetcher $Str $Version
+    format $StrWithoutSpaces
+    origin $IRI                         <- optional
+    visit $IntInDecimal                 <- optional
+    snapshot $CoreSwhid                 <- optional
+    release $CoreSwhid                  <- optional
+    revision $CoreSwhid                 <- optional
+    path $Bytes                         <- optional
+    directory $CoreSwhid                <- optional
+    $MetadataBytes
+    ```
+    $IRI values must be RFC 3987 IRIs (so they may contain newlines, which are
+    escaped as described below)
+    $StrWithoutSpaces and $Version are ASCII strings, and may not contain spaces.
+    $Str is a UTF-8 string.
+    $CoreSwhid are core SWHIDs, as defined in :ref:`persistent-identifiers`.
+    $ExtendedSwhid is a core SWHID, with extra types allowed ('ori' for
+    origins and 'emd' for raw extrinsic metadata)
+    Newlines in $Bytes, $Str, and $IRI are escaped as with other git fields,
+    i.e. by adding a space after them.
+    Returns:
+      str: the intrinsic identifier for `metadata`
+    """
+    headers = [
+        (b"target", str(metadata["target"]).encode()),
+        (b"discovery_date", metadata["discovery_date"].isoformat().encode("ascii")),
+        (
+            b"authority",
+            f"{metadata['authority']['type']} {metadata['authority']['url']}".encode(),
+        ),
+        (
+            b"fetcher",
+            f"{metadata['fetcher']['name']} {metadata['fetcher']['version']}".encode(),
+        ),
+        (b"format", metadata["format"].encode()),
+    ]
+    for key in (
+        "origin",
+        "visit",
+        "snapshot",
+        "release",
+        "revision",
+        "path",
+        "directory",
+    ):
+        if metadata.get(key) is not None:
+            value: bytes
+            if key == "path":
+                value = metadata[key]
+            else:
+                value = str(metadata[key]).encode()
+            headers.append((key.encode("ascii"), value))
+    return identifier_to_str(
+        hash_manifest("raw_extrinsic_metadata", headers, metadata["metadata"])
+    )
 # type of the "object_type" attribute of the SWHID class; either
 # ObjectType or ExtendedObjectType
 _TObjectType = TypeVar("_TObjectType", ObjectType, ExtendedObjectType)
diff --git a/swh/model/tests/ b/swh/model/tests/
index 38d7e357..8a0c4b81 100644
--- a/swh/model/tests/
+++ b/swh/model/tests/
@@ -5,6 +5,7 @@
 import binascii
 import datetime
+import hashlib
 import itertools
 from typing import Dict
 import unittest
@@ -767,6 +768,120 @@ class SnapshotIdentifier(unittest.TestCase):
+class RawExtrinsicMetadataIdentifier(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+        self.authority = {
+            "type": "forge",
+            "url": "",
+        }
+        self.fetcher = {
+            "name": "swh-phabricator-metadata-fetcher",
+            "version": "0.0.1",
+        }
+        self.minimal = {
+            "type": "content",
+            "target": ExtendedSWHID.from_string(
+                "swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d"
+            ),
+            "discovery_date": datetime.datetime(
+                2021, 1, 25, 11, 27, 51, tzinfo=datetime.timezone.utc
+            ),
+            "authority": self.authority,
+            "fetcher": self.fetcher,
+            "format": "json",
+            "metadata": b'{"foo": "bar"}',
+        }
+        self.maximal = {
+            **self.minimal,
+            "origin": "",
+            "visit": 42,
+            "snapshot": CoreSWHID.from_string("swh:1:snp:" + "00" * 20),
+            "release": CoreSWHID.from_string("swh:1:rel:" + "01" * 20),
+            "revision": CoreSWHID.from_string("swh:1:rev:" + "02" * 20),
+            "path": b"/abc/def",
+            "directory": CoreSWHID.from_string("swh:1:dir:" + "03" * 20),
+        }
+    def test_minimal(self):
+        manifest = (
+            b"raw_extrinsic_metadata 225\0"
+            b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
+            b"discovery_date 2021-01-25T11:27:51+00:00\n"
+            b"authority forge\n"
+            b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n"
+            b"format json\n"
+            b"\n"
+            b'{"foo": "bar"}'
+        )
+        self.assertEqual(
+            identifiers.raw_extrinsic_metadata_identifier(self.minimal),
+            hashlib.sha1(manifest).hexdigest(),
+        )
+        self.assertEqual(
+            identifiers.raw_extrinsic_metadata_identifier(self.minimal),
+            "df16b5ea35b12f530fb7ecd0eb10b87a8b1fc3d2",
+        )
+    def test_maximal(self):
+        manifest = (
+            b"raw_extrinsic_metadata 548\0"
+            b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
+            b"discovery_date 2021-01-25T11:27:51+00:00\n"
+            b"authority forge\n"
+            b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n"
+            b"format json\n"
+            b"origin\n"
+            b"visit 42\n"
+            b"snapshot swh:1:snp:0000000000000000000000000000000000000000\n"
+            b"release swh:1:rel:0101010101010101010101010101010101010101\n"
+            b"revision swh:1:rev:0202020202020202020202020202020202020202\n"
+            b"path /abc/def\n"
+            b"directory swh:1:dir:0303030303030303030303030303030303030303\n"
+            b"\n"
+            b'{"foo": "bar"}'
+        )
+        self.assertEqual(
+            identifiers.raw_extrinsic_metadata_identifier(self.maximal),
+            hashlib.sha1(manifest).hexdigest(),
+        )
+        self.assertEqual(
+            identifiers.raw_extrinsic_metadata_identifier(self.maximal),
+            "55563d91a3f9cb41aa36c60c2b518433bf318ae4",
+        )
+    def test_nonascii_path(self):
+        metadata = {
+            **self.minimal,
+            "path": b"/ab\nc/d\xf0\x9f\xa4\xb7e\x00f",
+        }
+        manifest = (
+            b"raw_extrinsic_metadata 246\0"
+            b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
+            b"discovery_date 2021-01-25T11:27:51+00:00\n"
+            b"authority forge\n"
+            b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n"
+            b"format json\n"
+            b"path /ab\n"
+            b" c/d\xf0\x9f\xa4\xb7e\x00f\n"
+            b"\n"
+            b'{"foo": "bar"}'
+        )
+        self.assertEqual(
+            identifiers.raw_extrinsic_metadata_identifier(metadata),
+            hashlib.sha1(manifest).hexdigest(),
+        )
+        self.assertEqual(
+            identifiers.raw_extrinsic_metadata_identifier(metadata),
+            "d8e5856601cdae96dfdfb5147235895949c9322d",
+        )
 origin_example = {
     "url": "",