Skip to content
Snippets Groups Projects
Commit f6eab952 authored by vlorentz's avatar vlorentz
Browse files

identifiers: Add raw_extrinsic_metadata_identifier

This will be used to compute an intrinsic identifier for RawExtrinsicMetadata,
which can be used for deduplication and for referring to it like any other sha1_git
instead of needing to use a tuple of its fields.
parent bf4ab433
No related branches found
No related tags found
No related merge requests found
......@@ -289,7 +289,14 @@ def hash_git_data(data, git_type, base_algo="sha1"):
ValueError if the git_type is unexpected.
"""
git_object_types = {"blob", "tree", "commit", "tag", "snapshot"}
git_object_types = {
"blob",
"tree",
"commit",
"tag",
"snapshot",
"raw_extrinsic_metadata",
}
if git_type not in git_object_types:
raise ValueError(
......
......@@ -724,6 +724,85 @@ def origin_identifier(origin):
return hashlib.sha1(origin["url"].encode("utf-8")).hexdigest()
def raw_extrinsic_metadata_identifier(metadata: Dict[str, Any]) -> str:
    """Compute the intrinsic identifier of a RawExtrinsicMetadata object.

    The identifier is a salted sha1 (git hashing algorithm, with
    ``raw_extrinsic_metadata`` as the object type) of a manifest laid out
    as follows::

        target $ExtendedSwhid
        discovery_date $ISO8601
        authority $StrWithoutSpaces $IRI
        fetcher $Str $Version
        format $StrWithoutSpaces
        origin $IRI                         <- optional
        visit $IntInDecimal                 <- optional
        snapshot $CoreSwhid                 <- optional
        release $CoreSwhid                  <- optional
        revision $CoreSwhid                 <- optional
        path $Bytes                         <- optional
        directory $CoreSwhid                <- optional

        $MetadataBytes

    Field grammar:

    * $IRI must be RFC 3987 IRIs (so they may contain newlines, which are
      escaped as described below);
    * $StrWithoutSpaces and $Version are ASCII strings, and may not contain
      spaces;
    * $Str is a UTF-8 string;
    * $CoreSwhid are core SWHIDs, as defined in :ref:`persistent-identifiers`;
    * $ExtendedSwhid is a core SWHID, with extra types allowed ('ori' for
      origins and 'emd' for raw extrinsic metadata).

    Newlines in $Bytes, $Str, and $IRI are escaped as with other git fields,
    i.e. by adding a space after them.

    Args:
        metadata: dict form of the object. The ``target``,
            ``discovery_date``, ``authority`` (``type`` and ``url``),
            ``fetcher`` (``name`` and ``version``), ``format`` and
            ``metadata`` keys are always read; the context keys listed as
            optional above are only emitted when present and not ``None``.

    Returns:
        str: the intrinsic identifier for `metadata`
    """
    # Mandatory headers, in manifest order.
    target_field = str(metadata["target"]).encode()
    date_field = metadata["discovery_date"].isoformat().encode("ascii")

    authority = metadata["authority"]
    authority_field = f"{authority['type']} {authority['url']}".encode()

    fetcher = metadata["fetcher"]
    fetcher_field = f"{fetcher['name']} {fetcher['version']}".encode()

    headers = [
        (b"target", target_field),
        (b"discovery_date", date_field),
        (b"authority", authority_field),
        (b"fetcher", fetcher_field),
        (b"format", metadata["format"].encode()),
    ]

    # Optional context headers; the tuple order is the manifest order.
    context_keys = (
        "origin",
        "visit",
        "snapshot",
        "release",
        "revision",
        "path",
        "directory",
    )
    for name in context_keys:
        raw = metadata.get(name)
        if raw is None:
            continue
        # "path" is passed through untouched (the manifest format declares
        # it as $Bytes); every other context value is stringified first.
        encoded: bytes = raw if name == "path" else str(raw).encode()
        headers.append((name.encode("ascii"), encoded))

    digest = hash_manifest("raw_extrinsic_metadata", headers, metadata["metadata"])
    return identifier_to_str(digest)
# Type variable for the "object_type" attribute of the SWHID class.
# It is a constrained TypeVar: it can only be bound to ObjectType or to
# ExtendedObjectType.
_TObjectType = TypeVar("_TObjectType", ObjectType, ExtendedObjectType)
......
......@@ -5,6 +5,7 @@
import binascii
import datetime
import hashlib
import itertools
from typing import Dict
import unittest
......@@ -767,6 +768,120 @@ class SnapshotIdentifier(unittest.TestCase):
)
class RawExtrinsicMetadataIdentifier(unittest.TestCase):
    """Tests for ``identifiers.raw_extrinsic_metadata_identifier``.

    Every test checks the computed identifier twice: against the sha1 of a
    hand-written manifest, and against a pinned hexadecimal digest.
    """

    def setUp(self):
        """Build the ``minimal`` and ``maximal`` metadata fixtures."""
        super().setUp()

        self.authority = {
            "type": "forge",
            "url": "https://forge.softwareheritage.org/",
        }
        self.fetcher = {
            "name": "swh-phabricator-metadata-fetcher",
            "version": "0.0.1",
        }

        # Only the mandatory fields.
        target = ExtendedSWHID.from_string(
            "swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d"
        )
        discovery_date = datetime.datetime(
            2021, 1, 25, 11, 27, 51, tzinfo=datetime.timezone.utc
        )
        self.minimal = {
            "type": "content",
            "target": target,
            "discovery_date": discovery_date,
            "authority": self.authority,
            "fetcher": self.fetcher,
            "format": "json",
            "metadata": b'{"foo": "bar"}',
        }

        # Mandatory fields plus every optional context field.
        maximal = dict(self.minimal)
        maximal.update(
            {
                "origin": "https://forge.softwareheritage.org/source/swh-model/",
                "visit": 42,
                "snapshot": CoreSWHID.from_string("swh:1:snp:" + "00" * 20),
                "release": CoreSWHID.from_string("swh:1:rel:" + "01" * 20),
                "revision": CoreSWHID.from_string("swh:1:rev:" + "02" * 20),
                "path": b"/abc/def",
                "directory": CoreSWHID.from_string("swh:1:dir:" + "03" * 20),
            }
        )
        self.maximal = maximal

    def _assert_identifier(self, metadata, manifest, expected_hex):
        """Check the identifier of ``metadata`` against both expectations.

        The identifier is recomputed for each assertion: it is first compared
        with the sha1 of ``manifest``, then with the pinned ``expected_hex``.
        """
        self.assertEqual(
            identifiers.raw_extrinsic_metadata_identifier(metadata),
            hashlib.sha1(manifest).hexdigest(),
        )
        self.assertEqual(
            identifiers.raw_extrinsic_metadata_identifier(metadata),
            expected_hex,
        )

    def test_minimal(self):
        """Only the mandatory headers appear in the manifest."""
        # 225 is the byte length of everything that follows the NUL byte.
        manifest = (
            b"raw_extrinsic_metadata 225\0"
            b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
            b"discovery_date 2021-01-25T11:27:51+00:00\n"
            b"authority forge https://forge.softwareheritage.org/\n"
            b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n"
            b"format json\n"
            b"\n"
            b'{"foo": "bar"}'
        )

        self._assert_identifier(
            self.minimal, manifest, "df16b5ea35b12f530fb7ecd0eb10b87a8b1fc3d2"
        )

    def test_maximal(self):
        """All optional context headers follow the mandatory ones."""
        manifest = (
            b"raw_extrinsic_metadata 548\0"
            b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
            b"discovery_date 2021-01-25T11:27:51+00:00\n"
            b"authority forge https://forge.softwareheritage.org/\n"
            b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n"
            b"format json\n"
            b"origin https://forge.softwareheritage.org/source/swh-model/\n"
            b"visit 42\n"
            b"snapshot swh:1:snp:0000000000000000000000000000000000000000\n"
            b"release swh:1:rel:0101010101010101010101010101010101010101\n"
            b"revision swh:1:rev:0202020202020202020202020202020202020202\n"
            b"path /abc/def\n"
            b"directory swh:1:dir:0303030303030303030303030303030303030303\n"
            b"\n"
            b'{"foo": "bar"}'
        )

        self._assert_identifier(
            self.maximal, manifest, "55563d91a3f9cb41aa36c60c2b518433bf318ae4"
        )

    def test_nonascii_path(self):
        """A path holding a newline, non-ASCII bytes and a NUL byte.

        In the expected manifest the newline inside the path is followed by
        a space; the other bytes appear unchanged.
        """
        metadata = dict(self.minimal)
        metadata["path"] = b"/ab\nc/d\xf0\x9f\xa4\xb7e\x00f"

        manifest = (
            b"raw_extrinsic_metadata 246\0"
            b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
            b"discovery_date 2021-01-25T11:27:51+00:00\n"
            b"authority forge https://forge.softwareheritage.org/\n"
            b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n"
            b"format json\n"
            b"path /ab\n"
            b" c/d\xf0\x9f\xa4\xb7e\x00f\n"
            b"\n"
            b'{"foo": "bar"}'
        )

        self._assert_identifier(
            metadata, manifest, "d8e5856601cdae96dfdfb5147235895949c9322d"
        )
# Sample origin dict carrying only a "url" key.
# NOTE(review): presumably the fixture for origin identifier tests defined
# further down; its users are not visible in this hunk, so confirm.
origin_example = {
"url": "https://github.com/torvalds/linux",
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment