From 67fade5f674a57fd8845ad57161a86a2d898d197 Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <vlorentz@softwareheritage.org>
Date: Wed, 29 May 2019 16:15:39 +0200
Subject: [PATCH] Add origin persistent identifiers.

---
Reviewer notes (this area is ignored by "git am"):
- origin_identifier() encodes the URL as UTF-8 instead of ASCII and has a
  fuller docstring; the origin test method is named test_origin_identifier.
- The blob ids on the "index" lines and the commit id on the "From" line
  are those of the originally exported commit.

 docs/persistent-identifiers.rst     | 11 +++++++++--
 swh/model/identifiers.py            | 23 ++++++++++++++++++++++-
 swh/model/tests/test_identifiers.py | 11 +++++++++++
 3 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/docs/persistent-identifiers.rst b/docs/persistent-identifiers.rst
index b0651d0a..cc194215 100644
--- a/docs/persistent-identifiers.rst
+++ b/docs/persistent-identifiers.rst
@@ -42,7 +42,8 @@ entry point of the grammar:
   <identifier> ::= "swh" ":" <scheme_version> ":" <object_type> ":" <object_id> ;
   <scheme_version> ::= "1" ;
   <object_type> ::=
-      "snp"  (* snapshot *)
+      "ori"  (* origin *)
+    | "snp"  (* snapshot *)
     | "rel"  (* release *)
     | "rev"  (* revision *)
     | "dir"  (* directory *)
@@ -66,7 +67,8 @@ identifiers that conform to previous versions of the scheme).
 A persistent identifier points to a single object, whose type is explicitly
 captured by ``<object_type>``:
 
-* ``snp`` identifiers points to **snapshots**,
+* ``ori`` identifiers point to **origins**,
+* ``snp`` to **snapshots**,
 * ``rel`` to **releases**,
 * ``rev`` to **revisions**,
 * ``dir`` to **directories**,
@@ -76,6 +78,9 @@ The actual object pointed to is identified by the intrinsic identifier
 ``<object_id>``, which is a hex-encoded (using lowercase ASCII characters) SHA1
 computed on the content and metadata of the object itself, as follows:
 
+* for **origins**, intrinsic identifiers are computed as per
+  :py:func:`swh.model.identifiers.origin_identifier`
+
 * for **snapshots**, intrinsic identifiers are computed as per
   :py:func:`swh.model.identifiers.snapshot_identifier`
 
@@ -128,6 +133,8 @@ Examples
   release 2.3.0, dated 24 December 2016
 * ``swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453`` points to a snapshot
   of the entire Darktable Git repository taken on 4 May 2017 from GitHub
+* ``swh:1:ori:b63a575fe3faab7692c9f38fb09d4bb45651bb0f`` points to the
+  repository https://github.com/torvalds/linux .
 
 
 Contextual information
diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py
index 267f7a25..d8f4b287 100644
--- a/swh/model/identifiers.py
+++ b/swh/model/identifiers.py
@@ -5,6 +5,7 @@
 
 import binascii
 import datetime
+import hashlib
 
 from collections import namedtuple
 from functools import lru_cache
@@ -14,6 +15,7 @@ from .fields.hashes import validate_sha1
 from .hashutil import hash_git_data, hash_to_hex, MultiHash
 
 
+ORIGIN = 'origin'
 SNAPSHOT = 'snapshot'
 REVISION = 'revision'
 RELEASE = 'release'
@@ -597,7 +599,26 @@ def snapshot_identifier(snapshot, *, ignore_unresolved=False):
     return identifier_to_str(hash_git_data(b''.join(lines), 'snapshot'))
 
 
+def origin_identifier(origin):
+    """Return the intrinsic identifier for an origin.
+
+    Args:
+        origin (dict): origin description; only its ``url`` key is used.
+
+    Returns:
+        str: hex-encoded SHA1 checksum of the UTF-8 encoded origin URL.
+
+    """
+    # Encode as UTF-8 rather than ASCII: ASCII-only URLs hash to the same
+    # value either way, and non-ASCII URLs do not raise UnicodeEncodeError.
+    return hashlib.sha1(origin['url'].encode('utf-8')).hexdigest()
+
+
 _object_type_map = {
+    ORIGIN: {
+        'short_name': 'ori',
+        'key_id': 'id'
+    },
     SNAPSHOT: {
         'short_name': 'snp',
         'key_id': 'id'
@@ -620,7 +641,7 @@ _object_type_map = {
     }
 }
 
-PERSISTENT_IDENTIFIER_TYPES = ['snp', 'rel', 'rev', 'dir', 'cnt']
+PERSISTENT_IDENTIFIER_TYPES = ['ori', 'snp', 'rel', 'rev', 'dir', 'cnt']
 
 PERSISTENT_IDENTIFIER_KEYS = [
     'namespace', 'scheme_version', 'object_type', 'object_id', 'metadata']
diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py
index 1492b87e..410cf402 100644
--- a/swh/model/tests/test_identifiers.py
+++ b/swh/model/tests/test_identifiers.py
@@ -893,3 +893,14 @@ class SnapshotIdentifier(unittest.TestCase):
             with self.assertRaisesRegex(
                     ValidationError, _error):
                 identifiers.parse_persistent_identifier(pid)
+
+
+class OriginIdentifier(unittest.TestCase):
+    def setUp(self):
+        self.origin = {
+            'url': 'https://github.com/torvalds/linux',
+        }
+
+    def test_origin_identifier(self):
+        self.assertEqual(identifiers.origin_identifier(self.origin),
+                         'b63a575fe3faab7692c9f38fb09d4bb45651bb0f')
-- 
GitLab