From 67fade5f674a57fd8845ad57161a86a2d898d197 Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <vlorentz@softwareheritage.org>
Date: Wed, 29 May 2019 16:15:39 +0200
Subject: [PATCH] Add origin persistent identifiers.

---
 docs/persistent-identifiers.rst     | 11 +++++++++--
 swh/model/identifiers.py            | 13 ++++++++++++-
 swh/model/tests/test_identifiers.py | 11 +++++++++++
 3 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/docs/persistent-identifiers.rst b/docs/persistent-identifiers.rst
index b0651d0a..cc194215 100644
--- a/docs/persistent-identifiers.rst
+++ b/docs/persistent-identifiers.rst
@@ -42,7 +42,8 @@ entry point of the grammar:
   <identifier> ::= "swh" ":" <scheme_version> ":" <object_type> ":" <object_id> ;
   <scheme_version> ::= "1" ;
   <object_type> ::=
-      "snp"  (* snapshot *)
+      "ori"  (* origin *)
+    | "snp"  (* snapshot *)
     | "rel"  (* release *)
     | "rev"  (* revision *)
     | "dir"  (* directory *)
@@ -66,7 +67,8 @@ identifiers that conform to previous versions of the scheme).
 A persistent identifier points to a single object, whose type is explicitly
 captured by ``<object_type>``:
 
-* ``snp`` identifiers points to **snapshots**,
+* ``ori`` identifiers point to **origins**,
+* ``snp`` to **snapshots**,
 * ``rel`` to **releases**,
 * ``rev`` to **revisions**,
 * ``dir`` to **directories**,
@@ -76,6 +78,9 @@ The actual object pointed to is identified by the intrinsic identifier
 ``<object_id>``, which is a hex-encoded (using lowercase ASCII characters) SHA1
 computed on the content and metadata of the object itself, as follows:
 
+* for **origins**, intrinsic identifiers are computed as per
+  :py:func:`swh.model.identifiers.origin_identifier`
+
 * for **snapshots**, intrinsic identifiers are computed as per
   :py:func:`swh.model.identifiers.snapshot_identifier`
 
@@ -128,6 +133,8 @@ Examples
   release 2.3.0, dated 24 December 2016
 * ``swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453`` points to a snapshot
   of the entire Darktable Git repository taken on 4 May 2017 from GitHub
+* ``swh:1:ori:b63a575fe3faab7692c9f38fb09d4bb45651bb0f`` points to the
+  repository https://github.com/torvalds/linux
 
 
 Contextual information
diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py
index 267f7a25..d8f4b287 100644
--- a/swh/model/identifiers.py
+++ b/swh/model/identifiers.py
@@ -5,6 +5,7 @@
 
 import binascii
 import datetime
+import hashlib
 
 from collections import namedtuple
 from functools import lru_cache
@@ -14,6 +15,7 @@ from .fields.hashes import validate_sha1
 from .hashutil import hash_git_data, hash_to_hex, MultiHash
 
 
+ORIGIN = 'origin'
 SNAPSHOT = 'snapshot'
 REVISION = 'revision'
 RELEASE = 'release'
@@ -597,7 +599,16 @@ def snapshot_identifier(snapshot, *, ignore_unresolved=False):
     return identifier_to_str(hash_git_data(b''.join(lines), 'snapshot'))
 
 
+def origin_identifier(origin):
+    """Return the intrinsic identifier for an origin."""
+    return hashlib.sha1(origin['url'].encode('ascii')).hexdigest()
+
+
 _object_type_map = {
+    ORIGIN: {
+        'short_name': 'ori',
+        'key_id': 'id'
+    },
     SNAPSHOT: {
         'short_name': 'snp',
         'key_id': 'id'
@@ -620,7 +631,7 @@ _object_type_map = {
     }
 }
 
-PERSISTENT_IDENTIFIER_TYPES = ['snp', 'rel', 'rev', 'dir', 'cnt']
+PERSISTENT_IDENTIFIER_TYPES = ['ori', 'snp', 'rel', 'rev', 'dir', 'cnt']
 
 PERSISTENT_IDENTIFIER_KEYS = [
     'namespace', 'scheme_version', 'object_type', 'object_id', 'metadata']
diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py
index 1492b87e..410cf402 100644
--- a/swh/model/tests/test_identifiers.py
+++ b/swh/model/tests/test_identifiers.py
@@ -893,3 +893,14 @@ class SnapshotIdentifier(unittest.TestCase):
             with self.assertRaisesRegex(
                     ValidationError, _error):
                 identifiers.parse_persistent_identifier(pid)
+
+
+class OriginIdentifier(unittest.TestCase):
+    def setUp(self):
+        self.origin = {
+            'url': 'https://github.com/torvalds/linux',
+        }
+
+    def test_content_identifier(self):
+        self.assertEqual(identifiers.origin_identifier(self.origin),
+                         'b63a575fe3faab7692c9f38fb09d4bb45651bb0f')
-- 
GitLab