From 1b1cc8d521f16d1ed5faa4882ffc7514d508dc56 Mon Sep 17 00:00:00 2001
From: Nicolas Dandrimont <nicolas@dandrimont.eu>
Date: Tue, 12 Dec 2017 20:08:47 +0100
Subject: [PATCH] hashutil: add `snapshot` object type for git hashes

Summary:
Add support for snapshot identifiers

Close T566.
Related to D268.

Test Plan: Unit tests included

Reviewers: zack, #reviewers!

Maniphest Tasks: T566

Differential Revision: https://forge.softwareheritage.org/D277
---
 swh/model/identifiers.py            | 85 +++++++++++++++++++++++++++
 swh/model/tests/test_identifiers.py | 89 +++++++++++++++++++++++++++++
 2 files changed, 174 insertions(+)

diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py
index c7a6ce96..b4ec15dd 100644
--- a/swh/model/identifiers.py
+++ b/swh/model/identifiers.py
@@ -499,3 +499,88 @@ def release_identifier(release):
         components.extend([b'\n', release['message']])
 
     return identifier_to_str(hash_git_data(b''.join(components), 'tag'))
+
+
+def snapshot_identifier(snapshot, *, ignore_unresolved=False):
+    """Return the intrinsic identifier for a snapshot.
+
+    A snapshot is a set of named branches, which are pointers to objects at any
+    level of the Software Heritage DAG.
+
+    As well as pointing to other objects in the Software Heritage DAG, branches
+    can also be *alias*es, in which case their target is the name of another
+    branch in the same snapshot, or *dangling*, in which case the target is
+    unknown (and represented by the ``None`` value).
+
+    A snapshot identifier is a salted sha1 (using the git hashing algorithm
+    with the ``snapshot`` object type) of a manifest following the algorithm:
+
+    1. Branches are sorted using the name as key, in bytes order.
+
+    2. For each branch, the following bytes are output:
+
+      - the type of the branch target:
+
+        - ``content``, ``directory``, ``revision``, ``release`` or ``snapshot``
+          for the corresponding entries in the DAG;
+        - ``alias`` for branches referencing another branch;
+        - ``dangling`` for dangling branches
+
+      - an ascii space (``\\x20``)
+      - the branch name (as raw bytes)
+      - a null byte (``\\x00``)
+      - the length of the target identifier, as an ascii-encoded decimal number
+        (``20`` for current intrinsic identifiers, ``0`` for dangling
+        branches, the length of the target branch name for branch aliases)
+      - a colon (``:``)
+      - the identifier of the target object pointed at by the branch,
+        stored in the 'target' member:
+
+        - for contents: their *sha1_git*
+        - for directories, revisions, releases or snapshots: their intrinsic
+          identifier
+        - for branch aliases, the name of the target branch (as raw bytes)
+        - for dangling branches, the empty string
+
+      Note that, akin to directory manifests, there is no separator between
+      entries. Because of branch aliases, target identifiers are of arbitrary
+      length but are length-encoded to avoid ambiguity.
+
+    Args:
+      snapshot (dict): the snapshot of which to compute the identifier. A
+        single entry is needed, ``'branches'``, which is itself a :class:`dict`
+        mapping each branch to its target
+      ignore_unresolved (bool): if `True`, ignore unresolved branch aliases.
+
+    Returns:
+      str: the intrinsic identifier for `snapshot`
+
+    """
+
+    unresolved = []
+    lines = []
+
+    for name, target in sorted(snapshot['branches'].items()):
+        if not target:
+            target_type = b'dangling'
+            target_id = b''
+        elif target['target_type'] == 'alias':
+            target_type = b'alias'
+            target_id = target['target']
+            if target_id not in snapshot['branches'] or target_id == name:
+                unresolved.append((name, target_id))
+        else:
+            target_type = target['target_type'].encode()
+            target_id = identifier_to_bytes(target['target'])
+
+        lines.extend([
+            target_type, b'\x20', name, b'\x00',
+            ('%d:' % len(target_id)).encode(), target_id,
+        ])
+
+    if unresolved and not ignore_unresolved:
+        raise ValueError('Branch aliases unresolved: %s' %
+                         ', '.join('%s -> %s' % (name, target)
+                                   for name, target in unresolved))
+
+    return identifier_to_str(hash_git_data(b''.join(lines), 'snapshot'))
diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py
index 755dba60..4a56b0c2 100644
--- a/swh/model/tests/test_identifiers.py
+++ b/swh/model/tests/test_identifiers.py
@@ -679,3 +679,92 @@ o6X/3T+vm8K3bf3driRr34c=
             identifiers.release_identifier(self.release_newline_in_author),
             identifiers.identifier_to_str(self.release_newline_in_author['id'])
         )
+
+
+class SnapshotIdentifier(unittest.TestCase):
+    def setUp(self):
+        super().setUp()
+
+        self.empty = {
+            'id': '1a8893e6a86f444e8be8e7bda6cb34fb1735a00e',
+            'branches': {},
+        }
+
+        self.dangling_branch = {
+            'id': 'c84502e821eb21ed84e9fd3ec40973abc8b32353',
+            'branches': {
+                b'HEAD': None,
+            },
+        }
+
+        self.unresolved = {
+            'id': '84b4548ea486e4b0a7933fa541ff1503a0afe1e0',
+            'branches': {
+                b'foo': {
+                    'target': b'bar',
+                    'target_type': 'alias',
+                },
+            },
+        }
+
+        self.all_types = {
+            'id': '6e65b86363953b780d92b0a928f3e8fcdd10db36',
+            'branches': {
+                b'directory': {
+                    'target': '1bd0e65f7d2ff14ae994de17a1e7fe65111dcad8',
+                    'target_type': 'directory',
+                },
+                b'content': {
+                    'target': 'fe95a46679d128ff167b7c55df5d02356c5a1ae1',
+                    'target_type': 'content',
+                },
+                b'alias': {
+                    'target': b'revision',
+                    'target_type': 'alias',
+                },
+                b'revision': {
+                    'target': 'aafb16d69fd30ff58afdd69036a26047f3aebdc6',
+                    'target_type': 'revision',
+                },
+                b'release': {
+                    'target': '7045404f3d1c54e6473c71bbb716529fbad4be24',
+                    'target_type': 'release',
+                },
+                b'snapshot': {
+                    'target': '1a8893e6a86f444e8be8e7bda6cb34fb1735a00e',
+                    'target_type': 'snapshot',
+                },
+                b'dangling': None,
+            }
+        }
+
+    def test_empty_snapshot(self):
+        self.assertEqual(
+            identifiers.snapshot_identifier(self.empty),
+            identifiers.identifier_to_str(self.empty['id']),
+        )
+
+    def test_dangling_branch(self):
+        self.assertEqual(
+            identifiers.snapshot_identifier(self.dangling_branch),
+            identifiers.identifier_to_str(self.dangling_branch['id']),
+        )
+
+    def test_unresolved(self):
+        with self.assertRaisesRegex(ValueError, "b'foo' -> b'bar'"):
+            identifiers.snapshot_identifier(self.unresolved)
+
+    def test_unresolved_force(self):
+        self.assertEqual(
+            identifiers.snapshot_identifier(
+                self.unresolved,
+                ignore_unresolved=True,
+            ),
+            identifiers.identifier_to_str(self.unresolved['id']),
+        )
+
+    def test_all_types(self):
+        self.assertEqual(
+            identifiers.snapshot_identifier(self.all_types),
+            identifiers.identifier_to_str(self.all_types['id']),
+        )
-- 
GitLab