diff --git a/PKG-INFO b/PKG-INFO index 71abcc9579b2eabbda54cdb39a9597eaeef27372..a1069b6cee2c2c7b8c0710cbcddc2a2bfd3dc243 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 1.0 Name: swh.model -Version: 0.0.19 +Version: 0.0.20 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers diff --git a/docs/index.rst b/docs/index.rst index 22eccfdcec5757904f4fb768d85c504a96acaa69..db680710600a20d9a7d119477bf8798f968fa46b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,3 +1,5 @@ +.. _swh-model: + Software Heritage - Development Documentation ============================================= diff --git a/swh.model.egg-info/PKG-INFO b/swh.model.egg-info/PKG-INFO index 71abcc9579b2eabbda54cdb39a9597eaeef27372..a1069b6cee2c2c7b8c0710cbcddc2a2bfd3dc243 100644 --- a/swh.model.egg-info/PKG-INFO +++ b/swh.model.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 1.0 Name: swh.model -Version: 0.0.19 +Version: 0.0.20 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py index 4d6f9dba2088d6a0ceb8e489fe1587ba53c69d0c..96905d956f1dc2d2bbe0833a4a23f6009dccab06 100644 --- a/swh/model/hashutil.py +++ b/swh/model/hashutil.py @@ -167,12 +167,13 @@ def hash_path(path, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None): return hash -def hash_data(data, algorithms=DEFAULT_ALGORITHMS): +def hash_data(data, algorithms=DEFAULT_ALGORITHMS, with_length=False): """Hash the given binary blob with the given algorithms. 
Args: - data: a bytes object - algorithms: the hashing algorithms used + data (bytes): raw content to hash + algorithms (list): the hashing algorithms used + with_length (bool): add the length key in the resulting dict Returns: a dict mapping each algorithm to a bytes digest @@ -181,7 +182,11 @@ def hash_data(data, algorithms=DEFAULT_ALGORITHMS): ValueError if algorithms contains an unknown hash algorithm. """ fobj = BytesIO(data) - return hash_file(fobj, len(data), algorithms) + length = len(data) + data = hash_file(fobj, length, algorithms) + if with_length: + data['length'] = length + return data def hash_git_data(data, git_type, base_algo='sha1'): @@ -198,7 +203,7 @@ def hash_git_data(data, git_type, base_algo='sha1'): ValueError if the git_type is unexpected. """ - git_object_types = {'blob', 'tree', 'commit', 'tag'} + git_object_types = {'blob', 'tree', 'commit', 'tag', 'snapshot'} if git_type not in git_object_types: raise ValueError('Unexpected git object type %s, expected one of %s' % diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py index c7a6ce96da6af54f1d19b357733a418730287cb9..b4ec15dd4ee13320e7d56b03e6244efc428a2bba 100644 --- a/swh/model/identifiers.py +++ b/swh/model/identifiers.py @@ -499,3 +499,88 @@ def release_identifier(release): components.extend([b'\n', release['message']]) return identifier_to_str(hash_git_data(b''.join(components), 'tag')) + + +def snapshot_identifier(snapshot, *, ignore_unresolved=False): + """Return the intrinsic identifier for a snapshot. + + Snapshots are a set of named branches, which are pointers to objects at any + level of the Software Heritage DAG. + + As well as pointing to other objects in the Software Heritage DAG, branches + can also be *alias*es, in which case their target is the name of another + branch in the same snapshot, or *dangling*, in which case the target is + unknown (and represented by the ``None`` value). 
+ + A snapshot identifier is a salted sha1 (using the git hashing algorithm + with the ``snapshot`` object type) of a manifest following the algorithm: + + 1. Branches are sorted using the name as key, in bytes order. + + 2. For each branch, the following bytes are output: + + - the type of the branch target: + + - ``content``, ``directory``, ``revision``, ``release`` or ``snapshot`` + for the corresponding entries in the DAG; + - ``alias`` for branches referencing another branch; + - ``dangling`` for dangling branches + + - an ascii space (``\\x20``) + - the branch name (as raw bytes) + - a null byte (``\\x00``) + - the length of the target identifier, as an ascii-encoded decimal number + (``20`` for current intrinsic identifiers, ``0`` for dangling + branches, the length of the target branch name for branch aliases) + - a colon (``:``) + - the identifier of the target object pointed at by the branch, + stored in the 'target' member: + + - for contents: their *sha1_git* + - for directories, revisions, releases or snapshots: their intrinsic + identifier + - for branch aliases, the name of the target branch (as raw bytes) + - for dangling branches, the empty string + + Note that, akin to directory manifests, there is no separator between + entries. Because of symbolic branches, identifiers are of arbitrary + length but are length-encoded to avoid ambiguity. + + Args: + snapshot (dict): the snapshot of which to compute the identifier. A + single entry is needed, ``'branches'``, which is itself a :class:`dict` + mapping each branch to its target + ignore_unresolved (bool): if `True`, ignore unresolved branch aliases. 
+ + Returns: + str: the intrinsic identifier for `snapshot` + + """ + + unresolved = [] + lines = [] + + for name, target in sorted(snapshot['branches'].items()): + if not target: + target_type = b'dangling' + target_id = b'' + elif target['target_type'] == 'alias': + target_type = b'alias' + target_id = target['target'] + if target_id not in snapshot['branches'] or target_id == name: + unresolved.append((name, target_id)) + else: + target_type = target['target_type'].encode() + target_id = identifier_to_bytes(target['target']) + + lines.extend([ + target_type, b'\x20', name, b'\x00', + ('%d:' % len(target_id)).encode(), target_id, + ]) + + if unresolved and not ignore_unresolved: + raise ValueError('Branch aliases unresolved: %s' % + ', '.join('%s -> %s' % (name, target) + for name, target in unresolved)) + + return identifier_to_str(hash_git_data(b''.join(lines), 'snapshot')) diff --git a/swh/model/tests/test_hashutil.py b/swh/model/tests/test_hashutil.py index 1a4f24a606fc0cda16b4fc3df92ba49799b3145e..8b883f16bf033376c1b8c7c91fb18e38b9b1b56c 100644 --- a/swh/model/tests/test_hashutil.py +++ b/swh/model/tests/test_hashutil.py @@ -47,6 +47,17 @@ class Hashutil(unittest.TestCase): def hash_data(self): checksums = hashutil.hash_data(self.data) self.assertEqual(checksums, self.checksums) + self.assertFalse('length' in checksums) + + @istest + def hash_data_with_length(self): + expected_checksums = self.checksums.copy() + expected_checksums['length'] = len(self.data) + + checksums = hashutil.hash_data(self.data, with_length=True) + + self.assertEqual(checksums, expected_checksums) + self.assertTrue('length' in checksums) @istest def hash_data_unknown_hash(self): diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py index 755dba60906987d0a80616c6999864621ebaeac1..4a56b0c26434d86c5a263d20b042a15fc3c09baf 100644 --- a/swh/model/tests/test_identifiers.py +++ b/swh/model/tests/test_identifiers.py @@ -679,3 +679,92 @@ 
o6X/3T+vm8K3bf3driRr34c= identifiers.release_identifier(self.release_newline_in_author), identifiers.identifier_to_str(self.release_newline_in_author['id']) ) + + +class SnapshotIdentifier(unittest.TestCase): + def setUp(self): + super().setUp() + + self.empty = { + 'id': '1a8893e6a86f444e8be8e7bda6cb34fb1735a00e', + 'branches': {}, + } + + self.dangling_branch = { + 'id': 'c84502e821eb21ed84e9fd3ec40973abc8b32353', + 'branches': { + b'HEAD': None, + }, + } + + self.unresolved = { + 'id': '84b4548ea486e4b0a7933fa541ff1503a0afe1e0', + 'branches': { + b'foo': { + 'target': b'bar', + 'target_type': 'alias', + }, + }, + } + + self.all_types = { + 'id': '6e65b86363953b780d92b0a928f3e8fcdd10db36', + 'branches': { + b'directory': { + 'target': '1bd0e65f7d2ff14ae994de17a1e7fe65111dcad8', + 'target_type': 'directory', + }, + b'content': { + 'target': 'fe95a46679d128ff167b7c55df5d02356c5a1ae1', + 'target_type': 'content', + }, + b'alias': { + 'target': b'revision', + 'target_type': 'alias', + }, + b'revision': { + 'target': 'aafb16d69fd30ff58afdd69036a26047f3aebdc6', + 'target_type': 'revision', + }, + b'release': { + 'target': '7045404f3d1c54e6473c71bbb716529fbad4be24', + 'target_type': 'release', + }, + b'snapshot': { + 'target': '1a8893e6a86f444e8be8e7bda6cb34fb1735a00e', + 'target_type': 'snapshot', + }, + b'dangling': None, + } + } + + def test_empty_snapshot(self): + self.assertEqual( + identifiers.snapshot_identifier(self.empty), + identifiers.identifier_to_str(self.empty['id']), + ) + + def test_dangling_branch(self): + self.assertEqual( + identifiers.snapshot_identifier(self.dangling_branch), + identifiers.identifier_to_str(self.dangling_branch['id']), + ) + + def test_unresolved(self): + with self.assertRaisesRegex(ValueError, "b'foo' -> b'bar'"): + identifiers.snapshot_identifier(self.unresolved) + + def test_unresolved_force(self): + self.assertEqual( + identifiers.snapshot_identifier( + self.unresolved, + ignore_unresolved=True, + ), + 
identifiers.identifier_to_str(self.unresolved['id']), + ) + + def test_all_types(self): + self.assertEqual( + identifiers.snapshot_identifier(self.all_types), + identifiers.identifier_to_str(self.all_types['id']), + ) diff --git a/version.txt b/version.txt index b5266cae00ec939c83cf0e0c685ea1dd4bccf081..7b21cbd42663afc173ab30f0e9bacfdc715c6e05 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.19-0-g0b7f217 \ No newline at end of file +v0.0.20-0-g91d74ef \ No newline at end of file