Skip to content
Snippets Groups Projects
Commit eff2692a authored by Nicolas Dandrimont
Browse files

Merge branch 'wip/snapshots'

parents 94bd8dd5 1b1cc8d5
No related branches found
Tags v0.0.19
No related merge requests found
......@@ -198,7 +198,7 @@ def hash_git_data(data, git_type, base_algo='sha1'):
ValueError if the git_type is unexpected.
"""
git_object_types = {'blob', 'tree', 'commit', 'tag'}
git_object_types = {'blob', 'tree', 'commit', 'tag', 'snapshot'}
if git_type not in git_object_types:
raise ValueError('Unexpected git object type %s, expected one of %s' %
......
......@@ -499,3 +499,88 @@ def release_identifier(release):
components.extend([b'\n', release['message']])
return identifier_to_str(hash_git_data(b''.join(components), 'tag'))
def snapshot_identifier(snapshot, *, ignore_unresolved=False):
    """Compute the intrinsic identifier of a snapshot.

    A snapshot is a set of named branches, each pointing to an object at any
    level of the Software Heritage DAG. Besides pointing to DAG objects, a
    branch may be:

    - an *alias*: its target is the name of another branch of the same
      snapshot;
    - *dangling*: its target is unknown and is represented by ``None``.

    The identifier is a salted sha1 (git hashing algorithm, ``snapshot``
    object type) of a manifest built as follows:

    1. Branches are sorted by name, in bytes order.
    2. For every branch, these bytes are emitted in sequence:

       - the type of the branch target:

         - ``content``, ``directory``, ``revision``, ``release`` or
           ``snapshot`` for the corresponding entries in the DAG;
         - ``alias`` for a branch referencing another branch;
         - ``dangling`` for a dangling branch;

       - an ascii space (``\\x20``);
       - the branch name (as raw bytes);
       - a null byte (``\\x00``);
       - the length of the target identifier, as an ascii-encoded decimal
         number (``20`` for current intrinsic identifiers, ``0`` for
         dangling branches, the length of the target branch name for
         aliases);
       - a colon (``:``);
       - the identifier of the target, read from the ``'target'`` member:

         - for contents: their *sha1_git*;
         - for directories, revisions, releases or snapshots: their
           intrinsic identifier;
         - for aliases: the name of the target branch (as raw bytes);
         - for dangling branches: the empty string.

    As with directory manifests, entries are not separated from each other.
    Alias targets make identifiers variable-length, which is why every
    identifier is length-prefixed: the manifest stays unambiguous.

    Args:
        snapshot (dict): the snapshot to identify. Only the ``'branches'``
            entry is read; it is a :class:`dict` mapping each branch name
            to its target.
        ignore_unresolved (bool): if `True`, do not raise when an alias
            cannot be resolved.

    Returns:
        str: the intrinsic identifier for `snapshot`

    Raises:
        ValueError: if `ignore_unresolved` is false and at least one alias
            targets itself or a branch that is not part of the snapshot.
    """
    branches = snapshot['branches']
    broken_aliases = []
    manifest = []

    for name in sorted(branches):
        target = branches[name]

        if not target:
            kind, target_id = b'dangling', b''
        elif target['target_type'] == 'alias':
            kind, target_id = b'alias', target['target']
            # An alias must name a *different* branch of this snapshot.
            if target_id not in branches or target_id == name:
                broken_aliases.append((name, target_id))
        else:
            kind = target['target_type'].encode()
            target_id = identifier_to_bytes(target['target'])

        length_prefix = ('%d:' % len(target_id)).encode()
        manifest += [kind, b'\x20', name, b'\x00', length_prefix, target_id]

    # Reported only once the whole manifest has been walked, so that every
    # unresolved alias is listed in a single error.
    if broken_aliases and not ignore_unresolved:
        described = ['%s -> %s' % pair for pair in broken_aliases]
        raise ValueError('Branch aliases unresolved: %s' %
                         ', '.join(described))

    return identifier_to_str(hash_git_data(b''.join(manifest), 'snapshot'))
......@@ -679,3 +679,92 @@ o6X/3T+vm8K3bf3driRr34c=
identifiers.release_identifier(self.release_newline_in_author),
identifiers.identifier_to_str(self.release_newline_in_author['id'])
)
class SnapshotIdentifier(unittest.TestCase):
    """Exercise ``identifiers.snapshot_identifier`` on fixed snapshots.

    Every fixture created in ``setUp`` is a snapshot dict whose ``'id'``
    entry holds the identifier the computation is expected to yield.
    """

    def setUp(self):
        super().setUp()

        # Snapshot without any branch.
        self.empty = {
            'id': '1a8893e6a86f444e8be8e7bda6cb34fb1735a00e',
            'branches': {},
        }

        # Single branch whose target is unknown.
        self.dangling_branch = {
            'id': 'c84502e821eb21ed84e9fd3ec40973abc8b32353',
            'branches': {
                b'HEAD': None,
            },
        }

        # Alias pointing at a branch that is absent from the snapshot.
        self.unresolved = {
            'id': '84b4548ea486e4b0a7933fa541ff1503a0afe1e0',
            'branches': {
                b'foo': {
                    'target': b'bar',
                    'target_type': 'alias',
                },
            },
        }

        # One branch per supported target type, named after that type.
        self.all_types = {
            'id': '6e65b86363953b780d92b0a928f3e8fcdd10db36',
            'branches': {
                b'directory': {
                    'target': '1bd0e65f7d2ff14ae994de17a1e7fe65111dcad8',
                    'target_type': 'directory',
                },
                b'content': {
                    'target': 'fe95a46679d128ff167b7c55df5d02356c5a1ae1',
                    'target_type': 'content',
                },
                b'alias': {
                    'target': b'revision',
                    'target_type': 'alias',
                },
                b'revision': {
                    'target': 'aafb16d69fd30ff58afdd69036a26047f3aebdc6',
                    'target_type': 'revision',
                },
                b'release': {
                    'target': '7045404f3d1c54e6473c71bbb716529fbad4be24',
                    'target_type': 'release',
                },
                # Same value as the 'id' of the ``self.empty`` fixture.
                b'snapshot': {
                    'target': '1a8893e6a86f444e8be8e7bda6cb34fb1735a00e',
                    'target_type': 'snapshot',
                },
                b'dangling': None,
            }
        }

    def _assert_matches_id(self, snapshot, **options):
        # Compare the computed identifier with the one stored in the fixture;
        # ``options`` are forwarded to ``snapshot_identifier``.
        self.assertEqual(
            identifiers.snapshot_identifier(snapshot, **options),
            identifiers.identifier_to_str(snapshot['id']),
        )

    def test_empty_snapshot(self):
        self._assert_matches_id(self.empty)

    def test_dangling_branch(self):
        self._assert_matches_id(self.dangling_branch)

    def test_unresolved(self):
        # Without ignore_unresolved, the broken alias must be reported.
        with self.assertRaisesRegex(ValueError, "b'foo' -> b'bar'"):
            identifiers.snapshot_identifier(self.unresolved)

    def test_unresolved_force(self):
        # With ignore_unresolved, the broken alias is hashed like any other.
        self._assert_matches_id(self.unresolved, ignore_unresolved=True)

    def test_all_types(self):
        self._assert_matches_id(self.all_types)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment