Skip to content
Snippets Groups Projects
Commit edcd3659 authored by Antoine Lambert
Browse files

identifiers: Improve persistent identifiers representation

This commit changes the swh pids representation returned by the
parse_persistent_identifier function from a dict to a named tuple.
Also, the 'object_type' attribute of that named tuple now contains
a full name instead of an abbreviated one.

Closes T1112
Closes T1125
parent 5eb055d8
No related branches found
Tags v0.0.25
No related merge requests found
......@@ -5,6 +5,8 @@
import binascii
import datetime
from collections import namedtuple
from functools import lru_cache
from .exceptions import ValidationError
......@@ -64,7 +66,7 @@ def identifier_to_str(identifier):
The length 40 string corresponding to the given identifier, hex encoded
Raises:
ValueError if the identifier is of an unexpected type or length.
ValueError: if the identifier is of an unexpected type or length.
"""
if isinstance(identifier, str):
......@@ -596,71 +598,121 @@ def snapshot_identifier(snapshot, *, ignore_unresolved=False):
return identifier_to_str(hash_git_data(b''.join(lines), 'snapshot'))
def persistent_identifier(type, object, version=1):
"""Compute persistent identifier (stable over time) as per
documentation.
Documentation:
https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html # noqa
# Lookup table keyed by the object-type constants (SNAPSHOT, RELEASE,
# REVISION, DIRECTORY, CONTENT). For each type it records:
#   * 'short_name': the abbreviated type written into the identifier string
#     (read by PersistentId.__str__ and matched against when parsing);
#   * 'key_id': the key under which the object's hash is stored when the
#     object is supplied as a dict (read by PersistentId.__new__).
# Only CONTENT uses 'sha1_git'; every other type keeps its hash under 'id'.
_object_type_map = {
SNAPSHOT: {
'short_name': 'snp',
'key_id': 'id'
},
RELEASE: {
'short_name': 'rel',
'key_id': 'id'
},
REVISION: {
'short_name': 'rev',
'key_id': 'id'
},
DIRECTORY: {
'short_name': 'dir',
'key_id': 'id'
},
CONTENT: {
'short_name': 'cnt',
'key_id': 'sha1_git'
}
}
Args:
type (str): Object's type
object (dict/bytes/str): Object's dict representation or object
identifier
version (int): persistent identifier version (default to 1)
PERSISTENT_IDENTIFIER_TYPES = ['snp', 'rel', 'rev', 'dir', 'cnt']
Raises:
ValidationError (class) in case of:
PERSISTENT_IDENTIFIER_KEYS = [
'namespace', 'scheme_version', 'object_type', 'object_id', 'metadata']
invalid type
invalid hash object
PERSISTENT_IDENTIFIER_PARTS_SEP = ';'
Returns:
Persistent identifier as string.
class PersistentId(namedtuple('PersistentId', PERSISTENT_IDENTIFIER_KEYS)):
"""
_map = {
SNAPSHOT: {
'short_name': 'snp',
'key_id': 'id'
},
RELEASE: {
'short_name': 'rel',
'key_id': 'id'
},
REVISION: {
'short_name': 'rev',
'key_id': 'id'
},
DIRECTORY: {
'short_name': 'dir',
'key_id': 'id'
},
CONTENT: {
'short_name': 'cnt',
'key_id': 'sha1_git'
},
}
o = _map.get(type)
if not o:
raise ValidationError('Wrong input: Supported types are %s' % (
list(_map.keys())))
Named tuple holding the relevant info associated to a Software Heritage
persistent identifier.
if isinstance(object, dict): # internal swh representation resolution
_hash = object[o['key_id']]
else: # client passed direct identifier (bytes/str)
_hash = object
validate_sha1(_hash) # can raise if invalid hash
_hash = hash_to_hex(_hash)
return 'swh:%s:%s:%s' % (version, o['short_name'], _hash)
Args:
namespace (str): the namespace of the identifier, defaults to 'swh'
scheme_version (int): the scheme version of the identifier,
defaults to 1
object_type (str): the type of object the identifier points to,
either 'content', 'directory', 'release', 'revision' or 'snapshot'
object_id (dict/bytes/str): object's dict representation or
object identifier
metadata (dict): optional dict filled with metadata related to
pointed object
Raises:
swh.model.exceptions.ValidationError: In case of invalid object type or id
Once created, it contains the following attributes:
Attributes:
namespace (str): the namespace of the identifier
scheme_version (int): the scheme version of the identifier
object_type (str): the type of object the identifier points to
object_id (str): hexadecimal representation of the object hash
metadata (dict): metadata related to the pointed object
To get the raw persistent identifier string from an instance of
this named tuple, use the :func:`str` function::
pid = PersistentId(object_type='content', object_id='8ff44f081d43176474b267de5451f2c2e88089d0')
pid_str = str(pid) # 'swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0'
""" # noqa
__slots__ = ()
def __new__(cls, namespace='swh', scheme_version=1,
            object_type='', object_id='', metadata=None):
    """Validate the inputs and build the identifier tuple.

    Args:
        namespace (str): the namespace of the identifier
        scheme_version (int): the scheme version of the identifier
        object_type (str): full object type name; must be a key of
            ``_object_type_map``
        object_id (dict/bytes/str): either the object's dict
            representation (the hash is then read from the key given by
            the type's ``key_id`` entry) or the hash itself
        metadata (dict): optional metadata about the pointed object;
            ``None`` (the default) is stored as a fresh empty dict

    Raises:
        ValidationError: if ``object_type`` is not a supported type;
            ``validate_sha1`` can also raise on an invalid hash

    Returns:
        PersistentId: the new instance, whose ``object_id`` is the value
        returned by ``hash_to_hex``
    """
    o = _object_type_map.get(object_type)
    if not o:
        raise ValidationError('Wrong input: Supported types are %s' % (
            list(_object_type_map.keys())))
    # internal swh representation resolution
    if isinstance(object_id, dict):
        object_id = object_id[o['key_id']]
    validate_sha1(object_id)  # can raise if invalid hash
    object_id = hash_to_hex(object_id)
    # A literal ``{}`` default would be one dict shared by every instance
    # created without metadata; allocate a new one per instance instead.
    if metadata is None:
        metadata = {}
    # super() takes (class, subtype): the original reversed order only
    # worked when cls was exactly PersistentId and raised TypeError for
    # any subclass.
    return super(PersistentId, cls).__new__(
        cls, namespace, scheme_version, object_type, object_id, metadata)
def __str__(self):
    """Return the raw persistent identifier string for this tuple.

    The result is ``<namespace>:<scheme_version>:<short type>:<object_id>``,
    followed by one ``<separator><key>=<value>`` segment per metadata
    entry, in the dict's iteration order.
    """
    type_info = _object_type_map.get(self.object_type)
    core = '%s:%s:%s:%s' % (self.namespace, self.scheme_version,
                            type_info['short_name'], self.object_id)
    # Nothing to append when there is no contextual metadata.
    if not self.metadata:
        return core
    qualifiers = ''.join(
        '%s%s=%s' % (PERSISTENT_IDENTIFIER_PARTS_SEP, key, value)
        for key, value in self.metadata.items())
    return core + qualifiers
def persistent_identifier(object_type, object_id, scheme_version=1):
"""Compute persistent identifier (stable over time) as per
documentation.
PERSISTENT_IDENTIFIER_TYPES = ['snp', 'rel', 'rev', 'dir', 'cnt']
Documentation:
https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
PERSISTENT_IDENTIFIER_KEYS = [
'namespace', 'scheme_version', 'object_type', 'object_id', 'metadata']
Args:
object_type (str): object's type, either 'content', 'directory', 'release',
'revision' or 'snapshot'
object_id (dict/bytes/str): object's dict representation or object
identifier
scheme_version (int): persistent identifier scheme version, defaults to 1
PERSISTENT_IDENTIFIER_PARTS_SEP = ';'
Raises:
swh.model.exceptions.ValidationError: In case of invalid object type or id
Returns:
str: the persistent identifier
""" # noqa
pid = PersistentId(scheme_version=scheme_version, object_type=object_type,
object_id=object_id)
return str(pid)
def parse_persistent_identifier(persistent_id):
......@@ -670,23 +722,17 @@ def parse_persistent_identifier(persistent_id):
persistent_id (str): A persistent identifier
Raises:
ValidationError (class) in case of:
swh.model.exceptions.ValidationError: in case of:
missing mandatory values (4)
invalid namespace supplied
invalid version supplied
invalid type supplied
missing hash
invalid hash identifier supplied
* missing mandatory values (4)
* invalid namespace supplied
* invalid version supplied
* invalid type supplied
* missing hash
* invalid hash identifier supplied
Returns:
dict: dict with keys :
* namespace, holding str value
* scheme_version, holding str value
* object_type, holding str value
* object_id, holding str value
* metadata, holding dict value
PersistentId: a named tuple holding the parsing result
"""
# <pid>;<contextual-information>
......@@ -695,7 +741,7 @@ def parse_persistent_identifier(persistent_id):
if len(pid_data) != 4:
raise ValidationError(
'Wrong format: There should be 4 mandatory parameters')
'Wrong format: There should be 4 mandatory values')
# Checking for parsing errors
_ns, _version, _type, _id = pid_data
......@@ -707,12 +753,19 @@ def parse_persistent_identifier(persistent_id):
raise ValidationError(
'Wrong format: Supported version is 1')
pid_data[1] = int(pid_data[1])
expected_types = PERSISTENT_IDENTIFIER_TYPES
if _type not in expected_types:
raise ValidationError(
'Wrong format: Supported types are %s' % (
', '.join(expected_types)))
for otype, data in _object_type_map.items():
if _type == data['short_name']:
pid_data[2] = otype
break
if not _id:
raise ValidationError(
'Wrong format: Identifier should be present')
......@@ -732,4 +785,4 @@ def parse_persistent_identifier(persistent_id):
msg = 'Contextual data is badly formatted, form key=val expected'
raise ValidationError(msg)
pid_data.append(persistent_id_metadata)
return dict(zip(PERSISTENT_IDENTIFIER_KEYS, pid_data))
return PersistentId._make(pid_data)
......@@ -14,6 +14,7 @@ from swh.model import hashutil, identifiers
from swh.model.exceptions import ValidationError
from swh.model.identifiers import SNAPSHOT, RELEASE, REVISION, DIRECTORY
from swh.model.identifiers import CONTENT, PERSISTENT_IDENTIFIER_TYPES
from swh.model.identifiers import PersistentId
class UtilityFunctionsIdentifier(unittest.TestCase):
......@@ -831,58 +832,58 @@ class SnapshotIdentifier(unittest.TestCase):
def test_parse_persistent_identifier(self):
for pid, _type, _version, _hash in [
('swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2', 'cnt',
'1', '94a9ed024d3859793618152ea559a168bbcbb5e2'),
('swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505', 'dir',
'1', 'd198bc9d7a6bcf6db04f476d29314f157507d505'),
('swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d', 'rev',
'1', '309cf2674ee7a0749978cf8265ab91a60aea0f7d'),
('swh:1:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f', 'rel',
'1', '22ece559cc7cc2364edc5e5593d63ae8bd229f9f'),
('swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453', 'snp',
'1', 'c7c108084bc0bf3d81436bf980b46e98bd338453'),
('swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2',
CONTENT, 1, '94a9ed024d3859793618152ea559a168bbcbb5e2'),
('swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505',
DIRECTORY, 1, 'd198bc9d7a6bcf6db04f476d29314f157507d505'),
('swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d',
REVISION, 1, '309cf2674ee7a0749978cf8265ab91a60aea0f7d'),
('swh:1:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f',
RELEASE, 1, '22ece559cc7cc2364edc5e5593d63ae8bd229f9f'),
('swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453',
SNAPSHOT, 1, 'c7c108084bc0bf3d81436bf980b46e98bd338453'),
]:
expected_result = {
'namespace': 'swh',
'scheme_version': _version,
'object_type': _type,
'object_id': _hash,
'metadata': {}
}
expected_result = PersistentId(
namespace='swh',
scheme_version=_version,
object_type=_type,
object_id=_hash,
metadata={}
)
actual_result = identifiers.parse_persistent_identifier(pid)
self.assertEquals(actual_result, expected_result)
for pid, _type, _version, _hash, _metadata in [
('swh:1:cnt:9c95815d9e9d91b8dae8e05d8bbc696fe19f796b;lines=1-18;origin=https://github.com/python/cpython', # noqa
'cnt', '1', '9c95815d9e9d91b8dae8e05d8bbc696fe19f796b',
CONTENT, 1, '9c95815d9e9d91b8dae8e05d8bbc696fe19f796b',
{
'lines': '1-18',
'origin': 'https://github.com/python/cpython'
}),
('swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;origin=deb://Debian/packages/linuxdoc-tools', # noqa
'dir', '1', '0b6959356d30f1a4e9b7f6bca59b9a336464c03d',
DIRECTORY, 1, '0b6959356d30f1a4e9b7f6bca59b9a336464c03d',
{
'origin': 'deb://Debian/packages/linuxdoc-tools'
})
]:
expected_result = {
'namespace': 'swh',
'scheme_version': _version,
'object_type': _type,
'object_id': _hash,
'metadata': _metadata
}
expected_result = PersistentId(
namespace='swh',
scheme_version=_version,
object_type=_type,
object_id=_hash,
metadata=_metadata
)
actual_result = identifiers.parse_persistent_identifier(pid)
self.assertEquals(actual_result, expected_result)
def test_parse_persistent_identifier_parsing_error(self):
for pid, _error in [
('swh:1:cnt',
'Wrong format: There should be 4 mandatory parameters'),
'Wrong format: There should be 4 mandatory values'),
('swh:1:',
'Wrong format: There should be 4 mandatory parameters'),
'Wrong format: There should be 4 mandatory values'),
('swh:',
'Wrong format: There should be 4 mandatory parameters'),
'Wrong format: There should be 4 mandatory values'),
('swh:1:cnt:',
'Wrong format: Identifier should be present'),
('foo:1:cnt:abc8bc9d7a6bcf6db04f476d29314f157507d505',
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment