Skip to content
Snippets Groups Projects
Commit 1ff05161 authored by Antoine Lambert
Browse files

identifiers: Rename some functions and types related to SWHIDs

When Software Heritage persistent identifiers were introduced, they were not
yet abbreviated as SWHIDs.

Now that the abbreviation is gaining adoption, rename some functions and types in
swh.model.identifiers for consistency:

  - PersistentId -> SWHID

  - persistent_identifier -> swhid

  - parse_persistent_identifier -> parse_swhid

Backward compatibility with previous naming is maintained but deprecation
warnings are introduced to encourage the use of the new names.

Numerous variables in swh.model codebase have also been renamed accordingly.

Also rework and improve documentation.
parent 8863b5c1
No related branches found
Tags v0.3.8
No related merge requests found
......@@ -4,8 +4,8 @@ swh-model
Implementation of the Data model of the Software Heritage project, used to
archive source code artifacts.
This module defines the notion of Persistent Identifier (PID) and provides
tools to compute them:
This module defines the notion of SoftWare Heritage persistent IDentifiers
(SWHIDs) and provides tools to compute them:
```sh
$ swh-identify fork.c kmod.c sched/deadline.c
......
......@@ -331,7 +331,7 @@ corresponding object, like this:
A **dedicated** ``/resolve`` **endpoint** of the Software Heritage `Web API
<https://archive.softwareheritage.org/api/>`_ is also available to
programmatically resolve SWHIDs; see: :http:get:`/api/1/resolve/(swh_id)/`.
programmatically resolve SWHIDs; see: :http:get:`/api/1/resolve/(swhid)/`.
Examples:
......
......@@ -7,6 +7,9 @@ warn_unused_ignores = True
[mypy-attrs_strict.*] # a bit sad, but...
ignore_missing_imports = True
[mypy-deprecated.*]
ignore_missing_imports = True
[mypy-django.*] # false positive, only used by hypothesis' extras
ignore_missing_imports = True
......
......@@ -3,6 +3,7 @@
# dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html
attrs
attrs_strict >= 0.0.7
deprecated
hypothesis
iso8601
python-dateutil
......
......@@ -12,7 +12,15 @@ from functools import partial
from urllib.parse import urlparse
from swh.model import hashutil
from swh.model import identifiers as pids
from swh.model.identifiers import (
origin_identifier,
snapshot_identifier,
parse_swhid,
swhid,
SWHID,
CONTENT,
DIRECTORY,
)
from swh.model.exceptions import ValidationError
from swh.model.from_disk import Content, Directory
......@@ -29,40 +37,38 @@ _DULWICH_TYPES = {
}
class PidParamType(click.ParamType):
class SWHIDParamType(click.ParamType):
name = "persistent identifier"
def convert(self, value, param, ctx):
try:
pids.parse_persistent_identifier(value)
parse_swhid(value)
return value # return as string, as we need just that
except ValidationError as e:
self.fail("%s is not a valid SWHID. %s." % (value, e), param, ctx)
def pid_of_file(path):
def swhid_of_file(path):
object = Content.from_file(path=path).get_data()
return pids.persistent_identifier(pids.CONTENT, object)
return swhid(CONTENT, object)
def pid_of_file_content(data):
def swhid_of_file_content(data):
object = Content.from_bytes(mode=644, data=data).get_data()
return pids.persistent_identifier(pids.CONTENT, object)
return swhid(CONTENT, object)
def pid_of_dir(path):
def swhid_of_dir(path):
object = Directory.from_disk(path=path).get_data()
return pids.persistent_identifier(pids.DIRECTORY, object)
return swhid(DIRECTORY, object)
def pid_of_origin(url):
pid = pids.PersistentId(
object_type="origin", object_id=pids.origin_identifier({"url": url})
)
return str(pid)
def swhid_of_origin(url):
swhid = SWHID(object_type="origin", object_id=origin_identifier({"url": url}))
return str(swhid)
def pid_of_git_repo(path):
def swhid_of_git_repo(path):
repo = dulwich.repo.Repo(path)
branches = {}
......@@ -84,10 +90,8 @@ def pid_of_git_repo(path):
snapshot = {"branches": branches}
pid = pids.PersistentId(
object_type="snapshot", object_id=pids.snapshot_identifier(snapshot)
)
return str(pid)
swhid = SWHID(object_type="snapshot", object_id=snapshot_identifier(snapshot))
return str(swhid)
def identify_object(obj_type, follow_symlinks, obj):
......@@ -105,29 +109,29 @@ def identify_object(obj_type, follow_symlinks, obj):
except ValueError:
raise click.BadParameter("cannot detect object type for %s" % obj)
pid = None
swhid = None
if obj == "-":
content = sys.stdin.buffer.read()
pid = pid_of_file_content(content)
swhid = swhid_of_file_content(content)
elif obj_type in ["content", "directory"]:
path = obj.encode(sys.getfilesystemencoding())
if follow_symlinks and os.path.islink(obj):
path = os.path.realpath(obj)
if obj_type == "content":
pid = pid_of_file(path)
swhid = swhid_of_file(path)
elif obj_type == "directory":
pid = pid_of_dir(path)
swhid = swhid_of_dir(path)
elif obj_type == "origin":
pid = pid_of_origin(obj)
swhid = swhid_of_origin(obj)
elif obj_type == "snapshot":
pid = pid_of_git_repo(obj)
swhid = swhid_of_git_repo(obj)
else: # shouldn't happen, due to option validation
raise click.BadParameter("invalid object type: " + obj_type)
# note: we return original obj instead of path here, to preserve user-given
# file name in output
return (obj, pid)
return (obj, swhid)
@click.command(context_settings=CONTEXT_SETTINGS)
......@@ -156,7 +160,7 @@ def identify_object(obj_type, follow_symlinks, obj):
"--verify",
"-v",
metavar="SWHID",
type=PidParamType(),
type=SWHIDParamType(),
help="reference identifier to be compared with computed one",
)
@click.argument("objects", nargs=-1, required=True)
......@@ -197,18 +201,18 @@ def identify(obj_type, verify, show_filename, follow_symlinks, objects):
results = map(partial(identify_object, obj_type, follow_symlinks), objects)
if verify:
pid = next(results)[1]
if verify == pid:
click.echo("SWHID match: %s" % pid)
swhid = next(results)[1]
if verify == swhid:
click.echo("SWHID match: %s" % swhid)
sys.exit(0)
else:
click.echo("SWHID mismatch: %s != %s" % (verify, pid))
click.echo("SWHID mismatch: %s != %s" % (verify, swhid))
sys.exit(1)
else:
for (obj, pid) in results:
msg = pid
for (obj, swhid) in results:
msg = swhid
if show_filename:
msg = "%s\t%s" % (pid, os.fsdecode(obj))
msg = "%s\t%s" % (swhid, os.fsdecode(obj))
click.echo(msg)
......
......@@ -10,6 +10,8 @@ import hashlib
from functools import lru_cache
from typing import Any, Dict, NamedTuple
from deprecated import deprecated
from .exceptions import ValidationError
from .fields.hashes import validate_sha1
from .hashutil import hash_git_data, hash_to_hex, MultiHash
......@@ -22,11 +24,18 @@ RELEASE = "release"
DIRECTORY = "directory"
CONTENT = "content"
PID_NAMESPACE = "swh"
PID_VERSION = 1
PID_TYPES = ["ori", "snp", "rel", "rev", "dir", "cnt"]
PID_SEP = ":"
PID_CTXT_SEP = ";"
SWHID_NAMESPACE = "swh"
SWHID_VERSION = 1
SWHID_TYPES = ["ori", "snp", "rel", "rev", "dir", "cnt"]
SWHID_SEP = ":"
SWHID_CTXT_SEP = ";"
# deprecated variables
PID_NAMESPACE = SWHID_NAMESPACE
PID_VERSION = SWHID_VERSION
PID_TYPES = SWHID_TYPES
PID_SEP = SWHID_SEP
PID_CTXT_SEP = SWHID_CTXT_SEP
@lru_cache()
......@@ -649,8 +658,8 @@ _object_type_map = {
}
_PersistentId = NamedTuple(
"PersistentId",
_SWHID = NamedTuple(
"SWHID",
[
("namespace", str),
("scheme_version", int),
......@@ -661,25 +670,23 @@ _PersistentId = NamedTuple(
)
class PersistentId(_PersistentId):
class SWHID(_SWHID):
"""
Named tuple holding the relevant info associated to a Software Heritage
persistent identifier.
Named tuple holding the relevant info associated to a SoftWare Heritage
persistent IDentifier (SWHID)
Args:
namespace (str): the namespace of the identifier, defaults to 'swh'
namespace (str): the namespace of the identifier, defaults to ``swh``
scheme_version (int): the scheme version of the identifier,
defaults to 1
object_type (str): the type of object the identifier points to,
either 'content', 'directory', 'release', 'revision' or 'snapshot'
object_id (dict/bytes/str): object's dict representation or
object identifier
either ``content``, ``directory``, ``release``, ``revision`` or ``snapshot``
object_id (str): object's identifier
metadata (dict): optional dict filled with metadata related to
pointed object
Raises:
swh.model.exceptions.ValidationError: In case of invalid object type
or id
swh.model.exceptions.ValidationError: In case of invalid object type or id
Once created, it contains the following attributes:
......@@ -690,14 +697,14 @@ class PersistentId(_PersistentId):
object_id (str): hexadecimal representation of the object hash
metadata (dict): metadata related to the pointed object
To get the raw persistent identifier string from an instance of
this named tuple, use the :func:`str` function::
To get the raw SWHID string from an instance of this named tuple,
use the :func:`str` function::
pid = PersistentId(
swhid = SWHID(
object_type='content',
object_id='8ff44f081d43176474b267de5451f2c2e88089d0'
)
pid_str = str(pid)
swhid_str = str(swhid)
# 'swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0'
"""
......@@ -705,79 +712,110 @@ class PersistentId(_PersistentId):
def __new__(
cls,
namespace=PID_NAMESPACE,
scheme_version=PID_VERSION,
object_type="",
object_id="",
metadata={},
namespace: str = SWHID_NAMESPACE,
scheme_version: int = SWHID_VERSION,
object_type: str = "",
object_id: str = "",
metadata: Dict[str, Any] = {},
):
o = _object_type_map.get(object_type)
if not o:
raise ValidationError(
"Wrong input: Supported types are %s" % (list(_object_type_map.keys()))
)
if namespace != PID_NAMESPACE:
if namespace != SWHID_NAMESPACE:
raise ValidationError(
"Wrong format: only supported namespace is '%s'" % PID_NAMESPACE
"Wrong format: only supported namespace is '%s'" % SWHID_NAMESPACE
)
if scheme_version != PID_VERSION:
if scheme_version != SWHID_VERSION:
raise ValidationError(
"Wrong format: only supported version is %d" % PID_VERSION
"Wrong format: only supported version is %d" % SWHID_VERSION
)
# internal swh representation resolution
if isinstance(object_id, dict):
object_id = object_id[o["key_id"]]
validate_sha1(object_id) # can raise if invalid hash
object_id = hash_to_hex(object_id)
return super(cls, PersistentId).__new__(
return super().__new__(
cls, namespace, scheme_version, object_type, object_id, metadata
)
def __str__(self):
def __str__(self) -> str:
o = _object_type_map.get(self.object_type)
pid = PID_SEP.join(
assert o
swhid = SWHID_SEP.join(
[self.namespace, str(self.scheme_version), o["short_name"], self.object_id]
)
if self.metadata:
for k, v in self.metadata.items():
pid += "%s%s=%s" % (PID_CTXT_SEP, k, v)
return pid
swhid += "%s%s=%s" % (SWHID_CTXT_SEP, k, v)
return swhid
@deprecated("Use swh.model.identifiers.SWHID instead")
class PersistentId(SWHID):
    """
    Named tuple holding the relevant info associated to a SoftWare Heritage
    persistent IDentifier.

    This class only exists for backward compatibility: it adds nothing to
    :class:`swh.model.identifiers.SWHID` and forwards every constructor
    argument to it unchanged.

    .. deprecated:: 0.3.8
        Use :class:`swh.model.identifiers.SWHID` instead

    """

    def __new__(cls, *args, **kwargs):
        # Zero-argument super() resolves against the class this method is
        # defined in and the ``cls`` actually being instantiated, so classes
        # deriving from PersistentId can be created as well. The former
        # ``super(cls, PersistentId)`` call had its two arguments swapped and
        # raised TypeError whenever ``cls`` was not PersistentId itself.
        return super().__new__(cls, *args, **kwargs)
def persistent_identifier(object_type, object_id, scheme_version=1, metadata={}):
"""Compute :ref:`SWHID <persistent-identifiers>` persistent identifiers.
def swhid(
object_type: str,
object_id: str,
scheme_version: int = 1,
metadata: Dict[str, Any] = {},
) -> str:
"""Compute :ref:`persistent-identifiers`
Args:
object_type (str): object's type, either 'content', 'directory',
'release', 'revision' or 'snapshot'
object_id (dict/bytes/str): object's dict representation or object
identifier
scheme_version (int): persistent identifier scheme version,
defaults to 1
metadata (dict): metadata related to the pointed object
object_type: object's type, either ``content``, ``directory``,
``release``, ``revision`` or ``snapshot``
object_id: object's identifier
scheme_version: SWHID scheme version, defaults to 1
metadata: metadata related to the pointed object
Raises:
swh.model.exceptions.ValidationError: In case of invalid object type
or id
swh.model.exceptions.ValidationError: In case of invalid object type or id
Returns:
str: the persistent identifier
the SWHID of the object
"""
pid = PersistentId(
swhid = SWHID(
scheme_version=scheme_version,
object_type=object_type,
object_id=object_id,
metadata=metadata,
)
return str(pid)
return str(swhid)
@deprecated("Use swh.model.identifiers.swhid instead")
def persistent_identifier(*args, **kwargs) -> str:
    """Backward-compatible alias of :func:`swh.model.identifiers.swhid`.

    Every positional and keyword argument is forwarded unchanged, and the
    value produced by :func:`swhid` is returned as is.

    .. deprecated:: 0.3.8
        Use :func:`swh.model.identifiers.swhid` instead

    """
    identifier = swhid(*args, **kwargs)
    return identifier
def parse_persistent_identifier(persistent_id):
"""Parse :ref:`SWHID <persistent-identifiers>` persistent identifiers.
def parse_swhid(swhid: str) -> SWHID:
"""Parse :ref:`persistent-identifiers`.
Args:
persistent_id (str): A persistent identifier
swhid (str): A persistent identifier
Raises:
swh.model.exceptions.ValidationError: in case of:
......@@ -790,35 +828,43 @@ def parse_persistent_identifier(persistent_id):
* invalid hash identifier supplied
Returns:
PersistentId: a named tuple holding the parsing result
a named tuple holding the parsing result
"""
# <pid>;<contextual-information>
persistent_id_parts = persistent_id.split(PID_CTXT_SEP)
pid_data = persistent_id_parts.pop(0).split(":")
# <swhid>;<contextual-information>
swhid_parts = swhid.split(SWHID_CTXT_SEP)
swhid_data = swhid_parts.pop(0).split(":")
if len(pid_data) != 4:
if len(swhid_data) != 4:
raise ValidationError("Wrong format: There should be 4 mandatory values")
# Checking for parsing errors
_ns, _version, _type, _id = pid_data
pid_data[1] = int(pid_data[1])
_ns, _version, _type, _id = swhid_data
for otype, data in _object_type_map.items():
if _type == data["short_name"]:
pid_data[2] = otype
_type = otype
break
if not _id:
raise ValidationError("Wrong format: Identifier should be present")
persistent_id_metadata = {}
for part in persistent_id_parts:
_metadata = {}
for part in swhid_parts:
try:
key, val = part.split("=")
persistent_id_metadata[key] = val
_metadata[key] = val
except Exception:
msg = "Contextual data is badly formatted, form key=val expected"
raise ValidationError(msg)
pid_data.append(persistent_id_metadata)
return PersistentId(*pid_data)
return SWHID(_ns, int(_version), _type, _id, _metadata)
@deprecated("Use swh.model.identifiers.parse_swhid instead")
def parse_persistent_identifier(persistent_id: str) -> PersistentId:
    """Parse a SWHID string and return it as a legacy ``PersistentId``.

    The parsing is delegated to :func:`swh.model.identifiers.parse_swhid`;
    the fields of its result are then used to build a ``PersistentId`` so
    that existing callers keep receiving the older type.

    .. deprecated:: 0.3.8
        Use :func:`swh.model.identifiers.parse_swhid` instead

    """
    parsed_fields = parse_swhid(persistent_id)._asdict()
    return PersistentId(**parsed_fields)
......@@ -22,9 +22,9 @@ class TestIdentify(DataMixin, unittest.TestCase):
super().setUp()
self.runner = CliRunner()
def assertPidOK(self, result, pid):
def assertSWHID(self, result, swhid):
self.assertEqual(result.exit_code, 0)
self.assertEqual(result.output.split()[0], pid)
self.assertEqual(result.output.split()[0], swhid)
def test_no_args(self):
result = self.runner.invoke(cli.identify)
......@@ -36,21 +36,21 @@ class TestIdentify(DataMixin, unittest.TestCase):
for filename, content in self.contents.items():
path = os.path.join(self.tmpdir_name, filename)
result = self.runner.invoke(cli.identify, ["--type", "content", path])
self.assertPidOK(result, "swh:1:cnt:" + hash_to_hex(content["sha1_git"]))
self.assertSWHID(result, "swh:1:cnt:" + hash_to_hex(content["sha1_git"]))
def test_content_id_from_stdin(self):
"""identify file content"""
self.make_contents(self.tmpdir_name)
for _, content in self.contents.items():
result = self.runner.invoke(cli.identify, ["-"], input=content["data"])
self.assertPidOK(result, "swh:1:cnt:" + hash_to_hex(content["sha1_git"]))
self.assertSWHID(result, "swh:1:cnt:" + hash_to_hex(content["sha1_git"]))
def test_directory_id(self):
"""identify an entire directory"""
self.make_from_tarball(self.tmpdir_name)
path = os.path.join(self.tmpdir_name, b"sample-folder")
result = self.runner.invoke(cli.identify, ["--type", "directory", path])
self.assertPidOK(result, "swh:1:dir:e8b0f1466af8608c8a3fb9879db172b887e80759")
self.assertSWHID(result, "swh:1:dir:e8b0f1466af8608c8a3fb9879db172b887e80759")
def test_snapshot_id(self):
"""identify a snapshot"""
......@@ -64,7 +64,7 @@ class TestIdentify(DataMixin, unittest.TestCase):
result = self.runner.invoke(
cli.identify, ["--type", "snapshot", repo_dir]
)
self.assertPidOK(
self.assertSWHID(
result, "swh:1:snp:abc888898124270905a0ef3c67e872ce08e7e0c1"
)
......@@ -72,7 +72,7 @@ class TestIdentify(DataMixin, unittest.TestCase):
"""identify an origin URL"""
url = "https://github.com/torvalds/linux"
result = self.runner.invoke(cli.identify, ["--type", "origin", url])
self.assertPidOK(result, "swh:1:ori:b63a575fe3faab7692c9f38fb09d4bb45651bb0f")
self.assertSWHID(result, "swh:1:ori:b63a575fe3faab7692c9f38fb09d4bb45651bb0f")
def test_symlink(self):
"""identify symlink --- both itself and target"""
......@@ -82,10 +82,10 @@ class TestIdentify(DataMixin, unittest.TestCase):
os.symlink(os.path.basename(regular), link)
result = self.runner.invoke(cli.identify, [link])
self.assertPidOK(result, "swh:1:cnt:257cc5642cb1a054f08cc83f2d943e56fd3ebe99")
self.assertSWHID(result, "swh:1:cnt:257cc5642cb1a054f08cc83f2d943e56fd3ebe99")
result = self.runner.invoke(cli.identify, ["--no-dereference", link])
self.assertPidOK(result, "swh:1:cnt:996f1789ff67c0e3f69ef5933a55d54c5d0e9954")
self.assertSWHID(result, "swh:1:cnt:996f1789ff67c0e3f69ef5933a55d54c5d0e9954")
def test_show_filename(self):
"""filename is shown by default"""
......@@ -108,7 +108,7 @@ class TestIdentify(DataMixin, unittest.TestCase):
result = self.runner.invoke(
cli.identify, ["--type", "content", "--no-filename", path]
)
self.assertPidOK(result, "swh:1:cnt:" + hash_to_hex(content["sha1_git"]))
self.assertSWHID(result, "swh:1:cnt:" + hash_to_hex(content["sha1_git"]))
def test_auto_content(self):
"""automatic object type detection: content"""
......
......@@ -17,7 +17,7 @@ from swh.model.identifiers import (
RELEASE,
REVISION,
SNAPSHOT,
PersistentId,
SWHID,
normalize_timestamp,
)
......@@ -739,7 +739,7 @@ class SnapshotIdentifier(unittest.TestCase):
identifiers.identifier_to_str(self.all_types["id"]),
)
def test_persistent_identifier(self):
def test_swhid(self):
_snapshot_id = _x("c7c108084bc0bf3d81436bf980b46e98bd338453")
_release_id = "22ece559cc7cc2364edc5e5593d63ae8bd229f9f"
_revision_id = "309cf2674ee7a0749978cf8265ab91a60aea0f7d"
......@@ -751,7 +751,7 @@ class SnapshotIdentifier(unittest.TestCase):
_directory = {"id": _directory_id}
_content = {"sha1_git": _content_id}
for full_type, _hash, expected_persistent_id, version, _meta in [
for full_type, _hash, expected_swhid, version, _meta in [
(
SNAPSHOT,
_snapshot_id,
......@@ -831,17 +831,15 @@ class SnapshotIdentifier(unittest.TestCase):
),
]:
if version:
actual_value = identifiers.persistent_identifier(
actual_value = identifiers.swhid(
full_type, _hash, version, metadata=_meta
)
else:
actual_value = identifiers.persistent_identifier(
full_type, _hash, metadata=_meta
)
actual_value = identifiers.swhid(full_type, _hash, metadata=_meta)
self.assertEqual(actual_value, expected_persistent_id)
self.assertEqual(actual_value, expected_swhid)
def test_persistent_identifier_wrong_input(self):
def test_swhid_wrong_input(self):
_snapshot_id = "notahash4bc0bf3d81436bf980b46e98bd338453"
_snapshot = {"id": _snapshot_id}
......@@ -851,10 +849,10 @@ class SnapshotIdentifier(unittest.TestCase):
("foo", ""),
]:
with self.assertRaises(ValidationError):
identifiers.persistent_identifier(_type, _hash)
identifiers.swhid(_type, _hash)
def test_parse_persistent_identifier(self):
for pid, _type, _version, _hash in [
def test_parse_swhid(self):
for swhid, _type, _version, _hash in [
(
"swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2",
CONTENT,
......@@ -886,17 +884,17 @@ class SnapshotIdentifier(unittest.TestCase):
"c7c108084bc0bf3d81436bf980b46e98bd338453",
),
]:
expected_result = PersistentId(
expected_result = SWHID(
namespace="swh",
scheme_version=_version,
object_type=_type,
object_id=_hash,
metadata={},
)
actual_result = identifiers.parse_persistent_identifier(pid)
actual_result = identifiers.parse_swhid(swhid)
self.assertEqual(actual_result, expected_result)
for pid, _type, _version, _hash, _metadata in [
for swhid, _type, _version, _hash, _metadata in [
(
"swh:1:cnt:9c95815d9e9d91b8dae8e05d8bbc696fe19f796b;lines=1-18;origin=https://github.com/python/cpython", # noqa
CONTENT,
......@@ -912,18 +910,18 @@ class SnapshotIdentifier(unittest.TestCase):
{"origin": "deb://Debian/packages/linuxdoc-tools"},
),
]:
expected_result = PersistentId(
expected_result = SWHID(
namespace="swh",
scheme_version=_version,
object_type=_type,
object_id=_hash,
metadata=_metadata,
)
actual_result = identifiers.parse_persistent_identifier(pid)
actual_result = identifiers.parse_swhid(swhid)
self.assertEqual(actual_result, expected_result)
def test_parse_persistent_identifier_parsing_error(self):
for pid in [
def test_parse_swhid_parsing_error(self):
for swhid in [
("swh:1:cnt"),
("swh:1:"),
("swh:"),
......@@ -936,7 +934,7 @@ class SnapshotIdentifier(unittest.TestCase):
("swh:1:snp:foo"),
]:
with self.assertRaises(ValidationError):
identifiers.parse_persistent_identifier(pid)
identifiers.parse_swhid(swhid)
def test_persistentid_class_validation_error(self):
for _ns, _version, _type, _id in [
......@@ -946,7 +944,7 @@ class SnapshotIdentifier(unittest.TestCase):
("swh", 1, SNAPSHOT, "gh6959356d30f1a4e9b7f6bca59b9a336464c03d"),
]:
with self.assertRaises(ValidationError):
PersistentId(
SWHID(
namespace=_ns,
scheme_version=_version,
object_type=_type,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment