#!/usr/bin/python3

# Copyright (C) 2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

"""Print, for each file named on the command line, its name and the
hex-encoded hash of the corresponding ``Content`` object."""

import os
import sys

from swh.model.from_disk import Content
from swh.model.hashutil import hash_to_hex


# Not read anywhere in this script; kept so the module-level name survives.
# NOTE(review): presumably names the algorithm behind ``Content.hash`` --
# confirm against swh.model.from_disk.
HASH_ALGO = 'sha1_git'


def hash_file(fname):
    """Return the hex-encoded hash of the file at path `fname`.

    Args:
        fname (str): path of the file to hash, as received from the command
            line

    Returns:
        str: ``Content.from_file(...).hash`` rendered by ``hash_to_hex``

    """
    # os.fsencode() uses the filesystem encoding with the surrogateescape
    # handler, so file names that are not valid UTF-8 (which Python decodes
    # from argv with surrogate escapes) round-trip back to their raw bytes.
    # A plain str.encode() raises UnicodeEncodeError on such names.
    return hash_to_hex(Content.from_file(path=os.fsencode(fname)).hash)


def main(fnames):
    """Print one ``<file name>TAB<hex hash>`` line per entry of `fnames`."""
    for fname in fnames:
        print(fname, hash_file(fname), sep='\t')


if __name__ == '__main__':
    fnames = sys.argv[1:]
    if not fnames:
        # Usage errors belong on stderr so they never pollute piped output.
        print('Usage: swh-hash-file FILE...', file=sys.stderr)
        sys.exit(2)

    main(fnames)
_data-model: -Software Heritage data model -============================ +Data model +========== TODO diff --git a/docs/index.rst b/docs/index.rst index db680710600a20d9a7d119477bf8798f968fa46b..74756e7522401bb4e7fa63d9231fc401524f1c6f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -12,6 +12,7 @@ Overview -------- * :ref:`data-model` +* :ref:`persistent-identifiers` Indices and tables diff --git a/docs/persistent-identifiers.rst b/docs/persistent-identifiers.rst new file mode 100644 index 0000000000000000000000000000000000000000..c796a808ff8670f9a13cc13f32b5018ccdfbc35d --- /dev/null +++ b/docs/persistent-identifiers.rst @@ -0,0 +1,145 @@ +.. _persistent-identifiers: + +Persistent identifiers +====================== + +You can point to objects present in the Software Heritage archive by means +of **persistent identifiers** that are guaranteed to remain stable (persistent) +over time. Their syntax, meaning, and usage are described below. Note that they +are identifiers and not URLs, even though a URL-based resolver for Software +Heritage persistent identifiers is also provided. + +A persistent identifier can point to any software artifact (or "object") +available in the Software Heritage archive. Objects come in different types, +and most notably: + +* contents +* directories +* revisions +* releases +* snapshots + +Each object is identified by an intrinsic, type-specific object identifier that +is embedded in its persistent identifier as described below. Object identifiers +are strong cryptographic hashes computed on the entire set of object properties +to form a `Merkle structure <https://en.wikipedia.org/wiki/Merkle_tree>`_. + +See :ref:`data-model` for an overview of object types and how they are linked +together. See :py:mod:`swh.model.identifiers` for details on how intrinsic +object identifiers are computed. + + +Syntax +------ + +Syntactically, persistent identifiers are generated by the ``<identifier>`` +entry point of the grammar: + +.. 
code-block:: bnf + + <identifier> ::= "swh" ":" <scheme_version> ":" <object_type> ":" <object_id> ; + <scheme_version> ::= "1" ; + <object_type> ::= + "snp" (* snapshot *) + | "rel" (* release *) + | "rev" (* revision *) + | "dir" (* directory *) + | "cnt" (* content *) + ; + <object_id> ::= 40 * <hex_digit> ; (* intrinsic object id, as hex-encoded SHA1 *) + <hex_digit> ::= "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" + | "a" | "b" | "c" | "d" | "e" | "f" ; + + +Semantics +--------- + +``:`` is used as separator between the logical parts of identifiers. The +``swh`` prefix makes explicit that these identifiers are related to *SoftWare +Heritage*. ``1`` (``<scheme_version>``) is the current version of this +identifier *scheme*; future editions will use higher version numbers, possibly +breaking backward compatibility (but without breaking the resolvability of +identifiers that conform to previous versions of the scheme). + +A persistent identifier points to a single object, whose type is explicitly +captured by ``<object_type>``: + +* ``snp`` identifiers point to **snapshots**, +* ``rel`` to **releases**, +* ``rev`` to **revisions**, +* ``dir`` to **directories**, +* ``cnt`` to **contents**. 
+ +The actual object pointed to is identified by the intrinsic identifier +``<object_id>``, which is a hex-encoded (using lowercase ASCII characters) SHA1 +computed on the content and metadata of the object itself, as follows: + +* for **snapshots**, intrinsic identifiers are computed as per + :py:func:`swh.model.identifiers.snapshot_identifier` + +* for **releases**, as per + :py:func:`swh.model.identifiers.release_identifier` + +* for **revisions**, as per + :py:func:`swh.model.identifiers.revision_identifier` + +* for **directories**, as per + :py:func:`swh.model.identifiers.directory_identifier` + +* for **contents**, the intrinsic identifier is the ``sha1_git`` hash among the + multiple hashes returned by + :py:func:`swh.model.identifiers.content_identifier`, i.e., the SHA1 of a byte + sequence obtained by juxtaposing the ASCII string ``"blob"`` (without + quotes), a space, the length of the content as decimal digits, a NUL byte, + and the actual content of the file. + + +Git compatibility +~~~~~~~~~~~~~~~~~ + +Intrinsic object identifiers for contents, directories, revisions, and releases +are, at present, compatible with the `Git <https://git-scm.com/>`_ way of +`computing identifiers +<https://git-scm.com/book/en/v2/Git-Internals-Git-Objects>`_ for its objects. +A Software Heritage content identifier will be identical to a Git blob +identifier of any file with the same content, a Software Heritage revision +identifier will be identical to the corresponding Git commit identifier, etc. +This is not the case for snapshot identifiers as Git doesn't have a +corresponding object type. + +Note that Git compatibility is incidental and is not guaranteed to be +maintained in future versions of this scheme (or Git). 
+ + +Examples +-------- + +* ``swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2`` points to the content + of a file containing the full text of the GPL3 license +* ``swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505`` points to a directory + containing the source code of the Darktable photography application as it was + at some point on 4 May 2017 +* ``swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d`` points to a commit in + the development history of Darktable, dated 16 January 2017, that added + undo/redo support for masks +* ``swh:1:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f`` points to Darktable + release 2.3.0, dated 24 December 2016 +* ``swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453`` points to a snapshot + of the entire Darktable Git repository taken on 4 May 2017 from GitHub + + +Resolution +---------- + +Persistent identifiers can be resolved using the Software Heritage Web +application (see :py:mod:`swh.web`). + +In particular, the ``/browse/`` endpoint can be given a persistent identifier +and will lead to the browsing page of the corresponding object, like this: +``https://archive.softwareheritage.org/browse/<identifier>``. 
For example: + +* `<https://archive.softwareheritage.org/browse/swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2>`_ +* `<https://archive.softwareheritage.org/browse/swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505>`_ +* `<https://archive.softwareheritage.org/browse/swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d>`_ +* `<https://archive.softwareheritage.org/browse/swh:1:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f>`_ +* `<https://archive.softwareheritage.org/browse/swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453>`_ diff --git a/swh.model.egg-info/PKG-INFO b/swh.model.egg-info/PKG-INFO index a1069b6cee2c2c7b8c0710cbcddc2a2bfd3dc243..d5b37a98f21d98df7b25d174c5a949f2e628de42 100644 --- a/swh.model.egg-info/PKG-INFO +++ b/swh.model.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 1.0 Name: swh.model -Version: 0.0.20 +Version: 0.0.21 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers diff --git a/swh.model.egg-info/SOURCES.txt b/swh.model.egg-info/SOURCES.txt index bd2e46a13762f6726a8e68040be297c6c048d5c3..d8706419ea007d2a2294198e0fc110ad06cfad9b 100644 --- a/swh.model.egg-info/SOURCES.txt +++ b/swh.model.egg-info/SOURCES.txt @@ -10,6 +10,7 @@ requirements.txt setup.py version.txt bin/git-revhash +bin/swh-hash-file bin/swh-revhash debian/changelog debian/compat @@ -22,6 +23,7 @@ docs/Makefile docs/conf.py docs/data-model.rst docs/index.rst +docs/persistent-identifiers.rst docs/_static/.placeholder docs/_templates/.placeholder swh/__init__.py diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py index 96905d956f1dc2d2bbe0833a4a23f6009dccab06..0dfbdc34a90c9798b8ce5fdf6d36c8feec2e03d2 100644 --- a/swh/model/hashutil.py +++ b/swh/model/hashutil.py @@ -122,8 +122,9 @@ def hash_file(fobj, length=None, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None): Args: fobj: a file-like object length: the length of the contents of the file-like object (for the - git-specific algorithms) - 
SNAPSHOT = 'snapshot'
REVISION = 'revision'
RELEASE = 'release'
DIRECTORY = 'directory'
CONTENT = 'content'

# For each object type: the three-letter tag used inside persistent
# identifiers, and the key under which the object's dict representation
# stores its intrinsic identifier. Built once instead of on every call.
_PERSISTENT_IDENTIFIER_TYPES = {
    SNAPSHOT: {
        'short_name': 'snp',
        'key_id': 'id',
    },
    RELEASE: {
        'short_name': 'rel',
        'key_id': 'id',
    },
    REVISION: {
        'short_name': 'rev',
        'key_id': 'id',
    },
    DIRECTORY: {
        'short_name': 'dir',
        'key_id': 'id',
    },
    CONTENT: {
        'short_name': 'cnt',
        'key_id': 'sha1_git',
    },
}


def persistent_identifier(type, object, version=1):
    """Compute persistent identifier (stable over time) as per
    documentation.

    Documentation:
        https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html  # noqa

    Args:
        type (str): Object's type, one of SNAPSHOT, RELEASE, REVISION,
            DIRECTORY or CONTENT
        object (dict): Object's dict representation; its intrinsic
            identifier is read from the 'sha1_git' key for contents and
            from the 'id' key for every other type, then passed to
            hash_to_hex
        version (int): persistent identifier version (default to 1)

    Returns:
        Persistent identifier as string, formatted as
        ``swh:<version>:<short type name>:<hex identifier>``.

    Raises:
        KeyError: if `type` is not a known object type, or if `object`
            lacks the key holding its intrinsic identifier

    """
    # `type` and `object` shadow builtins, but they are part of the public
    # signature (callers may pass them by keyword), so they keep their names.
    type_info = _PERSISTENT_IDENTIFIER_TYPES[type]
    _hash = hash_to_hex(object[type_info['key_id']])
    return 'swh:%s:%s:%s' % (version, type_info['short_name'], _hash)


PERSISTENT_IDENTIFIER_KEYS = [
    'namespace', 'scheme_version', 'object_type', 'object_id']


def parse_persistent_identifier(persistent_id):
    """Parse swh's persistent identifier scheme.

    The identifier is split on ``:``; the individual parts are returned as
    strings and are not validated any further.

    Args:
        persistent_id (str): A persistent identifier

    Returns:
        dict with keys namespace, scheme_version, object_type, object_id

    Raises:
        ValueError: if `persistent_id` is not made of exactly four
            ``:``-separated parts

    """
    data = persistent_id.split(':')
    # zip() would silently drop missing or extra parts and hand back a
    # partial dict; reject malformed identifiers explicitly instead.
    if len(data) != len(PERSISTENT_IDENTIFIER_KEYS):
        raise ValueError(
            'Invalid persistent identifier %r: expected %d parts separated '
            'by ":", got %d' % (
                persistent_id, len(PERSISTENT_IDENTIFIER_KEYS), len(data)))
    return dict(zip(PERSISTENT_IDENTIFIER_KEYS, data))
def test_persistent_identifier(self):
    """Check persistent_identifier() for every object type, with the
    version either left to its default or passed explicitly."""
    _snapshot = {'id': hashutil.hash_to_bytes(
        'c7c108084bc0bf3d81436bf980b46e98bd338453')}
    _release = {'id': '22ece559cc7cc2364edc5e5593d63ae8bd229f9f'}
    _revision = {'id': '309cf2674ee7a0749978cf8265ab91a60aea0f7d'}
    _directory = {'id': 'd198bc9d7a6bcf6db04f476d29314f157507d505'}
    _content = {'sha1_git': '94a9ed024d3859793618152ea559a168bbcbb5e2'}
    for full_type, _hash, expected_persistent_id, version in [
            (SNAPSHOT, _snapshot,
             'swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453', None),
            (RELEASE, _release,
             'swh:2:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f', 2),
            (REVISION, _revision,
             'swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d', None),
            (DIRECTORY, _directory,
             'swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505', None),
            (CONTENT, _content,
             'swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2', 1)
    ]:
        # A None version exercises the default value of the parameter.
        if version:
            actual_value = identifiers.persistent_identifier(
                full_type, _hash, version)
        else:
            actual_value = identifiers.persistent_identifier(
                full_type, _hash)

        # assertEqual, not the deprecated assertEquals alias (removed in
        # Python 3.12).
        self.assertEqual(actual_value, expected_persistent_id)


def test_parse_persistent_identifier(self):
    """Check that parse_persistent_identifier() splits an identifier into
    its namespace, scheme version, object type and object id."""
    for pid, _type, _version, _hash in [
            ('swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2', 'cnt',
             '1', '94a9ed024d3859793618152ea559a168bbcbb5e2'),
            ('swh:2:dir:d198bc9d7a6bcf6db04f476d29314f157507d505', 'dir',
             '2', 'd198bc9d7a6bcf6db04f476d29314f157507d505'),
            ('swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d', 'rev',
             '1', '309cf2674ee7a0749978cf8265ab91a60aea0f7d'),
            ('swh:1:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f', 'rel',
             '1', '22ece559cc7cc2364edc5e5593d63ae8bd229f9f'),
            ('swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453', 'snp',
             '1', 'c7c108084bc0bf3d81436bf980b46e98bd338453'),
    ]:
        expected_result = {
            'namespace': 'swh',
            'scheme_version': _version,
            'object_type': _type,
            'object_id': _hash,
        }
        actual_result = identifiers.parse_persistent_identifier(pid)
        self.assertEqual(actual_result, expected_result)
b/version.txt index 7b21cbd42663afc173ab30f0e9bacfdc715c6e05..943a41a2f9b8a76b2fa6d967ea9da976022b3ae6 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.20-0-g91d74ef \ No newline at end of file +v0.0.21-0-gbdf26f5 \ No newline at end of file