diff --git a/PKG-INFO b/PKG-INFO index 8afa9731644afd7cb3d175b5c5735c4eee8aae86..b2ef8133528bcc8e62d761a1c4bcdd2d05f493b0 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 1.0 Name: swh.model -Version: 0.0.23 +Version: 0.0.24 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers diff --git a/README-dev.md b/README-dev.md deleted file mode 100644 index 97f9fb42da360b997cb7cca64cb7866948846930..0000000000000000000000000000000000000000 --- a/README-dev.md +++ /dev/null @@ -1,118 +0,0 @@ -Git sha1 computation --------------------- - -Document to describe how the git sha1 computation takes place. - -### commit/revision - -sha1 git commit/revision computation: - - commit `size`\0 - tree `sha1-git-tree-and-subtree-in-plain-hex-string` - ([parent `commit-parent-n`]) - author `name` <`email`> `date-ts` `date-offset` - committer `name` <`email`> `date-ts` `date-offset` - ([extra-header-key-n extra-header-value-n]) - - `commit-message` - (inline-gpg-signature) - - -Notes: -- [] denotes list of entries (one per line) -- () denotes optional entry. For example, the parent entry is optional. -- empty line at the end of the commit message -- timestamp example: 1444054085 -- date offset for example: +0200, -0100 - -sources: -- commit_tree_extended: https://github.com/git/git/blob/8d530c4d64ffcc853889f7b385f554d53db375ed/commit.c#L1522 -- commit_tree: https://github.com/git/git/blob/8d530c4d64ffcc853889f7b385f554d53db375ed/commit.c#L1392 - -Examples: - -```sh -$ cat commit.txt -tree 85a74718d377195e1efd0843ba4f3260bad4fe07 -parent 01e2d0627a9a6edb24c37db45db5ecb31e9de808 -author Linus Torvalds <torvalds@linux-foundation.org> 1436739030 -0700 -committer Linus Torvalds <torvalds@linux-foundation.org> 1436739030 -0700 -svn-repo-uuid 046f1af7-66c2-d61b-5410-ce57b7db7bff -svn-revision 10 - -Linux 4.2-rc2 -``` - -``` -$ cat commit.txt | git hash-object -t commit --stdin -010d34f384fa99d047cdd5e2f41e56e5c2feee45 -``` - -commit: 44cc742a8ca17b9c279be4cc195a93a6ef7a320e -``` -$ git cat-file -p 44cc742a8ca17b9c279be4cc195a93a6ef7a320e -... 
-tree b134f9b7dc434f593c0bab696345548b37de0558 -parent 689664ae944b4692724f13b709a4e4de28b54e57 -parent c888305e1efbaa252d01b4e5e6b778f865a97514 -author Jiang Xin <worldhello.net@gmail.com> 1428538899 +0800 -committer Jiang Xin <worldhello.net@gmail.com> 1428538899 +0800 -gpgsig -----BEGIN PGP SIGNATURE----- - Version: GnuPG v1.4.13 (Darwin) - - iQIcBAABAgAGBQJVJcYsAAoJEBiY3kIkQRNJVAUQAJ8/XQIfMqqC5oYeEFfHOPYZ - L7qy46bXHVBa9Qd8zAJ2Dou3IbI2ZoF6/Et89K/UggOycMlt5FKV/9toWyuZv4Po - L682wonoxX99qvVTHo6+wtnmYO7+G0f82h+qHMErxjP+I6gzRNBvRr+SfY7VlGdK - wikMKOMWC5smrScSHITnOq1Ews5pe3N7qDYMzK0XVZmgDoaem4RSWMJs4My/qVLN - e0CqYWq2A22GX7sXl6pjneJYQvcAXUX+CAzp24QnPSb+Q22Guj91TcxLFcHCTDdn - qgqMsEyMiisoglwrCbO+D+1xq9mjN9tNFWP66SQ48mrrHYTBV5sz9eJyDfroJaLP - CWgbDTgq6GzRMehHT3hXfYS5NNatjnhkNISXR7pnVP/obIi/vpWh5ll6Gd8q26z+ - a/O41UzOaLTeNI365MWT4/cnXohVLRG7iVJbAbCxoQmEgsYMRc/pBAzWJtLfcB2G - jdTswYL6+MUdL8sB9pZ82D+BP/YAdHe69CyTu1lk9RT2pYtI/kkfjHubXBCYEJSG - +VGllBbYG6idQJpyrOYNRJyrDi9yvDJ2W+S0iQrlZrxzGBVGTB/y65S8C+2WTBcE - lf1Qb5GDsQrZWgD+jtWTywOYHtCBwyCKSAXxSARMbNPeak9WPlcW/Jmu+fUcMe2x - dg1KdHOa34shrKDaOVzW - =od6m - -----END PGP SIGNATURE----- - -Merge branch 'master' of git://github.com/alexhenrie/git-po - -* 'master' of git://github.com/alexhenrie/git-po: - l10n: ca.po: update translation -``` - -### directory/tree - -sha1 git directory/tree computation: - - tree `tree-size`\0 - <file-perm> <file-name>\0<file-sha1-in-20-bytes-string>...<dir-perm> <dir-name>\0<dir-sha1-in-20-bytes-string>... - - -Notes: -- no newline separator between tree entries -- no empty newline at the end of the tree entries -- tree content header size is the length of the content -- The tree entries are ordered according to bytes in their <name> properties. - -Note: Tree entries referencing trees are sorted as if their name have a trailing / -at their end. 
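The `type size\0payload` header rule described in these sections can be checked without git; a rough sketch in Python, reusing the commit.txt example shown earlier (it assumes that file is present in the current directory):

```python
import hashlib

# Sketch only: recompute the sha1 of the commit.txt example above by
# prepending the "commit <size>\0" header described in this section.
with open('commit.txt', 'rb') as f:
    payload = f.read()

header = b'commit %d\x00' % len(payload)
print(hashlib.sha1(header + payload).hexdigest())
# should print 010d34f384fa99d047cdd5e2f41e56e5c2feee45, the same value
# `git hash-object -t commit --stdin` produced above
```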
- -Possible permissions are: -- 100644 - file -- 40000 - directory -- 100755 - executable file -- 120000 - symbolink link -- 160000 - git link (relative to submodule) - -### content/file - -sha1 git content computation: - - blob `blob-size`\0 - `blob-content` - -Notes: -- no newline at the end of the blob content - -Compress with DEFLATE and compute sha1 diff --git a/bin/swh-hash-file b/bin/swh-hash-file deleted file mode 100755 index c30de78f2ebb4cdd7956e762b4310c89e1528939..0000000000000000000000000000000000000000 --- a/bin/swh-hash-file +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/python3 - -# Copyright (C) 2018 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import sys - -from swh.model.from_disk import Content -from swh.model.hashutil import hash_to_hex - - -HASH_ALGO = 'sha1_git' - - -def hash_file(fname): - return hash_to_hex(Content.from_file(path=fname.encode()).hash) - - -def main(fnames): - for f in fnames: - print(f, hash_file(f), sep='\t') - - -if __name__ == '__main__': - fnames = sys.argv[1:] - if not fnames: - print('Usage: swh-hash-file FILE...') - sys.exit(2) - - main(fnames) diff --git a/debian/control b/debian/control index 49c2aae10397ecc8de1ab3502e910edea430994b..8124280f1450a9ed9434eea527e4bdcffa6d6599 100644 --- a/debian/control +++ b/debian/control @@ -6,6 +6,7 @@ Build-Depends: debhelper (>= 9), dh-python (>= 2), python3 (>= 3.5) | python3-pyblake2, python3-all, + python3-click, python3-nose, python3-setuptools, python3-vcversioner diff --git a/docs/data-model.rst b/docs/data-model.rst index f6e4f066a0ffb589d49593163367a41788e755af..fc1639d3cac2ee1f0ab3b80bf0408b689cfd66a1 100644 --- a/docs/data-model.rst +++ b/docs/data-model.rst @@ -3,6 +3,152 @@ Data model ========== +.. note:: The text below is adapted from §7 of the article `Software Heritage: + Why and How to Preserve Software Source Code + <https://hal.archives-ouvertes.fr/hal-01590958/>`_ (in proceedings of `iPRES + 2017 <https://ipres2017.jp/>`_, 14th International Conference on Digital + Preservation, by Roberto Di Cosmo and Stefano Zacchiroli), which also + provides a more general description of Software Heritage for the digital + preservation research community. + +In any archival project the choice of the underlying data model—at the logical +level, independently from how data is actually stored on physical media—is +paramount. The data model adopted by Software Heritage to represent the +information that it collects is centered around the notion of *software +artifact*, described below. + +It is important to notice that according to our principles, we must store with +every software artifact full information on where it has been found +(provenance), that is also captured in our data model, so we start by providing +some basic information on the nature of this provenance information. + + +Source code hosting places +-------------------------- + +Currently, Software Heritage uses of a curated list of source code hosting +places to crawl. The most common entries we expect to place in such a list are +popular collaborative development forges (e.g., GitHub, Bitbucket), package +manager repositories that host source package (e.g., CPAN, npm), and FOSS +distributions (e.g., Fedora, FreeBSD). 
But we may of course also allow more +niche entries, such as URLs of personal or institutional project collections +not hosted on major forges. + +While currently entirely manual, the curation of such a list might easily be +semi-automatic, with entries suggested by fellow archivists and/or concerned +users who want to notify Software Heritage of the need to archive specific +pieces of endangered source code. This approach is entirely compatible with +Web-wide crawling approaches: crawlers capable of detecting the presence of +source code might enrich the list. In both cases the list will remain curated, +with (semi-automated) review processes that entries will need to pass before a hosting +place starts to be used. + + +Software artifacts +------------------ + +Once the hosting places are known, they will need to be periodically looked at +in order to add missing software artifacts to the archive. Which software +artifacts will be found there? + +In general, each software distribution mechanism hosts multiple releases of a +given piece of software at any given time. For VCS (Version Control Systems), this is +the natural behaviour; for software packages, while a single version of a +package is just a snapshot of the corresponding software product, one can often +retrieve both current and past versions of the package from its distribution +site. + +By reviewing and generalizing existing VCS and source package formats, we have +identified the following recurrent artifacts as commonly found at source code +hosting places. They form the basic ingredients of the Software Heritage +archive. As the terminology varies quite a bit from technology to technology, +we provide below both the canonical name used in Software Heritage and popular +synonyms. + +**contents** (AKA "blobs") + the raw content of (source code) files as a sequence of bytes, without file + names or any other metadata. File contents are often recurrent, e.g., across + different versions of the same software, different directories of the same + project, or different projects altogether. + +**directories** + a list of named directory entries, each of which points to other artifacts, + usually file contents or sub-directories. Directory entries are also + associated with arbitrary metadata, which varies with the technology but usually + includes permission bits, modification timestamps, etc. + +**revisions** (AKA "commits") + software development within a specific project is essentially a time-indexed + series of copies of a single "root" directory that contains the entire + project source code. Software evolves when a developer modifies the content + of one or more files in that directory and records their changes. + + Each recorded copy of the root directory is known as a "revision". It points + to a fully-determined directory and is equipped with arbitrary metadata. Some + of those are added manually by the developer (e.g., commit message), others + are automatically synthesized (timestamps, preceding commit(s), etc). + +**releases** (AKA "tags") + some revisions are more equal than others and get selected by developers as + denoting important project milestones known as "releases". Each release + points to the last commit in project history corresponding to the release and + might carry arbitrary metadata—e.g., release name and version, release + message, cryptographic signatures, etc.
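As a rough sketch, the first two artifact kinds map directly onto the `swh.model.from_disk` helpers used by the new command-line tool added later in this diff; the file and directory paths below are hypothetical placeholders:

```python
from swh.model.from_disk import Content, Directory
from swh.model.identifiers import CONTENT, DIRECTORY, persistent_identifier

# from_disk expects bytes paths, as the new CLI below also does.
content = Content.from_file(path=b'hello.c').get_data()
print(persistent_identifier(CONTENT, content))       # swh:1:cnt:...

directory = Directory.from_disk(path=b'hello-project').get_data()
print(persistent_identifier(DIRECTORY, directory))   # swh:1:dir:...
```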
+ + +Additionally, the following crawling-related information is stored as +provenance information in the Software Heritage archive: + +**origins** + code "hosting places" as previously described are usually large platforms + that host several unrelated software projects. For software provenance + purposes it is important to be more specific than that. + + Software origins are fine-grained references to where source code artifacts + archived by Software Heritage have been retrieved from. They take the form of + ``(type, url)`` pairs, where ``url`` is a canonical URL (e.g., the address at + which one can ``git clone`` a repository or download a source tarball) and + ``type`` the kind of software origin (e.g., git, svn, or dsc for Debian + source packages). + +.. + **projects** + as commonly intended are more abstract entities than precise software + origins. Projects relate together several development resources, including + websites, issue trackers, mailing lists, as well as software origins as + intended by Software Heritage. + + The debate around the most apt ontologies to capture project-related + information for software hasn't settled yet, but the place projects will take + in the Software Heritage archive is fairly clear. Projects are abstract + entities, which will be arbitrarily nestable in a versioned + project/sub-project hierarchy, and that can be associated with arbitrary + metadata as well as origins where their source code can be found. + +**snapshots** + any kind of software origin offers multiple pointers to the "current" state + of a development project. In the case of VCS this is reflected by branches + (e.g., master, development, but also so-called feature branches dedicated to + extending the software in a specific direction); in the case of package + distributions by notions such as suites that correspond to different maturity + levels of individual packages (e.g., stable, development, etc.). + + A "snapshot" of a given software origin records all entry points found there + and where each of them was pointing at the time. For example, a snapshot + object might track the commit the master branch was pointing to at any + given time, as well as the most recent release of a given package in the + stable suite of a FOSS distribution. + +**visits** + link together software origins with snapshots. Every time an origin is + consulted a new visit object is created, recording when (according to + the Software Heritage clock) the visit happened and the full snapshot of the + state of the software origin at the time. + + +Data structure +-------------- + .. _swh-merkle-dag: .. figure:: images/swh-merkle-dag.svg :width: 1024px @@ -11,3 +157,101 @@ Data model Software Heritage archive as a Merkle DAG, augmented with crawling information (click to zoom). + +With all the bits of what we want to archive in place, the next question is how +to organize them, i.e., which logical data structure to adopt for their +storage. A key observation for this decision is that source code artifacts are +massively duplicated.
This is so for several reasons: + +* code hosting diaspora (i.e., project development moving to the most + recent/cool collaborative development technology over time); +* copy/paste (AKA "vendoring") of parts or entire external FOSS software + components into other software products; +* large overlap between revisions of the same project: usually only a very + small number of files/directories are modified by a single commit; +* emergence of DVCS (distributed version control systems), which natively work + by replicating entire repository copies around. GitHub-style pull requests + are the pinnacle of this, as they result in creating an additional repository + copy at each change done by a new developer; +* migration from one VCS to another—e.g., migrations from Subversion to Git, + which are really popular these days—resulting in additional copies, but in a + different distribution format, of the very same development histories. + +These trends seem to be neither stopping nor slowing down, and it is reasonable +to expect that they will be even more prominent in the future, due to the +decreasing costs of storage and bandwidth. + +For this reason we argue that any sustainable storage layout for archiving +source code in the very long term should support deduplication, allowing the cost +of storing source code artifacts that are encountered more than +once to be paid only once. For storage efficiency, deduplication should be supported for +all the software artifacts we have discussed, namely: file contents, +directories, revisions, releases, snapshots. + +Realizing that principle, the Software Heritage archive is conceptually a +single (big) `Merkle Directed Acyclic Graph (DAG) +<https://en.wikipedia.org/wiki/Merkle_tree>`_, as depicted in Figure +:ref:`Software Heritage Merkle DAG <swh-merkle-dag>`. In such a graph each of +the artifacts we have described—from file contents up to entire +snapshots—corresponds to a node. Edges between nodes emerge naturally: +directory entries point to other directories or file contents; revisions point +to directories and previous revisions; releases point to revisions; snapshots +point to revisions and releases. Additionally, each node contains all metadata +that is specific to the node itself rather than to pointed nodes; e.g., commit +messages, timestamps, or file names. Note that the structure is really a DAG, +and not a tree, due to the fact that lines of revision nodes might be +forked and merged back. + +.. + directory: fff3cc22cb40f71d26f736c082326e77de0b7692 + parent: e4feb05112588741b4764739d6da756c357e1f37 + author: Stefano Zacchiroli <zack@upsilon.cc> + date: 1443617461 +0200 + committer: Stefano Zacchiroli <zack@upsilon.cc> + commiter_date: 1443617461 +0200 + message: + objstorage: fix tempfile race when adding objects + + Before this change, two workers adding the same + object will end up racing to write <SHA1>.tmp. + [...] + + revisionid: 64a783216c1ec69dcb267449c0bbf5e54f7c4d6d + A revision node in the Software Heritage DAG + +In a Merkle structure each node is identified by an intrinsic identifier +computed as a cryptographic hash of the node content. In the case of Software +Heritage, identifiers are computed taking into account both node-specific +metadata and the identifiers of child nodes. + +Consider the revision node in the picture whose identifier starts with +`c7640e08d..`. It points to a directory (identifier starting with +`45f0c078..`), which has also been archived.
That directory contains a full +copy, at a specific point in time, of a software component—in the example the +`Hello World <https://forge.softwareheritage.org/source/helloworld/>`_ software +component available on our forge. The revision node also points to the +preceding revision node (`43ef7dcd..`) in the project development history. +Finally, the node contains revision-specific metadata, such as the author and +committer of the given change, its timestamps, and the message entered by the +author at commit time. + +The identifier of the revision node itself (`c7640e08d..`) is computed as a +cryptographic hash of a (canonical representation of) all the information shown +in the figure. A change in any of them—metadata and/or pointed nodes—would result +in an entirely different node identifier. All other types of nodes in the +Software Heritage archive behave similarly. + +The Software Heritage archive inherits useful properties from the underlying +Merkle structure. In particular, deduplication is built-in. Any software +artifact encountered in the wild gets added to the archive only if a +corresponding node with a matching intrinsic identifier is not already +available in the graph—file contents, commits, entire directories or project +snapshots are all deduplicated, incurring storage costs only once. + +Furthermore, as a side effect of this data model choice, the entire development +history of all the source code archived in Software Heritage—which aims to +match all published source code in the world—is available as a unified whole, +making emergent structures such as code reuse across different projects or +software origins readily available. Further reinforcing the Software Heritage +use cases, this object could become a veritable "map of the stars" of our +entire software commons. diff --git a/docs/persistent-identifiers.rst b/docs/persistent-identifiers.rst index 7f41d610561b80a288aa3719eeeca543bc92848d..29bf797e144d0a0ba19293caf3aca6f0497b55f9 100644 --- a/docs/persistent-identifiers.rst +++ b/docs/persistent-identifiers.rst @@ -47,8 +47,8 @@ entry point of the grammar: | "cnt" (* content *) ; <object_id> ::= 40 * <hex_digit> ; (* intrinsic object id, as hex-encoded SHA1 *) - <hex_digit> ::= "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" - | "a" | "b" | "c" | "d" | "e" | "f" ; + <dec_digit> ::= "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" + <hex_digit> ::= <dec_digit> | "a" | "b" | "c" | "d" | "e" | "f" ; Semantics @@ -134,12 +134,60 @@ Resolution Persistent identifiers can be resolved using the Software Heritage Web application (see :py:mod:`swh.web`). -In particular, the ``/browse/`` endpoint can be given a persistent identifier -and will lead to the browsing page of the corresponding object, like this: -``https://archive.softwareheritage.org/browse/<identifier>``. For example: +In particular, the root endpoint ``/`` can be given a persistent identifier and +will lead to the browsing page of the corresponding object, like this: +``https://archive.softwareheritage.org/<identifier>``.
For example: -* `<https://archive.softwareheritage.org/browse/swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2>`_ -* `<https://archive.softwareheritage.org/browse/swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505>`_ -* `<https://archive.softwareheritage.org/browse/swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d>`_ -* `<https://archive.softwareheritage.org/browse/swh:1:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f>`_ -* `<https://archive.softwareheritage.org/browse/swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453>`_ +* `<https://archive.softwareheritage.org/swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2>`_ +* `<https://archive.softwareheritage.org/swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505>`_ +* `<https://archive.softwareheritage.org/swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d>`_ +* `<https://archive.softwareheritage.org/swh:1:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f>`_ +* `<https://archive.softwareheritage.org/swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453>`_ + + +Contextual information +====================== + +It is often useful to complement persistent identifiers with **contextual +information** about where the identified object has been found as well as which +specific parts of it are of interest. To that end it is possible, via a +dedicated syntax, to extend persistent identifiers with the following pieces of +information: + +* the **software origin** where an object has been found/observed +* the **line number(s)** of interest, usually within a content object + + +Syntax +------ + +The full-syntax to complement identifiers with contextual information is given +by the ``<identifier_with_context>`` entry point of the grammar: + +.. code-block:: bnf + + <identifier_with_context> ::= <identifier> [<lines_ctxt>] [<origin_ctxt>] + <lines_ctxt> ::= ";" "lines" "=" <line_number> ["-" <line_number>] + <origin_ctxt> ::= ";" "origin" "=" <url> + <line_number> ::= <dec_digit> + + <url> ::= (* RFC 3986 compliant URLs *) + + +Semantics +--------- + +``;`` is used as separator between persistent identifiers and additional +optional contextual information. Each piece of contextual information is +specified as a key/value pair, using ``=`` as a separator. + +The following piece of contextual information are supported: + +* line numbers: it is possible to specify a single line number or a line range, + separating two numbers with ``-``. Note that line numbers are purely + indicative and are not meant to be stable, as in some degenerate cases + (e.g., text files which mix different types of line terminators) it is + impossible to resolve them unambiguously. + +* software origin: where a given object has been found or observed in the wild, + as the URI that was used by Software Heritage to ingest the object into the + archive diff --git a/requirements.txt b/requirements.txt index 151b92672f2d12b3b1eca2a026d388b86f03010c..447def3ef89e2d70a432a96982e62af6142e9d71 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ # should match https://pypi.python.org/pypi names. 
For the full spec or # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html vcversioner +Click diff --git a/setup.py b/setup.py index 232f3bc236391f21c8be9bed35dcd97f590890df..7e4a47ceb5549af703f343344c5e3161177d948b 100644 --- a/setup.py +++ b/setup.py @@ -17,9 +17,21 @@ def parse_requirements(): extra_requirements = [] - -pyblake2_hashes = {'blake2s256', 'blake2b512'} -if pyblake2_hashes - set(hashlib.algorithms_available): +pyblake2_hash_sets = [ + # Built-in implementation in Python 3.6+ + {'blake2s', 'blake2b'}, + # Potentially shipped by OpenSSL 1.1 (e.g. Python 3.5 in Debian stretch + # has these) + {'blake2s256', 'blake2b512'}, +] + +for pyblake2_hashes in pyblake2_hash_sets: + if not pyblake2_hashes - set(hashlib.algorithms_available): + # The required blake2 hashes have been found + break +else: + # None of the possible sets of blake2 hashes are available. + # use pyblake2 instead extra_requirements.append('pyblake2') setup( @@ -31,6 +43,10 @@ setup( packages=find_packages(), # packages's modules scripts=[], # scripts to package install_requires=parse_requirements() + extra_requirements, + entry_points=''' + [console_scripts] + swh-identify=swh.model.cli:identify + ''', setup_requires=['vcversioner'], vcversioner={}, include_package_data=True, diff --git a/swh.model.egg-info/PKG-INFO b/swh.model.egg-info/PKG-INFO index 8afa9731644afd7cb3d175b5c5735c4eee8aae86..b2ef8133528bcc8e62d761a1c4bcdd2d05f493b0 100644 --- a/swh.model.egg-info/PKG-INFO +++ b/swh.model.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 1.0 Name: swh.model -Version: 0.0.23 +Version: 0.0.24 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers diff --git a/swh.model.egg-info/SOURCES.txt b/swh.model.egg-info/SOURCES.txt index 4b2269ff4fe5c7dfd1b6889241146897135bb07b..f6a2f94098b482c8b10045483d3c375f3165ad33 100644 --- a/swh.model.egg-info/SOURCES.txt +++ b/swh.model.egg-info/SOURCES.txt @@ -4,13 +4,11 @@ LICENSE MANIFEST.in Makefile Makefile.local -README-dev.md requirements-swh.txt requirements.txt setup.py version.txt bin/git-revhash -bin/swh-hash-file bin/swh-revhash debian/changelog debian/compat @@ -33,9 +31,11 @@ swh/__init__.py swh.model.egg-info/PKG-INFO swh.model.egg-info/SOURCES.txt swh.model.egg-info/dependency_links.txt +swh.model.egg-info/entry_points.txt swh.model.egg-info/requires.txt swh.model.egg-info/top_level.txt swh/model/__init__.py +swh/model/cli.py swh/model/exceptions.py swh/model/from_disk.py swh/model/hashutil.py @@ -49,6 +49,7 @@ swh/model/fields/hashes.py swh/model/fields/simple.py swh/model/tests/__init__.py swh/model/tests/generate_testdata_from_disk.py +swh/model/tests/test_cli.py swh/model/tests/test_from_disk.py swh/model/tests/test_hashutil.py swh/model/tests/test_identifiers.py diff --git a/swh.model.egg-info/entry_points.txt b/swh.model.egg-info/entry_points.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c74f3660b342d539bac70c1973a1cea205cbb7a --- /dev/null +++ b/swh.model.egg-info/entry_points.txt @@ -0,0 +1,4 @@ + + [console_scripts] + swh-identify=swh.model.cli:identify + \ No newline at end of file diff --git a/swh.model.egg-info/requires.txt b/swh.model.egg-info/requires.txt index 39a323addb39c408716b8874ef828acd3c4da427..dbcd3082d3efe925da6522cc818c882fb36e74f8 100644 --- a/swh.model.egg-info/requires.txt +++ b/swh.model.egg-info/requires.txt @@ -1 +1,2 @@ +Click vcversioner diff --git a/swh/model/cli.py b/swh/model/cli.py new file mode 
100644 index 0000000000000000000000000000000000000000..5996d19a196907f85da94bf6107686735f10fd12 --- /dev/null +++ b/swh/model/cli.py @@ -0,0 +1,95 @@ +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import click +import os +import sys + +from swh.model import identifiers as pids +from swh.model.exceptions import ValidationError +from swh.model.from_disk import Content, Directory + + +class PidParamType(click.ParamType): + name = 'persistent identifier' + + def convert(self, value, param, ctx): + try: + pids.parse_persistent_identifier(value) + return value # return as string, as we need just that + except ValidationError as e: + self.fail('%s is not a valid PID. %s.' % (value, e), param, ctx) + + +def pid_of_file(path): + object = Content.from_file(path=path).get_data() + return pids.persistent_identifier(pids.CONTENT, object) + + +def pid_of_dir(path): + object = Directory.from_disk(path=path).get_data() + return pids.persistent_identifier(pids.DIRECTORY, object) + + +@click.command() +@click.option('--type', '-t', default='auto', + type=click.Choice(['auto', 'content', 'directory']), + help='type of object to identify (default: auto)') +@click.option('--verify', '-v', metavar='PID', type=PidParamType(), + help='reference identifier to be compared with computed one') +@click.argument('object', + type=click.Path(exists=True, readable=True, + allow_dash=True, path_type=bytes)) +def identify(type, verify, object): + """Compute the Software Heritage persistent identifier (PID) for a given + source code object. + + For more details about Software Heritage PIDs see: + + \b + https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html + + \b + Examples: + + \b + $ swh-identify /usr/src/linux/kernel/ + swh:1:dir:f9f858a48d663b3809c9e2f336412717496202ab + + \b + $ swh-identify /usr/src/linux/kernel/sched/deadline.c + swh:1:cnt:57b939c81bce5d06fa587df8915f05affbe22b82 + + """ + if type == 'auto': + if os.path.isfile(object): + type = 'content' + elif os.path.isdir(object): + type = 'directory' + else: # shouldn't happen, due to path validation + raise click.BadParameter('%s is neither a file nor a directory' % + object) + + pid = None + if type == 'content': + pid = pid_of_file(object) + elif type == 'directory': + pid = pid_of_dir(object) + else: # shouldn't happen, due to option validation + raise click.BadParameter('invalid object type: ' + type) + + if verify: + if verify == pid: + click.echo('PID match: %s' % pid) + sys.exit(0) + else: + click.echo('PID mismatch: %s != %s' % (verify, pid)) + sys.exit(1) + else: + click.echo(pid) + + +if __name__ == '__main__': + identify() diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py index 0dfbdc34a90c9798b8ce5fdf6d36c8feec2e03d2..3355161689a0329bfac73f696aa4492fc29ac517 100644 --- a/swh/model/hashutil.py +++ b/swh/model/hashutil.py @@ -43,15 +43,61 @@ Subset of :const:`ALGORITHMS`. 
HASH_BLOCK_SIZE = 32768 """Block size for streaming hash computations made in this module""" -# Load blake2 hashes from pyblake2 if they are not available in the builtin -# hashlib -__pyblake2_hashes = {'blake2s256': 'blake2s', - 'blake2b512': 'blake2b'} -__cache = hashlib.__builtin_constructor_cache -for __hash, __pyblake2_fn in __pyblake2_hashes.items(): - if __hash not in hashlib.algorithms_available: - import pyblake2 - __cache[__hash] = getattr(pyblake2, __pyblake2_fn) +_blake2_hash_cache = {} + + +def _new_blake2_hash(algo): + """Return a function that initializes a blake2 hash. + + """ + if algo in _blake2_hash_cache: + return _blake2_hash_cache[algo]() + + lalgo = algo.lower() + if not lalgo.startswith('blake2'): + raise ValueError('Algorithm %s is not a blake2 hash' % algo) + + blake_family = lalgo[:7] + + digest_size = None + if lalgo[7:]: + try: + digest_size, remainder = divmod(int(lalgo[7:]), 8) + except ValueError: + raise ValueError( + 'Unknown digest size for algo %s' % algo + ) from None + if remainder: + raise ValueError( + 'Digest size for algorithm %s must be a multiple of 8' % algo + ) + + if lalgo in hashlib.algorithms_available: + # Handle the case where OpenSSL ships the given algorithm + # (e.g. Python 3.5 on Debian 9 stretch) + _blake2_hash_cache[algo] = lambda: hashlib.new(lalgo) + else: + # Try using the built-in implementation for Python 3.6+ + if blake_family in hashlib.algorithms_available: + blake2 = getattr(hashlib, blake_family) + else: + import pyblake2 + blake2 = getattr(pyblake2, blake_family) + + _blake2_hash_cache[algo] = lambda: blake2(digest_size=digest_size) + + return _blake2_hash_cache[algo]() + + +def _new_hashlib_hash(algo): + """Initialize a digest object from hashlib. + + Handle the swh-specific names for the blake2-related algorithms + """ + if algo.startswith('blake2'): + return _new_blake2_hash(algo) + else: + return hashlib.new(algo) def _new_git_hash(base_algo, git_type, length): @@ -75,7 +121,7 @@ def _new_git_hash(base_algo, git_type, length): a hashutil.hash object """ - h = hashlib.new(base_algo) + h = _new_hashlib_hash(base_algo) git_header = '%s %d\0' % (git_type, length) h.update(git_header.encode('ascii')) @@ -113,7 +159,7 @@ def _new_hash(algo, length=None): base_algo = algo[:-4] return _new_git_hash(base_algo, 'blob', length) - return hashlib.new(algo) + return _new_hashlib_hash(algo) def hash_file(fobj, length=None, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None): diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py index eef7710698e0ec499a62e85e1a507a9371f8fcf5..00471f354424c34584d234c1be756f0d71e6a203 100644 --- a/swh/model/identifiers.py +++ b/swh/model/identifiers.py @@ -7,6 +7,8 @@ import binascii import datetime from functools import lru_cache +from .exceptions import ValidationError +from .fields.hashes import validate_sha1 from .hashutil import hash_data, hash_git_data, DEFAULT_ALGORITHMS from .hashutil import hash_to_hex @@ -603,9 +605,16 @@ def persistent_identifier(type, object, version=1): Args: type (str): Object's type - object (str): Object's dict representation + object (dict/bytes/str): Object's dict representation or object + identifier version (int): persistent identifier version (default to 1) + Raises: + ValidationError (class) in case of: + + invalid type + invalid hash object + Returns: Persistent identifier as string. 
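A short usage sketch of the reworked helpers, with identifier values borrowed from the test changes further down in this diff (the return values shown in comments follow the behaviour implemented above):

```python
from swh.model.identifiers import (
    CONTENT, parse_persistent_identifier, persistent_identifier)

# Both the internal dict representation and a bare identifier (bytes/str)
# are now accepted:
pid = persistent_identifier(
    CONTENT, '94a9ed024d3859793618152ea559a168bbcbb5e2')
# -> 'swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2'

parse_persistent_identifier(pid)
# -> {'namespace': 'swh', 'scheme_version': '1', 'object_type': 'cnt',
#     'object_id': '94a9ed024d3859793618152ea559a168bbcbb5e2', 'metadata': {}}

# Contextual information after ';' ends up under 'metadata':
parse_persistent_identifier(
    'swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d'
    ';origin=deb://Debian/packages/linuxdoc-tools')
# -> 'metadata' == {'origin': 'deb://Debian/packages/linuxdoc-tools'}
```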
@@ -632,11 +641,22 @@ def persistent_identifier(type, object, version=1): 'key_id': 'sha1_git' }, } - o = _map[type] - _hash = hash_to_hex(object[o['key_id']]) + o = _map.get(type) + if not o: + raise ValidationError('Wrong input: Supported types are %s' % ( + list(_map.keys()))) + + if isinstance(object, dict): # internal swh representation resolution + _hash = object[o['key_id']] + else: # client passed direct identifier (bytes/str) + _hash = object + validate_sha1(_hash) # can raise if invalid hash + _hash = hash_to_hex(_hash) return 'swh:%s:%s:%s' % (version, o['short_name'], _hash) +PERSISTENT_IDENTIFIER_TYPES = ['snp', 'rel', 'rev', 'dir', 'cnt'] + PERSISTENT_IDENTIFIER_KEYS = [ 'namespace', 'scheme_version', 'object_type', 'object_id', 'metadata'] @@ -649,6 +669,16 @@ def parse_persistent_identifier(persistent_id): Args: persistent_id (str): A persistent identifier + Raises: + ValidationError (class) in case of: + + missing mandatory values (4) + invalid namespace supplied + invalid version supplied + invalid type supplied + missing hash + invalid hash identifier supplied + Returns: dict: dict with keys : @@ -659,14 +689,47 @@ def parse_persistent_identifier(persistent_id): * metadata, holding dict value """ + # <pid>;<contextual-information> persistent_id_parts = persistent_id.split(PERSISTENT_IDENTIFIER_PARTS_SEP) - data = persistent_id_parts.pop(0).split(':') + pid_data = persistent_id_parts.pop(0).split(':') + + if len(pid_data) != 4: + raise ValidationError( + 'Wrong format: There should be 4 mandatory parameters') + + # Checking for parsing errors + _ns, _version, _type, _id = pid_data + if _ns != 'swh': + raise ValidationError( + 'Wrong format: Supported namespace is \'swh\'') + + if _version != '1': + raise ValidationError( + 'Wrong format: Supported version is 1') + + expected_types = PERSISTENT_IDENTIFIER_TYPES + if _type not in expected_types: + raise ValidationError( + 'Wrong format: Supported types are %s' % ( + ', '.join(expected_types))) + + if not _id: + raise ValidationError( + 'Wrong format: Identifier should be present') + + try: + validate_sha1(_id) + except ValidationError: + raise ValidationError( + 'Wrong format: Identifier should be a valid hash') + persistent_id_metadata = {} for part in persistent_id_parts: try: key, val = part.split('=') persistent_id_metadata[key] = val except Exception: - pass - data.append(persistent_id_metadata) - return dict(zip(PERSISTENT_IDENTIFIER_KEYS, data)) + msg = 'Contextual data is badly formatted, form key=val expected' + raise ValidationError(msg) + pid_data.append(persistent_id_metadata) + return dict(zip(PERSISTENT_IDENTIFIER_KEYS, pid_data)) diff --git a/swh/model/tests/test_cli.py b/swh/model/tests/test_cli.py new file mode 100644 index 0000000000000000000000000000000000000000..054cc0c764f35fa769ec3d051635a3c835f53ea5 --- /dev/null +++ b/swh/model/tests/test_cli.py @@ -0,0 +1,73 @@ +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import os +import tempfile +import unittest + +from click.testing import CliRunner +from nose.plugins.attrib import attr + +from swh.model import cli +from swh.model.tests.test_from_disk import DataMixin +from swh.model.hashutil import hash_to_hex + + +@attr('fs') +class TestIdentify(DataMixin, unittest.TestCase): + + def setUp(self): + super().setUp() + self.runner = CliRunner() + + def 
test_content_id(self): + self.make_contents(self.tmpdir_name) + for filename, content in self.contents.items(): + path = os.path.join(self.tmpdir_name, filename) + result = self.runner.invoke(cli.identify, + ['--type', 'content', path]) + + self.assertEqual(result.exit_code, 0) + self.assertEqual(result.output.rstrip(), + 'swh:1:cnt:' + hash_to_hex(content['sha1_git'])) + + def test_directory_id(self): + self.make_from_tarball(self.tmpdir_name) + path = os.path.join(self.tmpdir_name, b'sample-folder') + result = self.runner.invoke(cli.identify, + ['--type', 'directory', path]) + + self.assertEqual(result.exit_code, 0) + self.assertEqual(result.output.rstrip(), + 'swh:1:dir:e8b0f1466af8608c8a3fb9879db172b887e80759') + + def test_auto_id(self): + with tempfile.NamedTemporaryFile(prefix='swh.model.cli') as f: + result = self.runner.invoke(cli.identify, [f.name]) + self.assertEqual(result.exit_code, 0) + self.assertRegex(result.output, r'^swh:\d+:cnt:') + + with tempfile.TemporaryDirectory(prefix='swh.model.cli') as dirname: + result = self.runner.invoke(cli.identify, [dirname]) + self.assertEqual(result.exit_code, 0) + self.assertRegex(result.output, r'^swh:\d+:dir:') + + def test_verify_content(self): + self.make_contents(self.tmpdir_name) + for filename, content in self.contents.items(): + expected_id = 'swh:1:cnt:' + hash_to_hex(content['sha1_git']) + + # match + path = os.path.join(self.tmpdir_name, filename) + result = self.runner.invoke(cli.identify, + ['--verify', expected_id, path]) + self.assertEqual(result.exit_code, 0) + + # mismatch + with open(path, 'a') as f: + f.write('trailing garbage to make verification fail') + result = self.runner.invoke(cli.identify, + ['--verify', expected_id, path]) + self.assertEqual(result.exit_code, 1) diff --git a/swh/model/tests/test_hashutil.py b/swh/model/tests/test_hashutil.py index 8b883f16bf033376c1b8c7c91fb18e38b9b1b56c..da49af99fe259ba358a233d6fc87e7c6c1e8aa16 100644 --- a/swh/model/tests/test_hashutil.py +++ b/swh/model/tests/test_hashutil.py @@ -3,6 +3,7 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import hashlib import io import os import tempfile @@ -16,6 +17,9 @@ from swh.model import hashutil class Hashutil(unittest.TestCase): def setUp(self): + # Reset function cache + hashutil._blake2_hash_cache = {} + self.data = b'1984\n' self.hex_checksums = { 'sha1': '62be35bf00ff0c624f4a621e2ea5595a049e0731', @@ -150,25 +154,103 @@ class Hashutil(unittest.TestCase): 'expected one of blake2b512, blake2s256, ' 'sha1, sha1_git, sha256') - @patch('swh.model.hashutil.hashlib') + @patch('hashlib.new') @istest - def new_hash_blake2b(self, mock_hashlib): - mock_hashlib.new.return_value = 'some-hashlib-object' + def new_hash_blake2b_blake2b512_builtin(self, mock_hashlib_new): + if 'blake2b512' not in hashlib.algorithms_available: + self.skipTest('blake2b512 not built-in') + mock_hashlib_new.return_value = sentinel = object() h = hashutil._new_hash('blake2b512') - self.assertEquals(h, 'some-hashlib-object') - mock_hashlib.new.assert_called_with('blake2b512') + self.assertIs(h, sentinel) + mock_hashlib_new.assert_called_with('blake2b512') - @patch('swh.model.hashutil.hashlib') + @patch('hashlib.new') @istest - def new_hash_blake2s(self, mock_hashlib): - mock_hashlib.new.return_value = 'some-hashlib-object' + def new_hash_blake2s_blake2s256_builtin(self, mock_hashlib_new): + if 'blake2s256' not in hashlib.algorithms_available: + self.skipTest('blake2s256 not built-in') + 
mock_hashlib_new.return_value = sentinel = object() h = hashutil._new_hash('blake2s256') - self.assertEquals(h, 'some-hashlib-object') - mock_hashlib.new.assert_called_with('blake2s256') + self.assertIs(h, sentinel) + mock_hashlib_new.assert_called_with('blake2s256') + + @istest + def new_hash_blake2b_builtin(self): + removed_hash = False + + try: + if 'blake2b512' in hashlib.algorithms_available: + removed_hash = True + hashlib.algorithms_available.remove('blake2b512') + if 'blake2b' not in hashlib.algorithms_available: + self.skipTest('blake2b not built in') + + with patch('hashlib.blake2b') as mock_blake2b: + mock_blake2b.return_value = sentinel = object() + + h = hashutil._new_hash('blake2b512') + + self.assertIs(h, sentinel) + mock_blake2b.assert_called_with(digest_size=512//8) + finally: + if removed_hash: + hashlib.algorithms_available.add('blake2b512') + + @istest + def new_hash_blake2s_builtin(self): + removed_hash = False + + try: + if 'blake2s256' in hashlib.algorithms_available: + removed_hash = True + hashlib.algorithms_available.remove('blake2s256') + if 'blake2s' not in hashlib.algorithms_available: + self.skipTest('blake2s not built in') + + with patch('hashlib.blake2s') as mock_blake2s: + mock_blake2s.return_value = sentinel = object() + + h = hashutil._new_hash('blake2s256') + + self.assertIs(h, sentinel) + mock_blake2s.assert_called_with(digest_size=256//8) + finally: + if removed_hash: + hashlib.algorithms_available.add('blake2s256') + + @istest + def new_hash_blake2b_pyblake2(self): + if 'blake2b512' in hashlib.algorithms_available: + self.skipTest('blake2b512 built in') + if 'blake2b' in hashlib.algorithms_available: + self.skipTest('blake2b built in') + + with patch('pyblake2.blake2b') as mock_blake2b: + mock_blake2b.return_value = sentinel = object() + + h = hashutil._new_hash('blake2b512') + + self.assertIs(h, sentinel) + mock_blake2b.assert_called_with(digest_size=512//8) + + @istest + def new_hash_blake2s_pyblake2(self): + if 'blake2s256' in hashlib.algorithms_available: + self.skipTest('blake2s256 built in') + if 'blake2s' in hashlib.algorithms_available: + self.skipTest('blake2s built in') + + with patch('pyblake2.blake2s') as mock_blake2s: + mock_blake2s.return_value = sentinel = object() + + h = hashutil._new_hash('blake2s256') + + self.assertIs(h, sentinel) + mock_blake2s.assert_called_with(digest_size=256//8) class HashlibGit(unittest.TestCase): diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py index afe943b6de8699ffb1775c8df135253c3a1d1a42..7daf8e40ac25b68d4cf05cb0ebc455e63bee26b2 100644 --- a/swh/model/tests/test_identifiers.py +++ b/swh/model/tests/test_identifiers.py @@ -11,8 +11,9 @@ from nose.tools import istest from swh.model import hashutil, identifiers +from swh.model.exceptions import ValidationError from swh.model.identifiers import SNAPSHOT, RELEASE, REVISION, DIRECTORY -from swh.model.identifiers import CONTENT +from swh.model.identifiers import CONTENT, PERSISTENT_IDENTIFIER_TYPES class UtilityFunctionsIdentifier(unittest.TestCase): @@ -773,13 +774,29 @@ class SnapshotIdentifier(unittest.TestCase): ) def test_persistent_identifier(self): - _snapshot = {'id': hashutil.hash_to_bytes( - 'c7c108084bc0bf3d81436bf980b46e98bd338453')} - _release = {'id': '22ece559cc7cc2364edc5e5593d63ae8bd229f9f'} - _revision = {'id': '309cf2674ee7a0749978cf8265ab91a60aea0f7d'} - _directory = {'id': 'd198bc9d7a6bcf6db04f476d29314f157507d505'} - _content = {'sha1_git': '94a9ed024d3859793618152ea559a168bbcbb5e2'} + _snapshot_id = 
hashutil.hash_to_bytes( + 'c7c108084bc0bf3d81436bf980b46e98bd338453') + _release_id = '22ece559cc7cc2364edc5e5593d63ae8bd229f9f' + _revision_id = '309cf2674ee7a0749978cf8265ab91a60aea0f7d' + _directory_id = 'd198bc9d7a6bcf6db04f476d29314f157507d505' + _content_id = '94a9ed024d3859793618152ea559a168bbcbb5e2' + _snapshot = {'id': _snapshot_id} + _release = {'id': _release_id} + _revision = {'id': _revision_id} + _directory = {'id': _directory_id} + _content = {'sha1_git': _content_id} + for full_type, _hash, expected_persistent_id, version in [ + (SNAPSHOT, _snapshot_id, + 'swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453', None), + (RELEASE, _release_id, + 'swh:2:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f', 2), + (REVISION, _revision_id, + 'swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d', None), + (DIRECTORY, _directory_id, + 'swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505', None), + (CONTENT, _content_id, + 'swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2', 1), (SNAPSHOT, _snapshot, 'swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453', None), (RELEASE, _release, @@ -789,7 +806,7 @@ class SnapshotIdentifier(unittest.TestCase): (DIRECTORY, _directory, 'swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505', None), (CONTENT, _content, - 'swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2', 1) + 'swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2', 1), ]: if version: actual_value = identifiers.persistent_identifier( @@ -800,12 +817,24 @@ class SnapshotIdentifier(unittest.TestCase): self.assertEquals(actual_value, expected_persistent_id) + def test_persistent_identifier_wrong_input(self): + _snapshot_id = 'notahash4bc0bf3d81436bf980b46e98bd338453' + _snapshot = {'id': _snapshot_id} + + for _type, _hash, _error in [ + (SNAPSHOT, _snapshot_id, 'Unexpected characters'), + (SNAPSHOT, _snapshot, 'Unexpected characters'), + ('foo', '', 'Wrong input: Supported types are'), + ]: + with self.assertRaisesRegex(ValidationError, _error): + identifiers.persistent_identifier(_type, _hash) + def test_parse_persistent_identifier(self): for pid, _type, _version, _hash in [ ('swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2', 'cnt', '1', '94a9ed024d3859793618152ea559a168bbcbb5e2'), - ('swh:2:dir:d198bc9d7a6bcf6db04f476d29314f157507d505', 'dir', - '2', 'd198bc9d7a6bcf6db04f476d29314f157507d505'), + ('swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505', 'dir', + '1', 'd198bc9d7a6bcf6db04f476d29314f157507d505'), ('swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d', 'rev', '1', '309cf2674ee7a0749978cf8265ab91a60aea0f7d'), ('swh:1:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f', 'rel', @@ -834,9 +863,7 @@ class SnapshotIdentifier(unittest.TestCase): 'dir', '1', '0b6959356d30f1a4e9b7f6bca59b9a336464c03d', { 'origin': 'deb://Debian/packages/linuxdoc-tools' - }), - ('swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;invalid;malformed', # noqa - 'dir', '1', '0b6959356d30f1a4e9b7f6bca59b9a336464c03d', {}) + }) ]: expected_result = { 'namespace': 'swh', @@ -847,3 +874,32 @@ class SnapshotIdentifier(unittest.TestCase): } actual_result = identifiers.parse_persistent_identifier(pid) self.assertEquals(actual_result, expected_result) + + def test_parse_persistent_identifier_parsing_error(self): + for pid, _error in [ + ('swh:1:cnt', + 'Wrong format: There should be 4 mandatory parameters'), + ('swh:1:', + 'Wrong format: There should be 4 mandatory parameters'), + ('swh:', + 'Wrong format: There should be 4 mandatory parameters'), + ('swh:1:cnt:', + 'Wrong format: Identifier should be present'), + 
('foo:1:cnt:abc8bc9d7a6bcf6db04f476d29314f157507d505', + 'Wrong format: Supported namespace is \'swh\''), + ('swh:2:dir:def8bc9d7a6bcf6db04f476d29314f157507d505', + 'Wrong format: Supported version is 1'), + ('swh:1:foo:fed8bc9d7a6bcf6db04f476d29314f157507d505', + 'Wrong format: Supported types are %s' % ( + ', '.join(PERSISTENT_IDENTIFIER_TYPES))), + ('swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;invalid;' + 'malformed', + 'Contextual data is badly formatted, form key=val expected'), + ('swh:1:snp:gh6959356d30f1a4e9b7f6bca59b9a336464c03d', + 'Wrong format: Identifier should be a valid hash'), + ('swh:1:snp:foo', + 'Wrong format: Identifier should be a valid hash') + ]: + with self.assertRaisesRegex( + ValidationError, _error): + identifiers.parse_persistent_identifier(pid) diff --git a/version.txt b/version.txt index c0bdf11c06ccb6f33f80e32ef7f296cd2080242e..8af3930e8c604e6de47bdf78d405936146a71cf7 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.23-0-g448eafa \ No newline at end of file +v0.0.24-0-g5eb055d \ No newline at end of file
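To exercise the new `swh-identify` console script end to end, one can drive it the same way the new test_cli.py does; a hedged sketch (the kernel path and expected identifier are the ones quoted in the command's own help text and are placeholders here):

```python
from click.testing import CliRunner

from swh.model import cli

runner = CliRunner()

# Equivalent to `swh-identify --type content <path>` on the command line.
result = runner.invoke(cli.identify, [
    '--type', 'content', '/usr/src/linux/kernel/sched/deadline.c'])
print(result.output.rstrip())
# e.g. swh:1:cnt:57b939c81bce5d06fa587df8915f05affbe22b82

# `--verify PID` prints "PID match"/"PID mismatch" and exits 0 or 1.
result = runner.invoke(cli.identify, [
    '--verify', 'swh:1:cnt:57b939c81bce5d06fa587df8915f05affbe22b82',
    '/usr/src/linux/kernel/sched/deadline.c'])
print(result.exit_code)
```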