diff --git a/PKG-INFO b/PKG-INFO index 8afa9731644afd7cb3d175b5c5735c4eee8aae86..b2ef8133528bcc8e62d761a1c4bcdd2d05f493b0 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 1.0 Name: swh.model -Version: 0.0.23 +Version: 0.0.24 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers diff --git a/README-dev.md b/README-dev.md deleted file mode 100644 index 97f9fb42da360b997cb7cca64cb7866948846930..0000000000000000000000000000000000000000 --- a/README-dev.md +++ /dev/null @@ -1,118 +0,0 @@ -Git sha1 computation --------------------- - -Document to describe how the git sha1 computation takes place. - -### commit/revision - -sha1 git commit/revision computation: - - commit `size`\0 - tree `sha1-git-tree-and-subtree-in-plain-hex-string` - ([parent `commit-parent-n`]) - author `name` <`email`> `date-ts` `date-offset` - committer `name` <`email`> `date-ts` `date-offset` - ([extra-header-key-n extra-header-value-n]) - - `commit-message` - (inline-gpg-signature) - - -Notes: -- [] denotes list of entries (one per line) -- () denotes optional entry. For example, the parent entry is optional. -- empty line at the end of the commit message -- timestamp example: 1444054085 -- date offset for example: +0200, -0100 - -sources: -- commit_tree_extended: https://github.com/git/git/blob/8d530c4d64ffcc853889f7b385f554d53db375ed/commit.c#L1522 -- commit_tree: https://github.com/git/git/blob/8d530c4d64ffcc853889f7b385f554d53db375ed/commit.c#L1392 - -Examples: - -```sh -$ cat commit.txt -tree 85a74718d377195e1efd0843ba4f3260bad4fe07 -parent 01e2d0627a9a6edb24c37db45db5ecb31e9de808 -author Linus Torvalds <torvalds@linux-foundation.org> 1436739030 -0700 -committer Linus Torvalds <torvalds@linux-foundation.org> 1436739030 -0700 -svn-repo-uuid 046f1af7-66c2-d61b-5410-ce57b7db7bff -svn-revision 10 - -Linux 4.2-rc2 -``` - -``` -$ cat commit.txt | git hash-object -t commit --stdin -010d34f384fa99d047cdd5e2f41e56e5c2feee45 -``` - -commit: 44cc742a8ca17b9c279be4cc195a93a6ef7a320e -``` -$ git cat-file -p 44cc742a8ca17b9c279be4cc195a93a6ef7a320e -... 
-tree b134f9b7dc434f593c0bab696345548b37de0558 -parent 689664ae944b4692724f13b709a4e4de28b54e57 -parent c888305e1efbaa252d01b4e5e6b778f865a97514 -author Jiang Xin <worldhello.net@gmail.com> 1428538899 +0800 -committer Jiang Xin <worldhello.net@gmail.com> 1428538899 +0800 -gpgsig -----BEGIN PGP SIGNATURE----- - Version: GnuPG v1.4.13 (Darwin) - - iQIcBAABAgAGBQJVJcYsAAoJEBiY3kIkQRNJVAUQAJ8/XQIfMqqC5oYeEFfHOPYZ - L7qy46bXHVBa9Qd8zAJ2Dou3IbI2ZoF6/Et89K/UggOycMlt5FKV/9toWyuZv4Po - L682wonoxX99qvVTHo6+wtnmYO7+G0f82h+qHMErxjP+I6gzRNBvRr+SfY7VlGdK - wikMKOMWC5smrScSHITnOq1Ews5pe3N7qDYMzK0XVZmgDoaem4RSWMJs4My/qVLN - e0CqYWq2A22GX7sXl6pjneJYQvcAXUX+CAzp24QnPSb+Q22Guj91TcxLFcHCTDdn - qgqMsEyMiisoglwrCbO+D+1xq9mjN9tNFWP66SQ48mrrHYTBV5sz9eJyDfroJaLP - CWgbDTgq6GzRMehHT3hXfYS5NNatjnhkNISXR7pnVP/obIi/vpWh5ll6Gd8q26z+ - a/O41UzOaLTeNI365MWT4/cnXohVLRG7iVJbAbCxoQmEgsYMRc/pBAzWJtLfcB2G - jdTswYL6+MUdL8sB9pZ82D+BP/YAdHe69CyTu1lk9RT2pYtI/kkfjHubXBCYEJSG - +VGllBbYG6idQJpyrOYNRJyrDi9yvDJ2W+S0iQrlZrxzGBVGTB/y65S8C+2WTBcE - lf1Qb5GDsQrZWgD+jtWTywOYHtCBwyCKSAXxSARMbNPeak9WPlcW/Jmu+fUcMe2x - dg1KdHOa34shrKDaOVzW - =od6m - -----END PGP SIGNATURE----- - -Merge branch 'master' of git://github.com/alexhenrie/git-po - -* 'master' of git://github.com/alexhenrie/git-po: - l10n: ca.po: update translation -``` - -### directory/tree - -sha1 git directory/tree computation: - - tree `tree-size`\0 - <file-perm> <file-name>\0<file-sha1-in-20-bytes-string>...<dir-perm> <dir-name>\0<dir-sha1-in-20-bytes-string>... - - -Notes: -- no newline separator between tree entries -- no empty newline at the end of the tree entries -- tree content header size is the length of the content -- The tree entries are ordered according to bytes in their <name> properties. - -Note: Tree entries referencing trees are sorted as if their name have a trailing / -at their end. 
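The `type size\0payload` header rule described in these sections can be checked without git; a rough sketch in Python, reusing the commit.txt example shown earlier (it assumes that file is present in the current directory):

```python
import hashlib

# Sketch only: recompute the sha1 of the commit.txt example above by
# prepending the "commit <size>\0" header described in this section.
with open('commit.txt', 'rb') as f:
    payload = f.read()

header = b'commit %d\x00' % len(payload)
print(hashlib.sha1(header + payload).hexdigest())
# should print 010d34f384fa99d047cdd5e2f41e56e5c2feee45, the same value
# `git hash-object -t commit --stdin` produced above
```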
- -Possible permissions are: -- 100644 - file -- 40000 - directory -- 100755 - executable file -- 120000 - symbolink link -- 160000 - git link (relative to submodule) - -### content/file - -sha1 git content computation: - - blob `blob-size`\0 - `blob-content` - -Notes: -- no newline at the end of the blob content - -Compress with DEFLATE and compute sha1 diff --git a/bin/swh-hash-file b/bin/swh-hash-file deleted file mode 100755 index c30de78f2ebb4cdd7956e762b4310c89e1528939..0000000000000000000000000000000000000000 --- a/bin/swh-hash-file +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/python3 - -# Copyright (C) 2018 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import sys - -from swh.model.from_disk import Content -from swh.model.hashutil import hash_to_hex - - -HASH_ALGO = 'sha1_git' - - -def hash_file(fname): - return hash_to_hex(Content.from_file(path=fname.encode()).hash) - - -def main(fnames): - for f in fnames: - print(f, hash_file(f), sep='\t') - - -if __name__ == '__main__': - fnames = sys.argv[1:] - if not fnames: - print('Usage: swh-hash-file FILE...') - sys.exit(2) - - main(fnames) diff --git a/debian/control b/debian/control index 49c2aae10397ecc8de1ab3502e910edea430994b..8124280f1450a9ed9434eea527e4bdcffa6d6599 100644 --- a/debian/control +++ b/debian/control @@ -6,6 +6,7 @@ Build-Depends: debhelper (>= 9), dh-python (>= 2), python3 (>= 3.5) | python3-pyblake2, python3-all, + python3-click, python3-nose, python3-setuptools, python3-vcversioner diff --git a/docs/data-model.rst b/docs/data-model.rst index f6e4f066a0ffb589d49593163367a41788e755af..fc1639d3cac2ee1f0ab3b80bf0408b689cfd66a1 100644 --- a/docs/data-model.rst +++ b/docs/data-model.rst @@ -3,6 +3,152 @@ Data model ========== +.. note:: The text below is adapted from §7 of the article `Software Heritage: + Why and How to Preserve Software Source Code + <https://hal.archives-ouvertes.fr/hal-01590958/>`_ (in proceedings of `iPRES + 2017 <https://ipres2017.jp/>`_, 14th International Conference on Digital + Preservation, by Roberto Di Cosmo and Stefano Zacchiroli), which also + provides a more general description of Software Heritage for the digital + preservation research community. + +In any archival project the choice of the underlying data model—at the logical +level, independently from how data is actually stored on physical media—is +paramount. The data model adopted by Software Heritage to represent the +information that it collects is centered around the notion of *software +artifact*, described below. + +It is important to notice that according to our principles, we must store with +every software artifact full information on where it has been found +(provenance), that is also captured in our data model, so we start by providing +some basic information on the nature of this provenance information. + + +Source code hosting places +-------------------------- + +Currently, Software Heritage uses of a curated list of source code hosting +places to crawl. The most common entries we expect to place in such a list are +popular collaborative development forges (e.g., GitHub, Bitbucket), package +manager repositories that host source package (e.g., CPAN, npm), and FOSS +distributions (e.g., Fedora, FreeBSD). 
But we may of course also allow more +niche entries, such as URLs of personal or institutional project collections +not hosted on major forges. + +While currently entirely manual, the curation of such a list might easily be +semi-automatic, with entries suggested by fellow archivists and/or concerned +users who want to notify Software Heritage of the need to archive specific +pieces of endangered source code. This approach is entirely compatible with +Web-wide crawling approaches: crawlers capable of detecting the presence of +source code might enrich the list. In both cases the list will remain curated, +with (semi-automated) review processes that entries will need to pass before a hosting +place starts to be used. + + +Software artifacts +------------------ + +Once the hosting places are known, they will need to be periodically looked at +in order to add missing software artifacts to the archive. Which software +artifacts will be found there? + +In general, each software distribution mechanism hosts multiple releases of a +given piece of software at any given time. For VCS (Version Control Systems), this is +the natural behaviour; for software packages, while a single version of a +package is just a snapshot of the corresponding software product, one can often +retrieve both current and past versions of the package from its distribution +site. + +By reviewing and generalizing existing VCS and source package formats, we have +identified the following recurrent artifacts as commonly found at source code +hosting places. They form the basic ingredients of the Software Heritage +archive. As the terminology varies quite a bit from technology to technology, +we provide below both the canonical name used in Software Heritage and popular +synonyms. + +**contents** (AKA "blobs") + the raw content of (source code) files as a sequence of bytes, without file + names or any other metadata. File contents are often recurrent, e.g., across + different versions of the same software, different directories of the same + project, or different projects altogether. + +**directories** + a list of named directory entries, each of which points to other artifacts, + usually file contents or sub-directories. Directory entries are also + associated with arbitrary metadata, which varies with the technology but usually + includes permission bits, modification timestamps, etc. + +**revisions** (AKA "commits") + software development within a specific project is essentially a time-indexed + series of copies of a single "root" directory that contains the entire + project source code. Software evolves when a developer modifies the content + of one or more files in that directory and records their changes. + + Each recorded copy of the root directory is known as a "revision". It points + to a fully-determined directory and is equipped with arbitrary metadata. Some + of those are added manually by the developer (e.g., commit message), others + are automatically synthesized (timestamps, preceding commit(s), etc). + +**releases** (AKA "tags") + some revisions are more equal than others and get selected by developers as + denoting important project milestones known as "releases". Each release + points to the last commit in project history corresponding to the release and + might carry arbitrary metadata—e.g., release name and version, release + message, cryptographic signatures, etc.
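As a rough sketch, the first two artifact kinds map directly onto the `swh.model.from_disk` helpers used by the new command-line tool added later in this diff; the file and directory paths below are hypothetical placeholders:

```python
from swh.model.from_disk import Content, Directory
from swh.model.identifiers import CONTENT, DIRECTORY, persistent_identifier

# from_disk expects bytes paths, as the new CLI below also does.
content = Content.from_file(path=b'hello.c').get_data()
print(persistent_identifier(CONTENT, content))       # swh:1:cnt:...

directory = Directory.from_disk(path=b'hello-project').get_data()
print(persistent_identifier(DIRECTORY, directory))   # swh:1:dir:...
```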
+ + +Additionally, the following crawling-related information is stored as +provenance information in the Software Heritage archive: + +**origins** + code "hosting places" as previously described are usually large platforms + that host several unrelated software projects. For software provenance + purposes it is important to be more specific than that. + + Software origins are fine-grained references to where source code artifacts + archived by Software Heritage have been retrieved from. They take the form of + ``(type, url)`` pairs, where ``url`` is a canonical URL (e.g., the address at + which one can ``git clone`` a repository or download a source tarball) and + ``type`` the kind of software origin (e.g., git, svn, or dsc for Debian + source packages). + +.. + **projects** + as commonly intended are more abstract entities than precise software + origins. Projects relate together several development resources, including + websites, issue trackers, mailing lists, as well as software origins as + intended by Software Heritage. + + The debate around the most apt ontologies to capture project-related + information for software hasn't settled yet, but the place projects will take + in the Software Heritage archive is fairly clear. Projects are abstract + entities, which will be arbitrarily nestable in a versioned + project/sub-project hierarchy, and that can be associated with arbitrary + metadata as well as origins where their source code can be found. + +**snapshots** + any kind of software origin offers multiple pointers to the "current" state + of a development project. In the case of VCS this is reflected by branches + (e.g., master, development, but also so-called feature branches dedicated to + extending the software in a specific direction); in the case of package + distributions by notions such as suites that correspond to different maturity + levels of individual packages (e.g., stable, development, etc.). + + A "snapshot" of a given software origin records all entry points found there + and where each of them was pointing at the time. For example, a snapshot + object might track the commit the master branch was pointing to at any + given time, as well as the most recent release of a given package in the + stable suite of a FOSS distribution. + +**visits** + link together software origins with snapshots. Every time an origin is + consulted a new visit object is created, recording when (according to + the Software Heritage clock) the visit happened and the full snapshot of the + state of the software origin at the time. + + +Data structure +-------------- + .. _swh-merkle-dag: .. figure:: images/swh-merkle-dag.svg :width: 1024px @@ -11,3 +157,101 @@ Data model Software Heritage archive as a Merkle DAG, augmented with crawling information (click to zoom). + +With all the bits of what we want to archive in place, the next question is how +to organize them, i.e., which logical data structure to adopt for their +storage. A key observation for this decision is that source code artifacts are +massively duplicated.
This is so for several reasons: + +* code hosting diaspora (i.e., project development moving to the most + recent/cool collaborative development technology over time); +* copy/paste (AKA "vendoring") of parts or entire external FOSS software + components into other software products; +* large overlap between revisions of the same project: usually only a very + small number of files/directories are modified by a single commit; +* emergence of DVCS (distributed version control systems), which natively work + by replicating entire repository copies around. GitHub-style pull requests + are the pinnacle of this, as they result in creating an additional repository + copy at each change done by a new developer; +* migration from one VCS to another—e.g., migrations from Subversion to Git, + which are really popular these days—resulting in additional copies, but in a + different distribution format, of the very same development histories. + +These trends seem to be neither stopping nor slowing down, and it is reasonable +to expect that they will be even more prominent in the future, due to the +decreasing costs of storage and bandwidth. + +For this reason we argue that any sustainable storage layout for archiving +source code in the very long term should support deduplication, allowing the cost +of storing source code artifacts that are encountered more than +once to be paid only once. For storage efficiency, deduplication should be supported for +all the software artifacts we have discussed, namely: file contents, +directories, revisions, releases, snapshots. + +Realizing that principle, the Software Heritage archive is conceptually a +single (big) `Merkle Directed Acyclic Graph (DAG) +<https://en.wikipedia.org/wiki/Merkle_tree>`_, as depicted in Figure +:ref:`Software Heritage Merkle DAG <swh-merkle-dag>`. In such a graph each of +the artifacts we have described—from file contents up to entire +snapshots—corresponds to a node. Edges between nodes emerge naturally: +directory entries point to other directories or file contents; revisions point +to directories and previous revisions; releases point to revisions; snapshots +point to revisions and releases. Additionally, each node contains all metadata +that is specific to the node itself rather than to pointed nodes; e.g., commit +messages, timestamps, or file names. Note that the structure is really a DAG, +and not a tree, due to the fact that lines of revision nodes might be +forked and merged back. + +.. + directory: fff3cc22cb40f71d26f736c082326e77de0b7692 + parent: e4feb05112588741b4764739d6da756c357e1f37 + author: Stefano Zacchiroli <zack@upsilon.cc> + date: 1443617461 +0200 + committer: Stefano Zacchiroli <zack@upsilon.cc> + commiter_date: 1443617461 +0200 + message: + objstorage: fix tempfile race when adding objects + + Before this change, two workers adding the same + object will end up racing to write <SHA1>.tmp. + [...] + + revisionid: 64a783216c1ec69dcb267449c0bbf5e54f7c4d6d + A revision node in the Software Heritage DAG + +In a Merkle structure each node is identified by an intrinsic identifier +computed as a cryptographic hash of the node content. In the case of Software +Heritage, identifiers are computed taking into account both node-specific +metadata and the identifiers of child nodes. + +Consider the revision node in the picture whose identifier starts with +`c7640e08d..`. It points to a directory (identifier starting with +`45f0c078..`), which has also been archived.
That directory contains a full +copy, at a specific point in time, of a software component—in the example the +`Hello World <https://forge.softwareheritage.org/source/helloworld/>`_ software +component available on our forge. The revision node also points to the +preceding revision node (`43ef7dcd..`) in the project development history. +Finally, the node contains revision-specific metadata, such as the author and +committer of the given change, its timestamps, and the message entered by the +author at commit time. + +The identifier of the revision node itself (`c7640e08d..`) is computed as a +cryptographic hash of a (canonical representation of) all the information shown +in the figure. A change in any of them—metadata and/or pointed nodes—would result +in an entirely different node identifier. All other types of nodes in the +Software Heritage archive behave similarly. + +The Software Heritage archive inherits useful properties from the underlying +Merkle structure. In particular, deduplication is built-in. Any software +artifact encountered in the wild gets added to the archive only if a +corresponding node with a matching intrinsic identifier is not already +available in the graph—file contents, commits, entire directories or project +snapshots are all deduplicated, incurring storage costs only once. + +Furthermore, as a side effect of this data model choice, the entire development +history of all the source code archived in Software Heritage—which aims to +match all published source code in the world—is available as a unified whole, +making emergent structures such as code reuse across different projects or +software origins readily available. Further reinforcing the Software Heritage +use cases, this object could become a veritable "map of the stars" of our +entire software commons. diff --git a/docs/persistent-identifiers.rst b/docs/persistent-identifiers.rst index 7f41d610561b80a288aa3719eeeca543bc92848d..29bf797e144d0a0ba19293caf3aca6f0497b55f9 100644 --- a/docs/persistent-identifiers.rst +++ b/docs/persistent-identifiers.rst @@ -47,8 +47,8 @@ entry point of the grammar: | "cnt" (* content *) ; <object_id> ::= 40 * <hex_digit> ; (* intrinsic object id, as hex-encoded SHA1 *) - <hex_digit> ::= "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" - | "a" | "b" | "c" | "d" | "e" | "f" ; + <dec_digit> ::= "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" + <hex_digit> ::= <dec_digit> | "a" | "b" | "c" | "d" | "e" | "f" ; Semantics @@ -134,12 +134,60 @@ Resolution Persistent identifiers can be resolved using the Software Heritage Web application (see :py:mod:`swh.web`). -In particular, the ``/browse/`` endpoint can be given a persistent identifier -and will lead to the browsing page of the corresponding object, like this: -``https://archive.softwareheritage.org/browse/<identifier>``. For example: +In particular, the root endpoint ``/`` can be given a persistent identifier and +will lead to the browsing page of the corresponding object, like this: +``https://archive.softwareheritage.org/<identifier>``.
For example: -* `<https://archive.softwareheritage.org/browse/swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2>`_ -* `<https://archive.softwareheritage.org/browse/swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505>`_ -* `<https://archive.softwareheritage.org/browse/swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d>`_ -* `<https://archive.softwareheritage.org/browse/swh:1:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f>`_ -* `<https://archive.softwareheritage.org/browse/swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453>`_ +* `<https://archive.softwareheritage.org/swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2>`_ +* `<https://archive.softwareheritage.org/swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505>`_ +* `<https://archive.softwareheritage.org/swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d>`_ +* `<https://archive.softwareheritage.org/swh:1:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f>`_ +* `<https://archive.softwareheritage.org/swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453>`_ + + +Contextual information +====================== + +It is often useful to complement persistent identifiers with **contextual +information** about where the identified object has been found as well as which +specific parts of it are of interest. To that end it is possible, via a +dedicated syntax, to extend persistent identifiers with the following pieces of +information: + +* the **software origin** where an object has been found/observed +* the **line number(s)** of interest, usually within a content object + + +Syntax +------ + +The full-syntax to complement identifiers with contextual information is given +by the ``<identifier_with_context>`` entry point of the grammar: + +.. code-block:: bnf + + <identifier_with_context> ::= <identifier> [<lines_ctxt>] [<origin_ctxt>] + <lines_ctxt> ::= ";" "lines" "=" <line_number> ["-" <line_number>] + <origin_ctxt> ::= ";" "origin" "=" <url> + <line_number> ::= <dec_digit> + + <url> ::= (* RFC 3986 compliant URLs *) + + +Semantics +--------- + +``;`` is used as separator between persistent identifiers and additional +optional contextual information. Each piece of contextual information is +specified as a key/value pair, using ``=`` as a separator. + +The following piece of contextual information are supported: + +* line numbers: it is possible to specify a single line number or a line range, + separating two numbers with ``-``. Note that line numbers are purely + indicative and are not meant to be stable, as in some degenerate cases + (e.g., text files which mix different types of line terminators) it is + impossible to resolve them unambiguously. + +* software origin: where a given object has been found or observed in the wild, + as the URI that was used by Software Heritage to ingest the object into the + archive diff --git a/requirements.txt b/requirements.txt index 151b92672f2d12b3b1eca2a026d388b86f03010c..447def3ef89e2d70a432a96982e62af6142e9d71 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ # should match https://pypi.python.org/pypi names. 
For the full spec or # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html vcversioner +Click diff --git a/setup.py b/setup.py index 232f3bc236391f21c8be9bed35dcd97f590890df..7e4a47ceb5549af703f343344c5e3161177d948b 100644 --- a/setup.py +++ b/setup.py @@ -17,9 +17,21 @@ def parse_requirements(): extra_requirements = [] - -pyblake2_hashes = {'blake2s256', 'blake2b512'} -if pyblake2_hashes - set(hashlib.algorithms_available): +pyblake2_hash_sets = [ + # Built-in implementation in Python 3.6+ + {'blake2s', 'blake2b'}, + # Potentially shipped by OpenSSL 1.1 (e.g. Python 3.5 in Debian stretch + # has these) + {'blake2s256', 'blake2b512'}, +] + +for pyblake2_hashes in pyblake2_hash_sets: + if not pyblake2_hashes - set(hashlib.algorithms_available): + # The required blake2 hashes have been found + break +else: + # None of the possible sets of blake2 hashes are available. + # use pyblake2 instead extra_requirements.append('pyblake2') setup( @@ -31,6 +43,10 @@ setup( packages=find_packages(), # packages's modules scripts=[], # scripts to package install_requires=parse_requirements() + extra_requirements, + entry_points=''' + [console_scripts] + swh-identify=swh.model.cli:identify + ''', setup_requires=['vcversioner'], vcversioner={}, include_package_data=True, diff --git a/swh.model.egg-info/PKG-INFO b/swh.model.egg-info/PKG-INFO index 8afa9731644afd7cb3d175b5c5735c4eee8aae86..b2ef8133528bcc8e62d761a1c4bcdd2d05f493b0 100644 --- a/swh.model.egg-info/PKG-INFO +++ b/swh.model.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 1.0 Name: swh.model -Version: 0.0.23 +Version: 0.0.24 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers diff --git a/swh.model.egg-info/SOURCES.txt b/swh.model.egg-info/SOURCES.txt index 4b2269ff4fe5c7dfd1b6889241146897135bb07b..f6a2f94098b482c8b10045483d3c375f3165ad33 100644 --- a/swh.model.egg-info/SOURCES.txt +++ b/swh.model.egg-info/SOURCES.txt @@ -4,13 +4,11 @@ LICENSE MANIFEST.in Makefile Makefile.local -README-dev.md requirements-swh.txt requirements.txt setup.py version.txt bin/git-revhash -bin/swh-hash-file bin/swh-revhash debian/changelog debian/compat @@ -33,9 +31,11 @@ swh/__init__.py swh.model.egg-info/PKG-INFO swh.model.egg-info/SOURCES.txt swh.model.egg-info/dependency_links.txt +swh.model.egg-info/entry_points.txt swh.model.egg-info/requires.txt swh.model.egg-info/top_level.txt swh/model/__init__.py +swh/model/cli.py swh/model/exceptions.py swh/model/from_disk.py swh/model/hashutil.py @@ -49,6 +49,7 @@ swh/model/fields/hashes.py swh/model/fields/simple.py swh/model/tests/__init__.py swh/model/tests/generate_testdata_from_disk.py +swh/model/tests/test_cli.py swh/model/tests/test_from_disk.py swh/model/tests/test_hashutil.py swh/model/tests/test_identifiers.py diff --git a/swh.model.egg-info/entry_points.txt b/swh.model.egg-info/entry_points.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c74f3660b342d539bac70c1973a1cea205cbb7a --- /dev/null +++ b/swh.model.egg-info/entry_points.txt @@ -0,0 +1,4 @@ + + [console_scripts] + swh-identify=swh.model.cli:identify + \ No newline at end of file diff --git a/swh.model.egg-info/requires.txt b/swh.model.egg-info/requires.txt index 39a323addb39c408716b8874ef828acd3c4da427..dbcd3082d3efe925da6522cc818c882fb36e74f8 100644 --- a/swh.model.egg-info/requires.txt +++ b/swh.model.egg-info/requires.txt @@ -1 +1,2 @@ +Click vcversioner diff --git a/swh/model/cli.py b/swh/model/cli.py new file mode 
100644 index 0000000000000000000000000000000000000000..5996d19a196907f85da94bf6107686735f10fd12 --- /dev/null +++ b/swh/model/cli.py @@ -0,0 +1,95 @@ +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import click +import os +import sys + +from swh.model import identifiers as pids +from swh.model.exceptions import ValidationError +from swh.model.from_disk import Content, Directory + + +class PidParamType(click.ParamType): + name = 'persistent identifier' + + def convert(self, value, param, ctx): + try: + pids.parse_persistent_identifier(value) + return value # return as string, as we need just that + except ValidationError as e: + self.fail('%s is not a valid PID. %s.' % (value, e), param, ctx) + + +def pid_of_file(path): + object = Content.from_file(path=path).get_data() + return pids.persistent_identifier(pids.CONTENT, object) + + +def pid_of_dir(path): + object = Directory.from_disk(path=path).get_data() + return pids.persistent_identifier(pids.DIRECTORY, object) + + +@click.command() +@click.option('--type', '-t', default='auto', + type=click.Choice(['auto', 'content', 'directory']), + help='type of object to identify (default: auto)') +@click.option('--verify', '-v', metavar='PID', type=PidParamType(), + help='reference identifier to be compared with computed one') +@click.argument('object', + type=click.Path(exists=True, readable=True, + allow_dash=True, path_type=bytes)) +def identify(type, verify, object): + """Compute the Software Heritage persistent identifier (PID) for a given + source code object. + + For more details about Software Heritage PIDs see: + + \b + https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html + + \b + Examples: + + \b + $ swh-identify /usr/src/linux/kernel/ + swh:1:dir:f9f858a48d663b3809c9e2f336412717496202ab + + \b + $ swh-identify /usr/src/linux/kernel/sched/deadline.c + swh:1:cnt:57b939c81bce5d06fa587df8915f05affbe22b82 + + """ + if type == 'auto': + if os.path.isfile(object): + type = 'content' + elif os.path.isdir(object): + type = 'directory' + else: # shouldn't happen, due to path validation + raise click.BadParameter('%s is neither a file nor a directory' % + object) + + pid = None + if type == 'content': + pid = pid_of_file(object) + elif type == 'directory': + pid = pid_of_dir(object) + else: # shouldn't happen, due to option validation + raise click.BadParameter('invalid object type: ' + type) + + if verify: + if verify == pid: + click.echo('PID match: %s' % pid) + sys.exit(0) + else: + click.echo('PID mismatch: %s != %s' % (verify, pid)) + sys.exit(1) + else: + click.echo(pid) + + +if __name__ == '__main__': + identify() diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py index 0dfbdc34a90c9798b8ce5fdf6d36c8feec2e03d2..3355161689a0329bfac73f696aa4492fc29ac517 100644 --- a/swh/model/hashutil.py +++ b/swh/model/hashutil.py @@ -43,15 +43,61 @@ Subset of :const:`ALGORITHMS`. 
HASH_BLOCK_SIZE = 32768 """Block size for streaming hash computations made in this module""" -# Load blake2 hashes from pyblake2 if they are not available in the builtin -# hashlib -__pyblake2_hashes = {'blake2s256': 'blake2s', - 'blake2b512': 'blake2b'} -__cache = hashlib.__builtin_constructor_cache -for __hash, __pyblake2_fn in __pyblake2_hashes.items(): - if __hash not in hashlib.algorithms_available: - import pyblake2 - __cache[__hash] = getattr(pyblake2, __pyblake2_fn) +_blake2_hash_cache = {} + + +def _new_blake2_hash(algo): + """Return a function that initializes a blake2 hash. + + """ + if algo in _blake2_hash_cache: + return _blake2_hash_cache[algo]() + + lalgo = algo.lower() + if not lalgo.startswith('blake2'): + raise ValueError('Algorithm %s is not a blake2 hash' % algo) + + blake_family = lalgo[:7] + + digest_size = None + if lalgo[7:]: + try: + digest_size, remainder = divmod(int(lalgo[7:]), 8) + except ValueError: + raise ValueError( + 'Unknown digest size for algo %s' % algo + ) from None + if remainder: + raise ValueError( + 'Digest size for algorithm %s must be a multiple of 8' % algo + ) + + if lalgo in hashlib.algorithms_available: + # Handle the case where OpenSSL ships the given algorithm + # (e.g. Python 3.5 on Debian 9 stretch) + _blake2_hash_cache[algo] = lambda: hashlib.new(lalgo) + else: + # Try using the built-in implementation for Python 3.6+ + if blake_family in hashlib.algorithms_available: + blake2 = getattr(hashlib, blake_family) + else: + import pyblake2 + blake2 = getattr(pyblake2, blake_family) + + _blake2_hash_cache[algo] = lambda: blake2(digest_size=digest_size) + + return _blake2_hash_cache[algo]() + + +def _new_hashlib_hash(algo): + """Initialize a digest object from hashlib. + + Handle the swh-specific names for the blake2-related algorithms + """ + if algo.startswith('blake2'): + return _new_blake2_hash(algo) + else: + return hashlib.new(algo) def _new_git_hash(base_algo, git_type, length): @@ -75,7 +121,7 @@ def _new_git_hash(base_algo, git_type, length): a hashutil.hash object """ - h = hashlib.new(base_algo) + h = _new_hashlib_hash(base_algo) git_header = '%s %d\0' % (git_type, length) h.update(git_header.encode('ascii')) @@ -113,7 +159,7 @@ def _new_hash(algo, length=None): base_algo = algo[:-4] return _new_git_hash(base_algo, 'blob', length) - return hashlib.new(algo) + return _new_hashlib_hash(algo) def hash_file(fobj, length=None, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None): diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py index eef7710698e0ec499a62e85e1a507a9371f8fcf5..00471f354424c34584d234c1be756f0d71e6a203 100644 --- a/swh/model/identifiers.py +++ b/swh/model/identifiers.py @@ -7,6 +7,8 @@ import binascii import datetime from functools import lru_cache +from .exceptions import ValidationError +from .fields.hashes import validate_sha1 from .hashutil import hash_data, hash_git_data, DEFAULT_ALGORITHMS from .hashutil import hash_to_hex @@ -603,9 +605,16 @@ def persistent_identifier(type, object, version=1): Args: type (str): Object's type - object (str): Object's dict representation + object (dict/bytes/str): Object's dict representation or object + identifier version (int): persistent identifier version (default to 1) + Raises: + ValidationError (class) in case of: + + invalid type + invalid hash object + Returns: Persistent identifier as string. 
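A short usage sketch of the reworked helpers, with identifier values borrowed from the test changes further down in this diff (the return values shown in comments follow the behaviour implemented above):

```python
from swh.model.identifiers import (
    CONTENT, parse_persistent_identifier, persistent_identifier)

# Both the internal dict representation and a bare identifier (bytes/str)
# are now accepted:
pid = persistent_identifier(
    CONTENT, '94a9ed024d3859793618152ea559a168bbcbb5e2')
# -> 'swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2'

parse_persistent_identifier(pid)
# -> {'namespace': 'swh', 'scheme_version': '1', 'object_type': 'cnt',
#     'object_id': '94a9ed024d3859793618152ea559a168bbcbb5e2', 'metadata': {}}

# Contextual information after ';' ends up under 'metadata':
parse_persistent_identifier(
    'swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d'
    ';origin=deb://Debian/packages/linuxdoc-tools')
# -> 'metadata' == {'origin': 'deb://Debian/packages/linuxdoc-tools'}
```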
@@ -632,11 +641,22 @@ def persistent_identifier(type, object, version=1): 'key_id': 'sha1_git' }, } - o = _map[type] - _hash = hash_to_hex(object[o['key_id']]) + o = _map.get(type) + if not o: + raise ValidationError('Wrong input: Supported types are %s' % ( + list(_map.keys()))) + + if isinstance(object, dict): # internal swh representation resolution + _hash = object[o['key_id']] + else: # client passed direct identifier (bytes/str) + _hash = object + validate_sha1(_hash) # can raise if invalid hash + _hash = hash_to_hex(_hash) return 'swh:%s:%s:%s' % (version, o['short_name'], _hash) +PERSISTENT_IDENTIFIER_TYPES = ['snp', 'rel', 'rev', 'dir', 'cnt'] + PERSISTENT_IDENTIFIER_KEYS = [ 'namespace', 'scheme_version', 'object_type', 'object_id', 'metadata'] @@ -649,6 +669,16 @@ def parse_persistent_identifier(persistent_id): Args: persistent_id (str): A persistent identifier + Raises: + ValidationError (class) in case of: + + missing mandatory values (4) + invalid namespace supplied + invalid version supplied + invalid type supplied + missing hash + invalid hash identifier supplied + Returns: dict: dict with keys : @@ -659,14 +689,47 @@ def parse_persistent_identifier(persistent_id): * metadata, holding dict value """ + # <pid>;<contextual-information> persistent_id_parts = persistent_id.split(PERSISTENT_IDENTIFIER_PARTS_SEP) - data = persistent_id_parts.pop(0).split(':') + pid_data = persistent_id_parts.pop(0).split(':') + + if len(pid_data) != 4: + raise ValidationError( + 'Wrong format: There should be 4 mandatory parameters') + + # Checking for parsing errors + _ns, _version, _type, _id = pid_data + if _ns != 'swh': + raise ValidationError( + 'Wrong format: Supported namespace is \'swh\'') + + if _version != '1': + raise ValidationError( + 'Wrong format: Supported version is 1') + + expected_types = PERSISTENT_IDENTIFIER_TYPES + if _type not in expected_types: + raise ValidationError( + 'Wrong format: Supported types are %s' % ( + ', '.join(expected_types))) + + if not _id: + raise ValidationError( + 'Wrong format: Identifier should be present') + + try: + validate_sha1(_id) + except ValidationError: + raise ValidationError( + 'Wrong format: Identifier should be a valid hash') + persistent_id_metadata = {} for part in persistent_id_parts: try: key, val = part.split('=') persistent_id_metadata[key] = val except Exception: - pass - data.append(persistent_id_metadata) - return dict(zip(PERSISTENT_IDENTIFIER_KEYS, data)) + msg = 'Contextual data is badly formatted, form key=val expected' + raise ValidationError(msg) + pid_data.append(persistent_id_metadata) + return dict(zip(PERSISTENT_IDENTIFIER_KEYS, pid_data)) diff --git a/swh/model/tests/test_cli.py b/swh/model/tests/test_cli.py new file mode 100644 index 0000000000000000000000000000000000000000..054cc0c764f35fa769ec3d051635a3c835f53ea5 --- /dev/null +++ b/swh/model/tests/test_cli.py @@ -0,0 +1,73 @@ +# Copyright (C) 2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import os +import tempfile +import unittest + +from click.testing import CliRunner +from nose.plugins.attrib import attr + +from swh.model import cli +from swh.model.tests.test_from_disk import DataMixin +from swh.model.hashutil import hash_to_hex + + +@attr('fs') +class TestIdentify(DataMixin, unittest.TestCase): + + def setUp(self): + super().setUp() + self.runner = CliRunner() + + def 
test_content_id(self): + self.make_contents(self.tmpdir_name) + for filename, content in self.contents.items(): + path = os.path.join(self.tmpdir_name, filename) + result = self.runner.invoke(cli.identify, + ['--type', 'content', path]) + + self.assertEqual(result.exit_code, 0) + self.assertEqual(result.output.rstrip(), + 'swh:1:cnt:' + hash_to_hex(content['sha1_git'])) + + def test_directory_id(self): + self.make_from_tarball(self.tmpdir_name) + path = os.path.join(self.tmpdir_name, b'sample-folder') + result = self.runner.invoke(cli.identify, + ['--type', 'directory', path]) + + self.assertEqual(result.exit_code, 0) + self.assertEqual(result.output.rstrip(), + 'swh:1:dir:e8b0f1466af8608c8a3fb9879db172b887e80759') + + def test_auto_id(self): + with tempfile.NamedTemporaryFile(prefix='swh.model.cli') as f: + result = self.runner.invoke(cli.identify, [f.name]) + self.assertEqual(result.exit_code, 0) + self.assertRegex(result.output, r'^swh:\d+:cnt:') + + with tempfile.TemporaryDirectory(prefix='swh.model.cli') as dirname: + result = self.runner.invoke(cli.identify, [dirname]) + self.assertEqual(result.exit_code, 0) + self.assertRegex(result.output, r'^swh:\d+:dir:') + + def test_verify_content(self): + self.make_contents(self.tmpdir_name) + for filename, content in self.contents.items(): + expected_id = 'swh:1:cnt:' + hash_to_hex(content['sha1_git']) + + # match + path = os.path.join(self.tmpdir_name, filename) + result = self.runner.invoke(cli.identify, + ['--verify', expected_id, path]) + self.assertEqual(result.exit_code, 0) + + # mismatch + with open(path, 'a') as f: + f.write('trailing garbage to make verification fail') + result = self.runner.invoke(cli.identify, + ['--verify', expected_id, path]) + self.assertEqual(result.exit_code, 1) diff --git a/swh/model/tests/test_hashutil.py b/swh/model/tests/test_hashutil.py index 8b883f16bf033376c1b8c7c91fb18e38b9b1b56c..da49af99fe259ba358a233d6fc87e7c6c1e8aa16 100644 --- a/swh/model/tests/test_hashutil.py +++ b/swh/model/tests/test_hashutil.py @@ -3,6 +3,7 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import hashlib import io import os import tempfile @@ -16,6 +17,9 @@ from swh.model import hashutil class Hashutil(unittest.TestCase): def setUp(self): + # Reset function cache + hashutil._blake2_hash_cache = {} + self.data = b'1984\n' self.hex_checksums = { 'sha1': '62be35bf00ff0c624f4a621e2ea5595a049e0731', @@ -150,25 +154,103 @@ class Hashutil(unittest.TestCase): 'expected one of blake2b512, blake2s256, ' 'sha1, sha1_git, sha256') - @patch('swh.model.hashutil.hashlib') + @patch('hashlib.new') @istest - def new_hash_blake2b(self, mock_hashlib): - mock_hashlib.new.return_value = 'some-hashlib-object' + def new_hash_blake2b_blake2b512_builtin(self, mock_hashlib_new): + if 'blake2b512' not in hashlib.algorithms_available: + self.skipTest('blake2b512 not built-in') + mock_hashlib_new.return_value = sentinel = object() h = hashutil._new_hash('blake2b512') - self.assertEquals(h, 'some-hashlib-object') - mock_hashlib.new.assert_called_with('blake2b512') + self.assertIs(h, sentinel) + mock_hashlib_new.assert_called_with('blake2b512') - @patch('swh.model.hashutil.hashlib') + @patch('hashlib.new') @istest - def new_hash_blake2s(self, mock_hashlib): - mock_hashlib.new.return_value = 'some-hashlib-object' + def new_hash_blake2s_blake2s256_builtin(self, mock_hashlib_new): + if 'blake2s256' not in hashlib.algorithms_available: + self.skipTest('blake2s256 not built-in') + 
mock_hashlib_new.return_value = sentinel = object() h = hashutil._new_hash('blake2s256') - self.assertEquals(h, 'some-hashlib-object') - mock_hashlib.new.assert_called_with('blake2s256') + self.assertIs(h, sentinel) + mock_hashlib_new.assert_called_with('blake2s256') + + @istest + def new_hash_blake2b_builtin(self): + removed_hash = False + + try: + if 'blake2b512' in hashlib.algorithms_available: + removed_hash = True + hashlib.algorithms_available.remove('blake2b512') + if 'blake2b' not in hashlib.algorithms_available: + self.skipTest('blake2b not built in') + + with patch('hashlib.blake2b') as mock_blake2b: + mock_blake2b.return_value = sentinel = object() + + h = hashutil._new_hash('blake2b512') + + self.assertIs(h, sentinel) + mock_blake2b.assert_called_with(digest_size=512//8) + finally: + if removed_hash: + hashlib.algorithms_available.add('blake2b512') + + @istest + def new_hash_blake2s_builtin(self): + removed_hash = False + + try: + if 'blake2s256' in hashlib.algorithms_available: + removed_hash = True + hashlib.algorithms_available.remove('blake2s256') + if 'blake2s' not in hashlib.algorithms_available: + self.skipTest('blake2s not built in') + + with patch('hashlib.blake2s') as mock_blake2s: + mock_blake2s.return_value = sentinel = object() + + h = hashutil._new_hash('blake2s256') + + self.assertIs(h, sentinel) + mock_blake2s.assert_called_with(digest_size=256//8) + finally: + if removed_hash: + hashlib.algorithms_available.add('blake2s256') + + @istest + def new_hash_blake2b_pyblake2(self): + if 'blake2b512' in hashlib.algorithms_available: + self.skipTest('blake2b512 built in') + if 'blake2b' in hashlib.algorithms_available: + self.skipTest('blake2b built in') + + with patch('pyblake2.blake2b') as mock_blake2b: + mock_blake2b.return_value = sentinel = object() + + h = hashutil._new_hash('blake2b512') + + self.assertIs(h, sentinel) + mock_blake2b.assert_called_with(digest_size=512//8) + + @istest + def new_hash_blake2s_pyblake2(self): + if 'blake2s256' in hashlib.algorithms_available: + self.skipTest('blake2s256 built in') + if 'blake2s' in hashlib.algorithms_available: + self.skipTest('blake2s built in') + + with patch('pyblake2.blake2s') as mock_blake2s: + mock_blake2s.return_value = sentinel = object() + + h = hashutil._new_hash('blake2s256') + + self.assertIs(h, sentinel) + mock_blake2s.assert_called_with(digest_size=256//8) class HashlibGit(unittest.TestCase): diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py index afe943b6de8699ffb1775c8df135253c3a1d1a42..7daf8e40ac25b68d4cf05cb0ebc455e63bee26b2 100644 --- a/swh/model/tests/test_identifiers.py +++ b/swh/model/tests/test_identifiers.py @@ -11,8 +11,9 @@ from nose.tools import istest from swh.model import hashutil, identifiers +from swh.model.exceptions import ValidationError from swh.model.identifiers import SNAPSHOT, RELEASE, REVISION, DIRECTORY -from swh.model.identifiers import CONTENT +from swh.model.identifiers import CONTENT, PERSISTENT_IDENTIFIER_TYPES class UtilityFunctionsIdentifier(unittest.TestCase): @@ -773,13 +774,29 @@ class SnapshotIdentifier(unittest.TestCase): ) def test_persistent_identifier(self): - _snapshot = {'id': hashutil.hash_to_bytes( - 'c7c108084bc0bf3d81436bf980b46e98bd338453')} - _release = {'id': '22ece559cc7cc2364edc5e5593d63ae8bd229f9f'} - _revision = {'id': '309cf2674ee7a0749978cf8265ab91a60aea0f7d'} - _directory = {'id': 'd198bc9d7a6bcf6db04f476d29314f157507d505'} - _content = {'sha1_git': '94a9ed024d3859793618152ea559a168bbcbb5e2'} + _snapshot_id = 
hashutil.hash_to_bytes( + 'c7c108084bc0bf3d81436bf980b46e98bd338453') + _release_id = '22ece559cc7cc2364edc5e5593d63ae8bd229f9f' + _revision_id = '309cf2674ee7a0749978cf8265ab91a60aea0f7d' + _directory_id = 'd198bc9d7a6bcf6db04f476d29314f157507d505' + _content_id = '94a9ed024d3859793618152ea559a168bbcbb5e2' + _snapshot = {'id': _snapshot_id} + _release = {'id': _release_id} + _revision = {'id': _revision_id} + _directory = {'id': _directory_id} + _content = {'sha1_git': _content_id} + for full_type, _hash, expected_persistent_id, version in [ + (SNAPSHOT, _snapshot_id, + 'swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453', None), + (RELEASE, _release_id, + 'swh:2:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f', 2), + (REVISION, _revision_id, + 'swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d', None), + (DIRECTORY, _directory_id, + 'swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505', None), + (CONTENT, _content_id, + 'swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2', 1), (SNAPSHOT, _snapshot, 'swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453', None), (RELEASE, _release, @@ -789,7 +806,7 @@ class SnapshotIdentifier(unittest.TestCase): (DIRECTORY, _directory, 'swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505', None), (CONTENT, _content, - 'swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2', 1) + 'swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2', 1), ]: if version: actual_value = identifiers.persistent_identifier( @@ -800,12 +817,24 @@ class SnapshotIdentifier(unittest.TestCase): self.assertEquals(actual_value, expected_persistent_id) + def test_persistent_identifier_wrong_input(self): + _snapshot_id = 'notahash4bc0bf3d81436bf980b46e98bd338453' + _snapshot = {'id': _snapshot_id} + + for _type, _hash, _error in [ + (SNAPSHOT, _snapshot_id, 'Unexpected characters'), + (SNAPSHOT, _snapshot, 'Unexpected characters'), + ('foo', '', 'Wrong input: Supported types are'), + ]: + with self.assertRaisesRegex(ValidationError, _error): + identifiers.persistent_identifier(_type, _hash) + def test_parse_persistent_identifier(self): for pid, _type, _version, _hash in [ ('swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2', 'cnt', '1', '94a9ed024d3859793618152ea559a168bbcbb5e2'), - ('swh:2:dir:d198bc9d7a6bcf6db04f476d29314f157507d505', 'dir', - '2', 'd198bc9d7a6bcf6db04f476d29314f157507d505'), + ('swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505', 'dir', + '1', 'd198bc9d7a6bcf6db04f476d29314f157507d505'), ('swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d', 'rev', '1', '309cf2674ee7a0749978cf8265ab91a60aea0f7d'), ('swh:1:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f', 'rel', @@ -834,9 +863,7 @@ class SnapshotIdentifier(unittest.TestCase): 'dir', '1', '0b6959356d30f1a4e9b7f6bca59b9a336464c03d', { 'origin': 'deb://Debian/packages/linuxdoc-tools' - }), - ('swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;invalid;malformed', # noqa - 'dir', '1', '0b6959356d30f1a4e9b7f6bca59b9a336464c03d', {}) + }) ]: expected_result = { 'namespace': 'swh', @@ -847,3 +874,32 @@ class SnapshotIdentifier(unittest.TestCase): } actual_result = identifiers.parse_persistent_identifier(pid) self.assertEquals(actual_result, expected_result) + + def test_parse_persistent_identifier_parsing_error(self): + for pid, _error in [ + ('swh:1:cnt', + 'Wrong format: There should be 4 mandatory parameters'), + ('swh:1:', + 'Wrong format: There should be 4 mandatory parameters'), + ('swh:', + 'Wrong format: There should be 4 mandatory parameters'), + ('swh:1:cnt:', + 'Wrong format: Identifier should be present'), + 
('foo:1:cnt:abc8bc9d7a6bcf6db04f476d29314f157507d505', + 'Wrong format: Supported namespace is \'swh\''), + ('swh:2:dir:def8bc9d7a6bcf6db04f476d29314f157507d505', + 'Wrong format: Supported version is 1'), + ('swh:1:foo:fed8bc9d7a6bcf6db04f476d29314f157507d505', + 'Wrong format: Supported types are %s' % ( + ', '.join(PERSISTENT_IDENTIFIER_TYPES))), + ('swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;invalid;' + 'malformed', + 'Contextual data is badly formatted, form key=val expected'), + ('swh:1:snp:gh6959356d30f1a4e9b7f6bca59b9a336464c03d', + 'Wrong format: Identifier should be a valid hash'), + ('swh:1:snp:foo', + 'Wrong format: Identifier should be a valid hash') + ]: + with self.assertRaisesRegex( + ValidationError, _error): + identifiers.parse_persistent_identifier(pid) diff --git a/version.txt b/version.txt index c0bdf11c06ccb6f33f80e32ef7f296cd2080242e..8af3930e8c604e6de47bdf78d405936146a71cf7 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.23-0-g448eafa \ No newline at end of file +v0.0.24-0-g5eb055d \ No newline at end of file
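To exercise the new `swh-identify` console script end to end, one can drive it the same way the new test_cli.py does; a hedged sketch (the kernel path and expected identifier are the ones quoted in the command's own help text and are placeholders here):

```python
from click.testing import CliRunner

from swh.model import cli

runner = CliRunner()

# Equivalent to `swh-identify --type content <path>` on the command line.
result = runner.invoke(cli.identify, [
    '--type', 'content', '/usr/src/linux/kernel/sched/deadline.c'])
print(result.output.rstrip())
# e.g. swh:1:cnt:57b939c81bce5d06fa587df8915f05affbe22b82

# `--verify PID` prints "PID match"/"PID mismatch" and exits 0 or 1.
result = runner.invoke(cli.identify, [
    '--verify', 'swh:1:cnt:57b939c81bce5d06fa587df8915f05affbe22b82',
    '/usr/src/linux/kernel/sched/deadline.c'])
print(result.exit_code)
```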