Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • anlambert/swh-model
  • lunar/swh-model
  • franckbret/swh-model
  • douardda/swh-model
  • olasd/swh-model
  • swh/devel/swh-model
  • Alphare/swh-model
  • samplet/swh-model
  • marmoute/swh-model
  • rboyer/swh-model
10 results
Show changes
Commits on Source (415)
# Changes here will be overwritten by Copier
_commit: v0.3.3
_src_path: https://gitlab.softwareheritage.org/swh/devel/swh-py-template.git
description: Software Heritage data model
distribution_name: swh-model
have_cli: true
have_workers: false
package_root: swh/model
project_name: swh.model
python_minimal_version: '3.7'
readme_format: rst
# python: Reformat code with black
bf3f1cec8685c8f480ddd95027852f8caa10b8e3
4c39334b2aa9f782950aaee72781dc1df9d37550
5ff7c5b592ce1d76f5696a7f089680807ad557a6
*~
build
/.coverage
/.coverage.*
dist
*.egg-info/
*.pyc
.coverage
.eggs/
.hypothesis
*.pyc
__pycache__
.pytest_cache
*.sw?
.mypy_cache
.tox
version.txt
.mypy_cache/
__pycache__
build/
dist/
# these are symlinks created by a hook in swh-docs' main sphinx conf.py
docs/README.rst
docs/README.md
# this should be a symlink for people who want to build the sphinx doc
# without using tox, generally created by the swh-env/bin/update script
docs/Makefile.sphinx
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
- id: trailing-whitespace
- id: check-json
- id: check-yaml
- repo: https://github.com/python/black
rev: 25.1.0
hooks:
- id: black
- repo: https://github.com/PyCQA/isort
rev: 6.0.0
hooks:
- id: isort
- repo: https://github.com/pycqa/flake8
rev: 7.1.1
hooks:
- id: flake8
additional_dependencies: [flake8-bugbear==24.12.12, flake8-pyproject]
- repo: https://github.com/codespell-project/codespell
rev: v2.4.1
hooks:
- id: codespell
name: Check source code spelling
stages: [pre-commit]
args: [-L assertIn, -L anc]
- id: codespell
name: Check commit message spelling
stages: [commit-msg]
- repo: local
hooks:
- id: mypy
name: mypy
entry: mypy
args: [swh]
pass_filenames: false
language: system
types: [python]
- id: twine-check
name: twine check
description: call twine check when pushing an annotated release tag
entry: bash -c "ref=$(git describe) &&
[[ $ref =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]] &&
(python3 -m build --sdist && twine check $(ls -t dist/* | head -1)) || true"
pass_filenames: false
stages: [pre-push]
language: python
additional_dependencies: [twine, build]
......@@ -6,7 +6,7 @@ In the interest of fostering an open and welcoming environment, we as Software
Heritage contributors and maintainers pledge to making participation in our
project and our community a harassment-free experience for everyone, regardless
of age, body size, disability, ethnicity, sex characteristics, gender identity
and expression, level of experience, education, socio-economic status,
and expression, level of experience, education, socioeconomic status,
nationality, personal appearance, race, religion, or sexual identity and
orientation.
......
Daniele Serafini
Ishan Bhanuka
Antoine Cezar
Pierre-Yves David
include README.md
include Makefile
include requirements.txt
include requirements-swh.txt
include version.txt
recursive-include swh/model/tests/data *.tgz
swh-model
=========
Software Heritage - Data model
==============================
Implementation of the Data model of the Software Heritage project, used to
archive source code artifacts.
This module defines the notion of Persistent Identifier (PID) and provides
tools to compute them:
This module defines the notion of SoftWare Hash persistent IDentifiers
(SWHIDs) and provides tools to compute them:
.. code-block:: shell
```sh
$ swh-identify fork.c kmod.c sched/deadline.c
swh:1:cnt:2e391c754ae730bd2d8520c2ab497c403220c6e3 fork.c
swh:1:cnt:0277d1216f80ae1adeed84a686ed34c9b2931fc2 kmod.c
......@@ -15,4 +16,4 @@ tools to compute them:
$ swh-identify --no-filename /usr/src/linux/kernel/
swh:1:dir:f9f858a48d663b3809c9e2f336412717496202ab
```
......@@ -5,16 +5,17 @@
# --ignore-empty-folders
# 38f8d2c3a951f6b94007896d0981077e48bbd702
import click
import os
import click
from swh.model import from_disk, hashutil
def combine_filters(*filters):
"""Combine several ignore filters"""
if len(filters) == 0:
return from_disk.accept_all_directories
return from_disk.accept_all_paths
elif len(filters) == 1:
return filters[0]
......@@ -25,27 +26,24 @@ def combine_filters(*filters):
@click.command()
@click.option('--path', default='.',
help='Optional path to hash.')
@click.option('--ignore-empty-folder', is_flag=True, default=False,
help='Ignore empty folder.')
@click.option('--ignore', multiple=True,
help='Ignore pattern.')
@click.option("--path", default=".", help="Optional path to hash.")
@click.option(
"--ignore-empty-folder", is_flag=True, default=False, help="Ignore empty folder."
)
@click.option("--ignore", multiple=True, help="Ignore pattern.")
def main(path, ignore_empty_folder=False, ignore=None):
filters = []
if ignore_empty_folder:
filters.append(from_disk.ignore_empty_directories)
if ignore:
filters.append(
from_disk.ignore_named_directories(
[os.fsencode(name) for name in ignore]
)
from_disk.ignore_named_directories([os.fsencode(name) for name in ignore])
)
try:
d = from_disk.Directory.from_disk(path=os.fsencode(path),
dir_filter=combine_filters(*filters))
d = from_disk.Directory.from_disk(
path=os.fsencode(path), path_filter=combine_filters(*filters)
)
hash = d.hash
except Exception as e:
print(e)
......@@ -54,5 +52,5 @@ def main(path, ignore_empty_folder=False, ignore=None):
print(hashutil.hash_to_hex(hash))
if __name__ == '__main__':
if __name__ == "__main__":
main()
......@@ -11,21 +11,19 @@
import sys
from swh.model import identifiers, hashutil
from swh.model import hashutil, identifiers
def revhash(revision_raw):
"""Compute the revision hash.
"""
"""Compute the revision hash."""
# HACK: string have somehow their \n expanded to \\n
if b'\\n' in revision_raw:
revision_raw = revision_raw.replace(b'\\n', b'\n')
if b"\\n" in revision_raw:
revision_raw = revision_raw.replace(b"\\n", b"\n")
h = hashutil.hash_git_data(revision_raw, 'commit')
h = hashutil.hash_git_data(revision_raw, "commit")
return identifiers.identifier_to_str(h)
if __name__ == '__main__':
revision_raw = sys.argv[1].encode('utf-8')
if __name__ == "__main__":
revision_raw = sys.argv[1].encode("utf-8")
print(revhash(revision_raw))
include ../../swh-docs/Makefile.sphinx
include Makefile.sphinx
-include Makefile.local
Command-line interface
======================
.. click:: swh.model.cli:identify
:prog: swh identify
:nested: full
......@@ -74,8 +74,7 @@ synonyms.
**directories**
a list of named directory entries, each of which pointing to other artifacts,
usually file contents or sub-directories. Directory entries are also
associated to arbitrary metadata, which vary with technologies, but usually
includes permission bits, modification timestamps, etc.
associated to some metadata stored as permission bits.
**revisions** (AKA "commits")
software development within a specific project is essentially a time-indexed
......@@ -92,8 +91,8 @@ synonyms.
some revisions are more equal than others and get selected by developers as
denoting important project milestones known as "releases". Each release
points to the last commit in project history corresponding to the release and
might carry arbitrary metadata—e.g., release name and version, release
message, cryptographic signatures, etc.
carries metadata: release name and version, release message, cryptographic
signatures, etc.
Additionally, the following crawling-related information is stored as
......@@ -145,6 +144,11 @@ provenance information in the Software Heritage archive:
Software Heritage clock) the visit happened and the full snapshot of the
state of the software origin at the time.
.. note::
This model currently records visits as a single point in time. However, the
actual visit process is not instantaneous. Loaders can record successive
changes to the state of the visit, as their work progresses, as updates to
the visit object.
Data structure
--------------
......@@ -255,3 +259,39 @@ making emergent structures such as code reuse across different projects or
software origins, readily available. Further reinforcing the Software Heritage
use cases, this object could become a veritable "map of the stars" of our
entire software commons.
Extended data model
-------------------
In addition to the artifacts detailed above used to represent original software
artifacts, the Software Heritage archive stores information about these
artifacts.
**extid**
a relationship between an original identifier of an artifact, in its
native/upstream environment, and a `core SWHID <persistent-identifiers>`,
which is specific to Software Heritage. As such, it includes:
* the external identifier, stored as bytes whose format is opaque to the
data model
* a type (a simple name and a version), to identify the type of relationship
* the "target", which is a core SWHID
An extid may also include a "payload", which is arbitrary data about the
relationship. For example, an extid might link a directory to the
cryptographic hash of the tarball that originally contained it. In this
case, the payload could include data useful for reconstructing the
original tarball from the directory. The payload data is stored
separately. An extid refers to it by its ``sha1_git`` hash.
**raw extrinsic metadata**
an opaque bytestring, along with its format (a simple name), an identifier
of the object the metadata is about and in which context (similar to a
`qualified SWHID <persistent-identifiers>`), and provenance information
(the authority who provided it, the fetcher tool used to get it, and the
date it was discovered at).
It provides both a way to store information about an artifact contributed by
external entities, after the artifact was created, and an escape hatch to
store metadata that would not otherwise fit in the data model.
(last updated 2020-04-28)
Scheme name: swh
Status: Provisional
Applications/protocols that use this scheme name:
Software Heritage: https://www.softwareheritage.org/
Software Package Data Exchange: https://spdx.org/
NTIA: https://www.ntia.doc.gov/SoftwareTransparency
Identifiers.org: http://identifiers.org/
Name-to-Thing (N2T): https://n2t.net/
HAL: https://hal.archives-ouvertes.fr/
Contact: Stefano Zacchiroli <zack@upsilon.cc>
Change controller: Software Heritage <info@softwareheritage.org>
References:
Scheme specification: https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
The Software Heritage project: https://www.softwareheritage.org/
The Software Heritage archive: https://archive.softwareheritage.org/
Publications:
Roberto Di Cosmo, Morane Gruenpeter, Stefano Zacchiroli. Referencing Source
Code Artifacts: a Separate Concern in Software Citation. In Computing in
Science and Engineering, volume 22, issue 2, pp. 33-43. ISSN 1521-9615,
IEEE. March 2020. DOI 10.1109/MCSE.2019.2963148
Roberto Di Cosmo, Morane Gruenpeter, Stefano Zacchiroli. Identifiers for
Digital Objects: the Case of Software Source Code Preservation. In
proceedings of iPRES 2018: 15th International Conference on Digital
Preservation. September 2018. 10.17605/OSF.IO/KDE56
(file created 2020-04-28)
......@@ -6,12 +6,15 @@ BUILD_TARGETS += $(MERKLE_DAG)
all: $(BUILD_TARGETS)
%.svg: %.dia
inkscape -l $@ $<
dia -e $@ $<
%.pdf: %.dia
inkscape -A $@ $<
%.pdf: %.svg
set -e; if [ $$(inkscape --version 2>/dev/null | grep -Eo '[0-9]+' | head -1) -gt 0 ]; then \
inkscape -o $@ $< ; \
else \
inkscape -A $@ $< ; \
fi
clean:
-rm -f $(BUILD_TARGETS)
.. _swh-model:
Software Heritage - Data model
==============================
Implementation of the :ref:`data-model` to archive source code artifacts.
.. include:: README.rst
.. toctree::
:caption: Overview:
......@@ -11,4 +8,13 @@ Implementation of the :ref:`data-model` to archive source code artifacts.
data-model
persistent-identifiers
/apidoc/swh.model
cli
.. only:: standalone_package_doc
Indices and tables
------------------
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
.. _persistent-identifiers:
.. _swhids:
======================
Persistent identifiers
======================
=================================================
SoftWare Heritage persistent IDentifiers (SWHIDs)
=================================================
Description
===========
**version 1.6, last modified 2021-04-30**
You can point to objects present in the Software Heritage archive by the means
of **persistent identifiers** that are guaranteed to remain stable (persistent)
over time. Their syntax, meaning, and usage is described below. Note that they
are identifiers and not URLs, even though an URL-based resolver for Software
Heritage persistent identifiers is also provided.
.. contents::
:local:
:depth: 2
A persistent identifier can point to any software artifact (or "object")
available in the Software Heritage archive. Objects come in different types,
and most notably:
Overview
========
You can point to objects present in the `Software Heritage
<https://www.softwareheritage.org/>`_ `archive
<https://archive.softwareheritage.org/>`_ by the means of **SoftWare Heritage
persistent IDentifiers**, or **SWHIDs** for short, that are guaranteed to
remain stable (persistent) over time. Their syntax, meaning, and usage are
described below. Note that they are identifiers and not URLs, even though
URL-based `resolvers`_ for SWHIDs are also available.
A SWHID consists of two separate parts, a mandatory *core identifier* that can
point to any software artifact (or "object") available in the Software Heritage
archive, and an optional list of *qualifiers* that allow one to specify the
context where the object is meant to be seen and to point to a subpart of the
object itself.
Objects come in different types:
* contents
* directories
......@@ -24,52 +38,103 @@ and most notably:
* snapshots
Each object is identified by an intrinsic, type-specific object identifier that
is embedded in its persistent identifier as described below. Object identifiers
are strong cryptographic hashes computed on the entire set of object properties
to form a `Merkle structure <https://en.wikipedia.org/wiki/Merkle_tree>`_.
is embedded in its SWHID as described below. The intrinsic identifiers embedded
in SWHIDs are strong cryptographic hashes computed on the entire set of object
properties. Together, these identifiers form a `Merkle structure
<https://en.wikipedia.org/wiki/Merkle_tree>`_, specifically a Merkle `DAG
<https://en.wikipedia.org/wiki/Directed_acyclic_graph>`_.
See :ref:`data-model` for an overview of object types and how they are linked
together. See :py:mod:`swh.model.identifiers` for details on how intrinsic
object identifiers are computed.
See the :ref:`Software Heritage data model <data-model>` for an overview of
object types and how they are linked together. See
:py:mod:`swh.model.git_objects` for details on how the intrinsic identifiers
embedded in SWHIDs are computed.
The optional qualifiers are of two kinds:
* **context qualifiers:** carry information about the context where a given
object is meant to be seen. This is particularly important, as the same
object can be reached in the Merkle graph following different *paths*
starting from different nodes (or *anchors*), and it may have been retrieved
from different *origins*, that may evolve between different *visits*
* **fragment qualifiers:** allow pinpointing specific subparts of an object
.. _swhids-syntax:
Syntax
------
======
Syntactically, persistent identifiers are generated by the ``<identifier>``
entry point of the grammar:
Syntactically, SWHIDs are generated by the ``<identifier>`` entry point in the
following grammar:
.. code-block:: bnf
<identifier> ::= "swh" ":" <scheme_version> ":" <object_type> ":" <object_id> ;
<identifier> ::= <identifier_core> [ <qualifiers> ] ;
<identifier_core> ::= "swh" ":" <scheme_version> ":" <object_type> ":" <object_id> ;
<scheme_version> ::= "1" ;
<object_type> ::=
"ori" (* origin *)
| "snp" (* snapshot *)
"snp" (* snapshot *)
| "rel" (* release *)
| "rev" (* revision *)
| "dir" (* directory *)
| "cnt" (* content *)
;
<object_id> ::= 40 * <hex_digit> ; (* intrinsic object id, as hex-encoded SHA1 *)
<dec_digit> ::= "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
<dec_digit> ::= "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ;
<hex_digit> ::= <dec_digit> | "a" | "b" | "c" | "d" | "e" | "f" ;
<qualifiers> ::= ";" <qualifier> [ <qualifiers> ] ;
<qualifier> ::=
<context_qualifier>
| <fragment_qualifier>
;
<context_qualifier> ::=
<origin_ctxt>
| <visit_ctxt>
| <anchor_ctxt>
| <path_ctxt>
;
<origin_ctxt> ::= "origin" "=" <url_escaped> ;
<visit_ctxt> ::= "visit" "=" <identifier_core> ;
<anchor_ctxt> ::= "anchor" "=" <identifier_core> ;
<path_ctxt> ::= "path" "=" <path_absolute_escaped> ;
<fragment_qualifier> ::= "lines" "=" <line_number> ["-" <line_number>] ;
<line_number> ::= <dec_digit> + ;
<url_escaped> ::= (* RFC 3987 IRI *)
<path_absolute_escaped> ::= (* RFC 3987 absolute path *)
Where:
- ``<path_absolute_escaped>`` is an ``<ipath-absolute>`` from `RFC 3987`_, and
- ``<url_escaped>`` is a `RFC 3987`_ IRI
in either case all occurrences of ``;`` (and ``%``, as required by the RFC)
have been percent-encoded (as ``%3B`` and ``%25`` respectively). Other
characters *can* be percent-encoded, e.g., to improve readability and/or
embeddability of SWHID in other contexts.
.. _RFC 3987: https://tools.ietf.org/html/rfc3987
.. _swhids-semantics:
Semantics
---------
=========
.. _swhids-core:
``:`` is used as separator between the logical parts of identifiers. The
Core identifiers
----------------
``:`` is used as separator between the logical parts of core identifiers. The
``swh`` prefix makes explicit that these identifiers are related to *SoftWare
Heritage*. ``1`` (``<scheme_version>``) is the current version of this
identifier *scheme*; future editions will use higher version numbers, possibly
breaking backward compatibility (but without breaking the resolvability of
identifiers that conform to previous versions of the scheme).
identifier *scheme*. Future editions will use higher version numbers, possibly
breaking backward compatibility, but without breaking the resolvability of
SWHIDs that conform to previous versions of the scheme.
A persistent identifier points to a single object, whose type is explicitly
captured by ``<object_type>``:
A SWHID points to a single object, whose type is explicitly captured by
``<object_type>``:
* ``ori`` identifiers point to **origins**
* ``snp`` to **snapshots**,
* ``rel`` to **releases**,
* ``rev`` to **revisions**,
......@@ -80,144 +145,250 @@ The actual object pointed to is identified by the intrinsic identifier
``<object_id>``, which is a hex-encoded (using lowercase ASCII characters) SHA1
computed on the content and metadata of the object itself, as follows:
* for **origins**, intrinsic identifiers are computed as per
:py:func:`swh.model.identifiers.origin_identifier`
* for **snapshots**, intrinsic identifiers are computed as per
:py:func:`swh.model.identifiers.snapshot_identifier`
* for **snapshots**, intrinsic identifiers are SHA1 hashes of manifests computed as per
:py:func:`swh.model.git_objects.snapshot_git_object`
* for **releases**, as per
:py:func:`swh.model.identifiers.release_identifier`
:py:func:`swh.model.git_objects.release_git_object`
that produces the same result as a git release hash
* for **revisions**, as per
:py:func:`swh.model.identifiers.revision_identifier`
:py:func:`swh.model.git_objects.revision_git_object`
that produces the same result as a git commit hash
* for **directories**, as per
:py:func:`swh.model.identifiers.directory_identifier`
* for **directories**, per
:py:func:`swh.model.git_objects.directory_git_object`
that produces the same result as a git tree hash
* for **contents**, the intrinsic identifier is the ``sha1_git`` hash of the
multiple hashes returned by
:py:func:`swh.model.identifiers.content_identifier`, i.e., the SHA1 of a byte
* for **contents**, the intrinsic identifier is the ``sha1_git`` hash returned by
:py:meth:`swh.hashutil.MultiHash.digest`, i.e., the SHA1 of a byte
sequence obtained by juxtaposing the ASCII string ``"blob"`` (without
quotes), a space, the length of the content as decimal digits, a NULL byte,
and the actual content of the file.
.. _swhids-qualifiers:
Qualifiers
----------
``;`` is used as separator between the core identifier and the optional
qualifiers, as well as between qualifiers. Each qualifier is specified as a
key/value pair, using ``=`` as a separator.
The following *context qualifiers* are available:
* **origin:** the *software origin* where an object has been found or observed
in the wild, as a URI;
* **visit:** the core identifier of a *snapshot* corresponding to a specific
*visit* of a repository containing the designated object;
* **anchor:** a *designated node* in the Merkle DAG relative to which a *path
to the object* is specified, as the core identifier of a directory, a
revision, a release or a snapshot;
* **path:** the *absolute file path*, from the *root directory* associated to
the *anchor node*, to the object; when the anchor denotes a directory or a
revision, and almost always when it's a release, the root directory is
uniquely determined; when the anchor denotes a snapshot, the root directory
is the one pointed to by ``HEAD`` (possibly indirectly), and undefined if
such a reference is missing;
The following *fragment qualifier* is available:
* **lines:** *line number(s)* of interest, usually within a content object
We recommend equipping identifiers meant to be shared with as many qualifiers as
possible. While qualifiers may be listed in any order, it is good practice to
present them in the order given above, i.e., ``origin``, ``visit``, ``anchor``,
``path``, ``lines``. Redundant information should be omitted: for example, if
the *visit* is present, and the *path* is relative to the snapshot indicated
there, then the *anchor* qualifier is superfluous; similarly, if the *path* is
empty, it may be omitted.
Interoperability
================
URI scheme
----------
The ``swh`` URI scheme is registered at IANA for SWHIDs. The present document
constitutes the scheme specification for this URI scheme.
Git compatibility
~~~~~~~~~~~~~~~~~
-----------------
Intrinsic object identifiers for contents, directories, revisions, and releases
are, at present, compatible with the `Git <https://git-scm.com/>`_ way of
`computing identifiers
SWHIDs for contents, directories, revisions, and releases are, at present,
compatible with the `Git <https://git-scm.com/>`_ way of `computing identifiers
<https://git-scm.com/book/en/v2/Git-Internals-Git-Objects>`_ for its objects.
A Software Heritage content identifier will be identical to a Git blob
identifier of any file with the same content, a Software Heritage revision
identifier will be identical to the corresponding Git commit identifier, etc.
This is not the case for snapshot identifiers as Git doesn't have a
corresponding object type.
The ``<object_id>`` part of a SWHID for a content object is the Git blob
identifier of any file with the same content; for a revision it is the Git
commit identifier for the same revision, etc. This is not the case for
snapshot identifiers, as Git does not have a corresponding object type.
Note that Git compatibility is incidental and is not guaranteed to be
maintained in future versions of this scheme (or Git).
Automatically fixing invalid SWHIDs
-----------------------------------
User interfaces may fix invalid SWHIDs, by lower-casing the
``<identifier_core>`` part of a SWHID, if it contains upper-case letters
because of user errors or limitations in software displaying SWHIDs.
However, implementations displaying or generating SWHIDs should not rely
on this behavior, and must display or generate only valid SWHIDs when
technically possible.
User interfaces should show an error when such an automatic fix occurs,
so users have a chance to fix their SWHID before pasting it to another interface
that does not perform the same corrections.
This also makes it easier to understand issues when a case-sensitive
qualifier has its casing altered.
Examples
--------
========
Core identifiers
----------------
* ``swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2`` points to the content
of a file containing the full text of the GPL3 license
* ``swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505`` points to a directory
containing the source code of the Darktable photography application as it was
at some point on 4 May 2017
* ``swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d`` points to a commit in
the development history of Darktable, dated 16 January 2017, that added
undo/redo support for masks
* ``swh:1:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f`` points to Darktable
release 2.3.0, dated 24 December 2016
* ``swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453`` points to a snapshot
of the entire Darktable Git repository taken on 4 May 2017 from GitHub
* ``swh:1:ori:b63a575fe3faab7692c9f38fb09d4bb45651bb0f`` points to the
repository https://github.com/torvalds/linux .
Contextual information
======================
It is often useful to complement persistent identifiers with **contextual
information** about where the identified object has been found as well as which
specific parts of it are of interest. To that end it is possible, via a
dedicated syntax, to extend persistent identifiers with the following pieces of
information:
Identifiers with qualifiers
---------------------------
* the **software origin** where an object has been found/observed
* the **line number(s)** of interest, usually within a content object
* The following :swh_web:`SWHID
<swh:1:cnt:4d99d2d18326621ccdd70f5ea66c2e2ac236ad8b;origin=https://gitorious.org/ocamlp3l/ocamlp3l_cvs.git;visit=swh:1:snp:d7f1b9eb7ccb596c2622c4780febaa02549830f9;anchor=swh:1:rev:2db189928c94d62a3b4757b3eec68f0a4d4113f0;path=/Examples/SimpleFarm/simplefarm.ml;lines=9-15>`
denotes the lines 9 to 15 of a file content that can be found at absolute
path ``/Examples/SimpleFarm/simplefarm.ml`` from the root directory of the
revision ``swh:1:rev:2db189928c94d62a3b4757b3eec68f0a4d4113f0`` that is
contained in the snapshot
``swh:1:snp:d7f1b9eb7ccb596c2622c4780febaa02549830f9`` taken from the origin
``https://gitorious.org/ocamlp3l/ocamlp3l_cvs.git``::
swh:1:cnt:4d99d2d18326621ccdd70f5ea66c2e2ac236ad8b;
origin=https://gitorious.org/ocamlp3l/ocamlp3l_cvs.git;
visit=swh:1:snp:d7f1b9eb7ccb596c2622c4780febaa02549830f9;
anchor=swh:1:rev:2db189928c94d62a3b4757b3eec68f0a4d4113f0;
path=/Examples/SimpleFarm/simplefarm.ml;
lines=9-15
Syntax
------
* Here is an example of a :swh_web:`SWHID
<swh:1:cnt:f10371aa7b8ccabca8479196d6cd640676fd4a04;origin=https://github.com/web-platform-tests/wpt;visit=swh:1:snp:b37d435721bbd450624165f334724e3585346499;anchor=swh:1:rev:259d0612af038d14f2cd889a14a3adb6c9e96d96;path=/html/semantics/document-metadata/the-meta-element/pragma-directives/attr-meta-http-equiv-refresh/support/x%253Burl=foo/>`
with a file path that requires percent-escaping::
The full-syntax to complement identifiers with contextual information is given
by the ``<identifier_with_context>`` entry point of the grammar:
swh:1:cnt:f10371aa7b8ccabca8479196d6cd640676fd4a04;
origin=https://github.com/web-platform-tests/wpt;
visit=swh:1:snp:b37d435721bbd450624165f334724e3585346499;
anchor=swh:1:rev:259d0612af038d14f2cd889a14a3adb6c9e96d96;
path=/html/semantics/document-metadata/the-meta-element/pragma-directives/attr-meta-http-equiv-refresh/support/x%3Burl=foo/
.. code-block:: bnf
<identifier_with_context> ::= <identifier> [<lines_ctxt>] [<origin_ctxt>]
<lines_ctxt> ::= ";" "lines" "=" <line_number> ["-" <line_number>]
<origin_ctxt> ::= ";" "origin" "=" <url>
<line_number> ::= <dec_digit> +
<url> ::= (* RFC 3986 compliant URLs *)
Implementation
==============
Semantics
Computing
---------
``;`` is used as separator between persistent identifiers and additional
optional contextual information. Each piece of contextual information is
specified as a key/value pair, using ``=`` as a separator.
An important property of any SWHID is that its core identifier is *intrinsic*:
it can be *computed from the object itself*, without having to rely on any
third party. An implementation of SWHID that allows doing so locally is the
`swh identify <https://docs.softwareheritage.org/devel/swh-model/cli.html>`_
tool, available from the `swh.model <https://pypi.org/project/swh.model/>`_
Python package under the GPL license. This package can be installed via the ``pip``
package manager with the one liner ``pip3 install swh.model[cli]`` on any machine with
Python (at least version 3.7) and ``pip`` installed (on a Debian or Ubuntu system a simple ``apt install python3 python3-pip``
will suffice, see `the general instructions <https://packaging.python.org/tutorials/installing-packages/>`_ for other platforms).
The following pieces of contextual information are supported:
SWHIDs are also automatically computed by Software Heritage for all archived
objects as part of its archival activity, and can be looked up via the project
:swh_web:`Web interface <>`.
* line numbers: it is possible to specify a single line number or a line range,
separating two numbers with ``-``. Note that line numbers are purely
indicative and are not meant to be stable, as in some degenerate cases
(e.g., text files which mix different types of line terminators) it is
impossible to resolve them unambiguously.
This has various practical implications:
* software origin: where a given object has been found or observed in the wild,
as the URI that was used by Software Heritage to ingest the object into the
archive
* when a software artifact is obtained from Software Heritage by resolving a
SWHID, it is straightforward to verify that it is exactly the intended one:
just compute the core identifier from the artifact itself, and check that it
is the same as the core identifier part of the SWHID
* the core identifier of a software artifact can be computed *before* its
archival on Software Heritage
Resolution
==========
Choosing what type of SWHID to use
----------------------------------
``swh:1:dir:`` SWHIDs are the most robust SWHIDs, as they can be recomputed from
the simplest objects (a directory structure on a filesystem), even when all
metadata is lost, without relying on the Software Heritage archive.
Therefore, we advise implementers and users to prefer this type of SWHID
over ``swh:1:rev:`` and ``swh:1:rel:`` to reference source code artifacts.
Dedicated resolvers
-------------------
However, since keeping the metadata is also important, you should add an anchor
qualifier to ``swh:1:dir:`` SWHIDs whenever possible, so the metadata stored
in the Software Heritage archive can be retrieved when needed.
This means, for example, that you should prefer
``swh:1:dir:a8eded6a2d062c998ba2dcc3dcb0ce68a4e15a58;anchor=swh:1:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f``
over ``swh:1:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f``.
Resolvers
---------
Persistent identifiers can be resolved using the Software Heritage Web
application (see :py:mod:`swh.web`). In particular, the **root endpoint**
``/`` can be given a persistent identifier and will lead to the browsing page
of the corresponding object, like this:
Software Heritage resolver
~~~~~~~~~~~~~~~~~~~~~~~~~~
SWHIDs can be resolved using the Software Heritage :swh_web:`Web interface <>`.
In particular, the **root endpoint**
``/`` can be given a SWHID and will lead to the browsing page of the
corresponding object, like this:
``https://archive.softwareheritage.org/<identifier>``.
A **dedicated** ``/resolve`` **endpoint** of the HTTP API is also available to
explicitly request persistent identifier resolution; see:
:http:get:`/api/1/resolve/(swh_id)/`.
A **dedicated** ``/resolve`` **endpoint** of the Software Heritage :swh_web:`Web API
<api/>` is also available to
programmatically resolve SWHIDs; see: :http:get:`/api/1/resolve/(swhid)/`.
Examples:
* `<https://archive.softwareheritage.org/swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2>`_
* `<https://archive.softwareheritage.org/swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505>`_
* `<https://archive.softwareheritage.org/api/1/resolve/swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d>`_
* `<https://archive.softwareheritage.org/api/1/resolve/swh:1:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f>`_
* `<https://archive.softwareheritage.org/api/1/resolve/swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453>`_
* :swh_web:`swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2`
* :swh_web:`swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505`
* :swh_web:`api/1/resolve/swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d`
* :swh_web:`api/1/resolve/swh:1:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f`
* :swh_web:`api/1/resolve/swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453`
* :swh_web:`swh:1:cnt:4d99d2d18326621ccdd70f5ea66c2e2ac236ad8b;origin=https://gitorious.org/ocamlp3l/ocamlp3l_cvs.git;visit=swh:1:snp:d7f1b9eb7ccb596c2622c4780febaa02549830f9;anchor=swh:1:rev:2db189928c94d62a3b4757b3eec68f0a4d4113f0;path=/Examples/SimpleFarm/simplefarm.ml;lines=9-15`
* :swh_web:`swh:1:cnt:f10371aa7b8ccabca8479196d6cd640676fd4a04;origin=https://github.com/web-platform-tests/wpt;visit=swh:1:snp:b37d435721bbd450624165f334724e3585346499;anchor=swh:1:rev:259d0612af038d14f2cd889a14a3adb6c9e96d96;path=/html/semantics/document-metadata/the-meta-element/pragma-directives/attr-meta-http-equiv-refresh/support/x%253Burl=foo/`
External resolvers
------------------
Third-party resolvers
~~~~~~~~~~~~~~~~~~~~~
The following **independent resolvers** support resolution of Software
Heritage persistent identifiers:
The following **third party resolvers** support SWHID resolution:
* `Identifiers.org <https://identifiers.org>`_; see:
`<http://identifiers.org/swh/>`_ (registry identifier `MIR:00000655
......@@ -225,6 +396,10 @@ Heritage persistent identifiers:
* `Name-to-Thing (N2T) <https://n2t.net/>`_
Note that resolution via Identifiers.org currently only supports *core
identifiers* due to `syntactic incompatibilities with qualifiers
<http://identifiers.org/documentation#custom_requests>`_.
Examples:
* `<https://identifiers.org/swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2>`_
......@@ -232,10 +407,8 @@ Examples:
* `<https://identifiers.org/swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d>`_
* `<https://n2t.net/swh:1:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f>`_
* `<https://n2t.net/swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453>`_
Note that resolution via Identifiers.org does not support contextual
information, due to `syntactic incompatibilities
<http://identifiers.org/documentation#custom_requests>`_.
* `<https://n2t.net/swh:1:cnt:4d99d2d18326621ccdd70f5ea66c2e2ac236ad8b;origin=https://gitorious.org/ocamlp3l/ocamlp3l_cvs.git;visit=swh:1:snp:d7f1b9eb7ccb596c2622c4780febaa02549830f9;anchor=swh:1:rev:2db189928c94d62a3b4757b3eec68f0a4d4113f0;path=/Examples/SimpleFarm/simplefarm.ml;lines=9-15>`_
* `<https://n2t.net/swh:1:cnt:f10371aa7b8ccabca8479196d6cd640676fd4a04;origin=https://github.com/web-platform-tests/wpt;visit=swh:1:snp:b37d435721bbd450624165f334724e3585346499;anchor=swh:1:rev:259d0612af038d14f2cd889a14a3adb6c9e96d96;path=/html/semantics/document-metadata/the-meta-element/pragma-directives/attr-meta-http-equiv-refresh/support/x%25253Burl=foo/>`_
References
......@@ -246,3 +419,9 @@ References
<https://hal.archives-ouvertes.fr/hal-01865790v4>`_. In Proceedings of `iPRES
2018 <https://ipres2018.org/>`_: 15th International Conference on Digital
Preservation, Boston, MA, USA, September 2018, 9 pages.
* Roberto Di Cosmo, Morane Gruenpeter, Stefano Zacchiroli. `Referencing Source
Code Artifacts: a Separate Concern in Software Citation
<https://arxiv.org/abs/2001.08647>`_. In Computing in Science and
Engineering, volume 22, issue 2, pages 33-43. ISSN 1521-9615,
IEEE. March 2020.
# Global mypy settings
[mypy]
warn_unused_ignores = True
namespace_packages = True
# 3rd party libraries without stubs (yet), grouped in a single section using
# comma-separated module patterns
[mypy-pkg_resources.*,pyblake2.*,pytest.*]
ignore_missing_imports = True
# Static package metadata
[project]
name = "swh.model"
description = "Software Heritage data model"
readme = { file = "README.rst", content-type = "text/x-rst" }
requires-python = ">=3.7"
authors = [
    { name = "Software Heritage developers", email = "swh-devel@inria.fr" },
]
classifiers = [
    "Programming Language :: Python :: 3",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
    "Operating System :: OS Independent",
    "Development Status :: 5 - Production/Stable",
]
# These fields have no static value here: they are resolved at build time
# (see [tool.setuptools.dynamic] and [tool.setuptools_scm])
dynamic = ["version", "dependencies", "optional-dependencies"]
# Package discovery is restricted to names matching "swh.*"
[tool.setuptools.packages.find]
include = ["swh.*"]
# Values for the fields listed as "dynamic" in [project]
[tool.setuptools.dynamic]
dependencies = { file = ["requirements.txt"] }
# One entry per extra, each one read from its requirements file(s)
[tool.setuptools.dynamic.optional-dependencies]
cli = { file = "requirements-cli.txt" }
testing_minimal = { file = "requirements-test.txt" }
testing = { file = ["requirements-cli.txt", "requirements-test.txt"] }
# Console scripts belong in [project.scripts]: the pyproject.toml
# specification forbids a [project.entry-points.console_scripts] table and
# requires build backends to reject it
[project.scripts]
"swh.identify" = "swh.model.cli:identify"
# Entry point group "swh.cli.subcommands"; presumably used by the `swh`
# command line tool to discover subcommands -- confirm against its loader
[project.entry-points."swh.cli.subcommands"]
"swh.model" = "swh.model.cli"
# Project links published in the package metadata
[project.urls]
Homepage = "https://gitlab.softwareheritage.org/swh/devel/swh-model"
# Quoted because the label contains a space
"Bug Reports" = "https://gitlab.softwareheritage.org/swh/devel/swh-model/-/issues"
Funding = "https://www.softwareheritage.org/donate"
Documentation = "https://docs.softwareheritage.org/devel/swh-model/"
Source = "https://gitlab.softwareheritage.org/swh/devel/swh-model.git"
# Build backend and the packages needed to run it
[build-system]
build-backend = "setuptools.build_meta"
requires = [
    "setuptools",
    "setuptools-scm",
]
[tool.setuptools_scm]
# Version used when none can be detected from the source control metadata
fallback_version = "0.0.1"
[tool.black]
# Python versions that black's output must remain compatible with
target-version = [
    "py39",
    "py310",
    "py311",
    "py312",
]
[tool.isort]
# Import wrapping: parenthesized, one name per line with a trailing comma
# (multi_line_output = 3), within the same 88 columns as flake8's
# max-line-length
line_length = 88
multi_line_output = 3
use_parentheses = true
include_trailing_comma = true
force_grid_wrap = 0
ensure_newline_before_comments = true
# Import ordering
force_sort_within_sections = true
known_first_party = ["swh"]
[tool.mypy]
warn_unused_ignores = true
namespace_packages = true
# explicit_package_bases is needed for mypy to detect py.typed from swh
# packages installed in editable mode
explicit_package_bases = true
plugins = []
# Template for 3rd party libraries without stubs (yet): uncomment the lines
# below and list the affected modules
# [[tool.mypy.overrides]]
# module = [
#     "package1.*",
#     "package2.*",
# ]
# ignore_missing_imports = true
[tool.flake8]
max-line-length = 88
select = ["C", "E", "F", "W", "B950"]
ignore = [
    # whitespaces before ':' <https://github.com/psf/black/issues/315>
    "E203",
    # missing whitespace after ','
    "E231",
    # line too long, use B950 warning from flake8-bugbear instead
    "E501",
    # line break before binary operator <https://github.com/psf/black/issues/52>
    "W503",
]
[tool.pytest.ini_options]
# Also collect doctests from modules; the pytest_swh_core plugin is disabled
addopts = "--doctest-modules -p no:pytest_swh_core"
# Directories skipped during test collection
norecursedirs = "build docs .*"
consider_namespace_packages = true
asyncio_mode = "strict"
# Custom markers must be declared here
markers = [
    "requires_optional_deps: tests in test_cli.py that should not run if optional dependencies are not installed",
]
[pytest]
addopts = --doctest-modules
norecursedirs = docs