Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • anlambert/swh-model
  • lunar/swh-model
  • franckbret/swh-model
  • douardda/swh-model
  • olasd/swh-model
  • swh/devel/swh-model
  • Alphare/swh-model
  • samplet/swh-model
  • marmoute/swh-model
  • rboyer/swh-model
10 results
Show changes
Commits on Source (435)
# Changes here will be overwritten by Copier
_commit: v0.3.3
_src_path: https://gitlab.softwareheritage.org/swh/devel/swh-py-template.git
description: Software Heritage data model
distribution_name: swh-model
have_cli: true
have_workers: false
package_root: swh/model
project_name: swh.model
python_minimal_version: '3.7'
readme_format: rst
# python: Reformat code with black
bf3f1cec8685c8f480ddd95027852f8caa10b8e3
4c39334b2aa9f782950aaee72781dc1df9d37550
5ff7c5b592ce1d76f5696a7f089680807ad557a6
*~
build
/.coverage
/.coverage.*
dist
*.egg-info/
*.pyc
.coverage
.eggs/
.hypothesis
*.pyc
__pycache__
.pytest_cache
*.sw?
.mypy_cache
.tox
version.txt
__pycache__
build/
dist/
# these are symlinks created by a hook in swh-docs' main sphinx conf.py
docs/README.rst
docs/README.md
# this should be a symlink for people who want to build the sphinx doc
# without using tox, generally created by the swh-env/bin/update script
docs/Makefile.sphinx
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
- id: trailing-whitespace
- id: check-json
- id: check-yaml
- repo: https://github.com/python/black
rev: 25.1.0
hooks:
- id: black
- repo: https://github.com/PyCQA/isort
rev: 6.0.0
hooks:
- id: isort
- repo: https://github.com/pycqa/flake8
rev: 7.1.1
hooks:
- id: flake8
additional_dependencies: [flake8-bugbear==24.12.12, flake8-pyproject]
- repo: https://github.com/codespell-project/codespell
rev: v2.4.1
hooks:
- id: codespell
name: Check source code spelling
stages: [pre-commit]
args: [-L assertIn, -L anc]
- id: codespell
name: Check commit message spelling
stages: [commit-msg]
- repo: local
hooks:
- id: mypy
name: mypy
entry: mypy
args: [swh]
pass_filenames: false
language: system
types: [python]
- id: twine-check
name: twine check
description: call twine check when pushing an annotated release tag
entry: bash -c "ref=$(git describe) &&
[[ $ref =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]] &&
(python3 -m build --sdist && twine check $(ls -t dist/* | head -1)) || true"
pass_filenames: false
stages: [pre-push]
language: python
additional_dependencies: [twine, build]
# Software Heritage Code of Conduct
## Our Pledge
In the interest of fostering an open and welcoming environment, we as Software
Heritage contributors and maintainers pledge to making participation in our
project and our community a harassment-free experience for everyone, regardless
of age, body size, disability, ethnicity, sex characteristics, gender identity
and expression, level of experience, education, socioeconomic status,
nationality, personal appearance, race, religion, or sexual identity and
orientation.
## Our Standards
Examples of behavior that contributes to creating a positive environment
include:
* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members
Examples of unacceptable behavior by participants include:
* The use of sexualized language or imagery and unwelcome sexual attention or
advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Our Responsibilities
Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.
Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.
## Scope
This Code of Conduct applies within all project spaces, and it also applies when
an individual is representing the project or its community in public spaces.
Examples of representing a project or community include using an official
project e-mail address, posting via an official social media account, or acting
as an appointed representative at an online or offline event. Representation of
a project may be further defined and clarified by project maintainers.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at `conduct@softwareheritage.org`. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an
incident. Further details of specific enforcement policies may be posted
separately.
Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
[homepage]: https://www.contributor-covenant.org
For answers to common questions about this code of conduct, see
https://www.contributor-covenant.org/faq
Daniele Serafini
Ishan Bhanuka
Antoine Cezar
Pierre-Yves David
include README.md
include Makefile
include requirements.txt
include requirements-swh.txt
include version.txt
recursive-include swh/model/tests/data *.tgz
swh-model
=========
Software Heritage - Data model
==============================
Implementation of the Data model of the Software Heritage project, used to
archive source code artifacts.
This module defines the notion of Persistent Identifier (PID) and provides
tools to compute them:
This module defines the notion of SoftWare Hash persistent IDentifiers
(SWHIDs) and provides tools to compute them:
.. code-block:: shell
```sh
$ swh-identify fork.c kmod.c sched/deadline.c
swh:1:cnt:2e391c754ae730bd2d8520c2ab497c403220c6e3 fork.c
swh:1:cnt:0277d1216f80ae1adeed84a686ed34c9b2931fc2 kmod.c
......@@ -15,4 +16,4 @@ tools to compute them:
$ swh-identify --no-filename /usr/src/linux/kernel/
swh:1:dir:f9f858a48d663b3809c9e2f336412717496202ab
```
......@@ -5,16 +5,17 @@
# --ignore-empty-folders
# 38f8d2c3a951f6b94007896d0981077e48bbd702
import click
import os
import click
from swh.model import from_disk, hashutil
def combine_filters(*filters):
"""Combine several ignore filters"""
if len(filters) == 0:
return from_disk.accept_all_directories
return from_disk.accept_all_paths
elif len(filters) == 1:
return filters[0]
......@@ -25,27 +26,24 @@ def combine_filters(*filters):
@click.command()
@click.option('--path', default='.',
help='Optional path to hash.')
@click.option('--ignore-empty-folder', is_flag=True, default=False,
help='Ignore empty folder.')
@click.option('--ignore', multiple=True,
help='Ignore pattern.')
@click.option("--path", default=".", help="Optional path to hash.")
@click.option(
"--ignore-empty-folder", is_flag=True, default=False, help="Ignore empty folder."
)
@click.option("--ignore", multiple=True, help="Ignore pattern.")
def main(path, ignore_empty_folder=False, ignore=None):
filters = []
if ignore_empty_folder:
filters.append(from_disk.ignore_empty_directories)
if ignore:
filters.append(
from_disk.ignore_named_directories(
[os.fsencode(name) for name in ignore]
)
from_disk.ignore_named_directories([os.fsencode(name) for name in ignore])
)
try:
d = from_disk.Directory.from_disk(path=os.fsencode(path),
dir_filter=combine_filters(*filters))
d = from_disk.Directory.from_disk(
path=os.fsencode(path), path_filter=combine_filters(*filters)
)
hash = d.hash
except Exception as e:
print(e)
......@@ -54,5 +52,5 @@ def main(path, ignore_empty_folder=False, ignore=None):
print(hashutil.hash_to_hex(hash))
if __name__ == '__main__':
if __name__ == "__main__":
main()
......@@ -11,21 +11,19 @@
import sys
from swh.model import identifiers, hashutil
from swh.model import hashutil, identifiers
def revhash(revision_raw):
"""Compute the revision hash.
"""Compute the revision hash."""
# HACK: string have somehow their \n expanded to \\n
if b"\\n" in revision_raw:
revision_raw = revision_raw.replace(b"\\n", b"\n")
"""
if b'\\n' in revision_raw: # HACK: string have somehow their \n
# expanded to \\n
revision_raw = revision_raw.replace(b'\\n', b'\n')
h = hashutil.hash_git_data(revision_raw, 'commit')
h = hashutil.hash_git_data(revision_raw, "commit")
return identifiers.identifier_to_str(h)
if __name__ == '__main__':
revision_raw = sys.argv[1].encode('utf-8')
if __name__ == "__main__":
revision_raw = sys.argv[1].encode("utf-8")
print(revhash(revision_raw))
include ../../swh-docs/Makefile.sphinx
include Makefile.sphinx
-include Makefile.local
Command-line interface
======================
.. click:: swh.model.cli:identify
:prog: swh identify
:nested: full
:orphan:
.. _data-model:
Data model
......@@ -76,8 +74,7 @@ synonyms.
**directories**
a list of named directory entries, each of which pointing to other artifacts,
usually file contents or sub-directories. Directory entries are also
associated to arbitrary metadata, which vary with technologies, but usually
includes permission bits, modification timestamps, etc.
associated to some metadata stored as permission bits.
**revisions** (AKA "commits")
software development within a specific project is essentially a time-indexed
......@@ -94,8 +91,8 @@ synonyms.
some revisions are more equal than others and get selected by developers as
denoting important project milestones known as "releases". Each release
points to the last commit in project history corresponding to the release and
might carry arbitrary metadata—e.g., release name and version, release
message, cryptographic signatures, etc.
carries metadata: release name and version, release message, cryptographic
signatures, etc.
Additionally, the following crawling-related information is stored as
......@@ -147,6 +144,11 @@ provenance information in the Software Heritage archive:
Software Heritage clock) the visit happened and the full snapshot of the
state of the software origin at the time.
.. note::
This model currently records visits as a single point in time. However, the
actual visit process is not instantaneous. Loaders can record successive
changes to the state of the visit, as their work progresses, as updates to
the visit object.
Data structure
--------------
......@@ -257,3 +259,39 @@ making emergent structures such as code reuse across different projects or
software origins, readily available. Further reinforcing the Software Heritage
use cases, this object could become a veritable "map of the stars" of our
entire software commons.
Extended data model
-------------------
In addition to the artifacts detailed above used to represent original software
artifacts, the Software Heritage archive stores information about these
artifacts.
**extid**
a relationship between an original identifier of an artifact, in its
native/upstream environment, and a :ref:`core SWHID <persistent-identifiers>`,
which is specific to Software Heritage. As such, it includes:
* the external identifier, stored as bytes whose format is opaque to the
data model
* a type (a simple name and a version), to identify the type of relationship
* the "target", which is a core SWHID
An extid may also include a "payload", which is arbitrary data about the
relationship. For example, an extid might link a directory to the
cryptographic hash of the tarball that originally contained it. In this
case, the payload could include data useful for reconstructing the
original tarball from the directory. The payload data is stored
separately. An extid refers to it by its ``sha1_git`` hash.
**raw extrinsic metadata**
an opaque bytestring, along with its format (a simple name), an identifier
of the object the metadata is about and in which context (similar to a
:ref:`qualified SWHID <persistent-identifiers>`), and provenance information
(the authority who provided it, the fetcher tool used to get it, and the
date it was discovered at).
It provides both a way to store information about an artifact contributed by
external entities, after the artifact was created, and an escape hatch to
store metadata that would not otherwise fit in the data model.
(last updated 2020-04-28)
Scheme name: swh
Status: Provisional
Applications/protocols that use this scheme name:
Software Heritage: https://www.softwareheritage.org/
Software Package Data Exchange: https://spdx.org/
NTIA: https://www.ntia.doc.gov/SoftwareTransparency
Identifiers.org: http://identifiers.org/
Name-to-Thing (N2T): https://n2t.net/
HAL: https://hal.archives-ouvertes.fr/
Contact: Stefano Zacchiroli <zack@upsilon.cc>
Change controller: Software Heritage <info@softwareheritage.org>
References:
Scheme specification: https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
The Software Heritage project: https://www.softwareheritage.org/
The Software Heritage archive: https://archive.softwareheritage.org/
Publications:
Roberto Di Cosmo, Morane Gruenpeter, Stefano Zacchiroli. Referencing Source
Code Artifacts: a Separate Concern in Software Citation. In Computing in
Science and Engineering, volume 22, issue 2, pp. 33-43. ISSN 1521-9615,
IEEE. March 2020. DOI 10.1109/MCSE.2019.2963148
Roberto Di Cosmo, Morane Gruenpeter, Stefano Zacchiroli. Identifiers for
Digital Objects: the Case of Software Source Code Preservation. In
proceedings of iPRES 2018: 15th International Conference on Digital
Preservation. September 2018. 10.17605/OSF.IO/KDE56
(file created 2020-04-28)
......@@ -6,12 +6,15 @@ BUILD_TARGETS += $(MERKLE_DAG)
all: $(BUILD_TARGETS)
%.svg: %.dia
inkscape -l $@ $<
dia -e $@ $<
%.pdf: %.dia
inkscape -A $@ $<
%.pdf: %.svg
set -e; if [ $$(inkscape --version 2>/dev/null | grep -Eo '[0-9]+' | head -1) -gt 0 ]; then \
inkscape -o $@ $< ; \
else \
inkscape -A $@ $< ; \
fi
clean:
-rm -f $(BUILD_TARGETS)
.. _swh-model:
Software Heritage - Data model
==============================
.. include:: README.rst
Implementation of the :ref:`data-model` to archive source code artifacts.
Overview
--------
* :ref:`data-model`
* :ref:`persistent-identifiers`
.. toctree::
:caption: Overview:
:titlesonly:
data-model
persistent-identifiers
cli
Reference Documentation
-----------------------
.. only:: standalone_package_doc
.. toctree::
:maxdepth: 2
Indices and tables
------------------
/apidoc/swh.model
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
This diff is collapsed.
[project]
name = "swh.model"
authors = [
{name="Software Heritage developers", email="swh-devel@inria.fr"},
]
description = "Software Heritage data model"
readme = {file = "README.rst", content-type = "text/x-rst"}
requires-python = ">=3.7"
classifiers = [
"Programming Language :: Python :: 3",
"Intended Audience :: Developers",
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
"Operating System :: OS Independent",
"Development Status :: 5 - Production/Stable",
]
dynamic = ["version", "dependencies", "optional-dependencies"]
[tool.setuptools.packages.find]
include = ["swh.*"]
[tool.setuptools.dynamic]
dependencies = {file = ["requirements.txt"]}
[tool.setuptools.dynamic.optional-dependencies]
cli = {file = "requirements-cli.txt"}
testing = {file = ["requirements-cli.txt", "requirements-test.txt"]}
testing_minimal = {file = "requirements-test.txt"}
# Console commands are declared in [project.scripts]: the pyproject.toml
# specification reserves the "console_scripts" entry-point group for that
# table and does not allow it to be spelled out under [project.entry-points].
# The key stays quoted because the command name contains a dot.
[project.scripts]
"swh.identify" = "swh.model.cli:identify"
[project.entry-points."swh.cli.subcommands"]
"swh.model" = "swh.model.cli"
[project.urls]
"Homepage" = "https://gitlab.softwareheritage.org/swh/devel/swh-model"
"Bug Reports" = "https://gitlab.softwareheritage.org/swh/devel/swh-model/-/issues"
"Funding" = "https://www.softwareheritage.org/donate"
"Documentation" = "https://docs.softwareheritage.org/devel/swh-model/"
"Source" = "https://gitlab.softwareheritage.org/swh/devel/swh-model.git"
[build-system]
requires = ["setuptools", "setuptools-scm"]
build-backend = "setuptools.build_meta"
[tool.setuptools_scm]
fallback_version = "0.0.1"
[tool.black]
target-version = ['py39', 'py310', 'py311', 'py312']
[tool.isort]
multi_line_output = 3
include_trailing_comma = true
force_grid_wrap = 0
use_parentheses = true
ensure_newline_before_comments = true
line_length = 88
force_sort_within_sections = true
known_first_party = ['swh']
[tool.mypy]
namespace_packages = true
warn_unused_ignores = true
explicit_package_bases = true
# ^ Needed for mypy to detect py.typed from swh packages installed
# in editable mode
plugins = []
# 3rd party libraries without stubs (yet)
# [[tool.mypy.overrides]]
# module = [
# "package1.*",
# "package2.*",
# ]
# ignore_missing_imports = true
[tool.flake8]
select = ["C", "E", "F", "W", "B950"]
ignore = [
"E203", # whitespaces before ':' <https://github.com/psf/black/issues/315>
"E231", # missing whitespace after ','
"E501", # line too long, use B950 warning from flake8-bugbear instead
"W503" # line break before binary operator <https://github.com/psf/black/issues/52>
]
max-line-length = 88
[tool.pytest.ini_options]
addopts = "--doctest-modules -p no:pytest_swh_core"
norecursedirs = "build docs .*"
asyncio_mode = "strict"
consider_namespace_packages = true
markers = [
"requires_optional_deps: tests in test_cli.py that should not run if optional dependencies are not installed",
]
[pytest]
addopts = --doctest-modules
norecursedirs = docs
swh.core >= 0.3
Click
dulwich