(last updated 2020-04-28)
Scheme name: swh
Status: Provisional
Applications/protocols that use this scheme name:
Software Heritage: https://www.softwareheritage.org/
Software Package Data Exchange: https://spdx.org/
NTIA: https://www.ntia.doc.gov/SoftwareTransparency
Identifiers.org: http://identifiers.org/
Name-to-Thing (N2T): https://n2t.net/
HAL: https://hal.archives-ouvertes.fr/
Contact: Stefano Zacchiroli <zack@upsilon.cc>
Change controller: Software Heritage <info@softwareheritage.org>
References:
Scheme specification: https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
The Software Heritage project: https://www.softwareheritage.org/
The Software Heritage archive: https://archive.softwareheritage.org/
Publications:
Roberto Di Cosmo, Morane Gruenpeter, Stefano Zacchiroli. Referencing Source
Code Artifacts: a Separate Concern in Software Citation. In Computing in
Science and Engineering, volume 22, issue 2, pp. 33-43. ISSN 1521-9615,
IEEE. March 2020. DOI 10.1109/MCSE.2019.2963148
Roberto Di Cosmo, Morane Gruenpeter, Stefano Zacchiroli. Identifiers for
Digital Objects: the Case of Software Source Code Preservation. In
proceedings of iPRES 2018: 15th International Conference on Digital
Preservation. September 2018. DOI 10.17605/OSF.IO/KDE56
(file created 2020-04-28)
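For illustration, a minimal sketch of parsing an identifier in the swh scheme with the swh.model API referenced by the scheme specification above (the sample SWHID is the one used throughout the SWHID documentation; the exact attribute names are assumptions based on swh.model.swhids):

# Sketch: parsing a "swh" scheme identifier with swh.model.
from swh.model.swhids import CoreSWHID

swhid = CoreSWHID.from_string(
    "swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2"
)
print(swhid.object_type)      # ObjectType.CONTENT
print(swhid.object_id.hex())  # sha1_git of the object, as hex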
@@ -6,20 +6,15 @@ BUILD_TARGETS += $(MERKLE_DAG)
all: $(BUILD_TARGETS)
# dia exporters
%.eps: %.dia
	dia -t eps --export $@ $<
%.svg: %.dia
	dia -t svg --export $@ $<
# generic converters
%.pdf: %.eps
	epstopdf $<
	dia -e $@ $<
%.pdf: %.svg
	set -e; if [ $$(inkscape --version 2>/dev/null | grep -Eo '[0-9]+' | head -1) -gt 0 ]; then \
		inkscape -o $@ $< ; \
	else \
		inkscape -A $@ $< ; \
	fi
clean:
	-rm -f $(BUILD_TARGETS)
.. _swh-model:
Software Heritage - Development Documentation
=============================================
.. include:: README.rst
.. toctree::
:maxdepth: 2
:caption: Contents:
:caption: Overview:
:titlesonly:
data-model
persistent-identifiers
cli
Overview
--------
.. only:: standalone_package_doc
* :ref:`data-model`
* :ref:`persistent-identifiers`
Indices and tables
------------------
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
[project]
name = "swh.model"
authors = [
{name="Software Heritage developers", email="swh-devel@inria.fr"},
]
description = "Software Heritage data model"
readme = {file = "README.rst", content-type = "text/x-rst"}
requires-python = ">=3.7"
classifiers = [
"Programming Language :: Python :: 3",
"Intended Audience :: Developers",
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
"Operating System :: OS Independent",
"Development Status :: 5 - Production/Stable",
]
dynamic = ["version", "dependencies", "optional-dependencies"]
[tool.setuptools.packages.find]
include = ["swh.*"]
[tool.setuptools.dynamic]
dependencies = {file = ["requirements.txt"]}
[tool.setuptools.dynamic.optional-dependencies]
cli = {file = "requirements-cli.txt"}
testing = {file = ["requirements-cli.txt", "requirements-test.txt"]}
testing_minimal = {file = "requirements-test.txt"}
[project.entry-points.console_scripts]
"swh.identify" = "swh.model.cli:identify"
[project.entry-points."swh.cli.subcommands"]
"swh.model" = "swh.model.cli"
[project.urls]
"Homepage" = "https://gitlab.softwareheritage.org/swh/devel/swh-model"
"Bug Reports" = "https://gitlab.softwareheritage.org/swh/devel/swh-model/-/issues"
"Funding" = "https://www.softwareheritage.org/donate"
"Documentation" = "https://docs.softwareheritage.org/devel/swh-model/"
"Source" = "https://gitlab.softwareheritage.org/swh/devel/swh-model.git"
[build-system]
requires = ["setuptools", "setuptools-scm"]
build-backend = "setuptools.build_meta"
[tool.setuptools_scm]
fallback_version = "0.0.1"
[tool.black]
target-version = ['py39', 'py310', 'py311', 'py312']
[tool.isort]
multi_line_output = 3
include_trailing_comma = true
force_grid_wrap = 0
use_parentheses = true
ensure_newline_before_comments = true
line_length = 88
force_sort_within_sections = true
known_first_party = ['swh']
[tool.mypy]
namespace_packages = true
warn_unused_ignores = true
explicit_package_bases = true
# ^ Needed for mypy to detect py.typed from swh packages installed
# in editable mode
plugins = []
# 3rd party libraries without stubs (yet)
# [[tool.mypy.overrides]]
# module = [
# "package1.*",
# "package2.*",
# ]
# ignore_missing_imports = true
[tool.flake8]
select = ["C", "E", "F", "W", "B950"]
ignore = [
"E203", # whitespaces before ':' <https://github.com/psf/black/issues/315>
"E231", # missing whitespace after ','
"E501", # line too long, use B950 warning from flake8-bugbear instead
"W503" # line break before binary operator <https://github.com/psf/black/issues/52>
]
max-line-length = 88
[tool.pytest.ini_options]
addopts = "--doctest-modules -p no:pytest_swh_core"
norecursedirs = "build docs .*"
asyncio_mode = "strict"
consider_namespace_packages = true
markers = [
"requires_optional_deps: tests in test_cli.py that should not run if optional dependencies are not installed",
]
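As a sketch of how the entry points declared above are consumed at runtime (assuming Python >= 3.10 for the group keyword of entry_points; the plugin host shown here is hypothetical):

# Sketch: discovering the "swh.cli.subcommands" entry point from pyproject.toml.
from importlib.metadata import entry_points

for ep in entry_points(group="swh.cli.subcommands"):
    plugin = ep.load()  # imports the swh.model.cli module
    print(ep.name, "->", plugin.__name__)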
swh.core >= 0.3
Click
dulwich
aiohttp
click
pytest >= 8.1
pytz
types-click
types-python-dateutil
types-pytz
types-deprecated
# Add here external Python modules dependencies, one per line. Module names
# should match https://pypi.python.org/pypi names. For the full spec of
# dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html
vcversioner
Click
attrs != 21.1.0 # https://github.com/python-attrs/attrs/issues/804
attrs_strict >= 0.0.7
deprecated
hypothesis
iso8601
python-dateutil
typing_extensions
import hashlib
from setuptools import setup, find_packages
def parse_requirements():
requirements = []
for reqf in ('requirements.txt', 'requirements-swh.txt'):
with open(reqf) as f:
for line in f.readlines():
line = line.strip()
if not line or line.startswith('#'):
continue
requirements.append(line)
return requirements
extra_requirements = []
pyblake2_hash_sets = [
# Built-in implementation in Python 3.6+
{'blake2s', 'blake2b'},
# Potentially shipped by OpenSSL 1.1 (e.g. Python 3.5 in Debian stretch
# has these)
{'blake2s256', 'blake2b512'},
]
for pyblake2_hashes in pyblake2_hash_sets:
if not pyblake2_hashes - set(hashlib.algorithms_available):
# The required blake2 hashes have been found
break
else:
# None of the possible sets of blake2 hashes are available.
# use pyblake2 instead
extra_requirements.append('pyblake2')
setup(
name='swh.model',
description='Software Heritage data model',
author='Software Heritage developers',
author_email='swh-devel@inria.fr',
url='https://forge.softwareheritage.org/diffusion/DMOD/',
packages=find_packages(), # packages's modules
scripts=[], # scripts to package
install_requires=parse_requirements() + extra_requirements,
entry_points='''
[console_scripts]
swh-identify=swh.model.cli:identify
''',
setup_requires=['vcversioner'],
vcversioner={},
include_package_data=True,
)
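The for/else idiom in the blake2 detection above is easy to misread: the else branch runs only when the loop finishes without hitting break. A standalone sketch of the same check:

# Standalone illustration of the blake2 availability check in setup.py.
import hashlib

candidates = [{"blake2s", "blake2b"}, {"blake2s256", "blake2b512"}]
for names in candidates:
    if names <= set(hashlib.algorithms_available):  # all names available?
        print("native blake2 support found:", sorted(names))
        break
else:  # no break: none of the candidate sets is fully available
    print("no native blake2; pyblake2 would be added to the requirements")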
__path__ = __import__('pkgutil').extend_path(__path__, __name__)
# Copyright (C) 2018 The Software Heritage developers
# Copyright (C) 2018-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import click
import os
import sys
from typing import Callable, Dict, Iterable, Optional
from swh.model import identifiers as pids
from swh.model.exceptions import ValidationError
from swh.model.from_disk import Content, Directory
# WARNING: do not import unnecessary things here to keep cli startup time under
# control
try:
import click
except ImportError:
print(
"Cannot run swh-identify; the Click package is not installed."
"Please install 'swh.model[cli]' for full functionality.",
file=sys.stderr,
)
sys.exit(1)
try:
import swh.core.cli
class PidParamType(click.ParamType):
name = 'persistent identifier'
cli_command = swh.core.cli.swh.command
except ImportError:
# stub so that swh-identify can be used when swh-core isn't installed
cli_command = click.command
from swh.model.from_disk import Directory
from swh.model.swhids import CoreSWHID
CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
# Mapping between dulwich types and Software Heritage ones. Used by snapshot ID
# computation.
_DULWICH_TYPES = {
b"blob": "content",
b"tree": "directory",
b"commit": "revision",
b"tag": "release",
}
class CoreSWHIDParamType(click.ParamType):
"""Click argument that accepts a core SWHID and returns them as
:class:`swh.model.swhids.CoreSWHID` instances"""
name = "SWHID"
def convert(self, value, param, ctx) -> CoreSWHID:
from swh.model.exceptions import ValidationError
def convert(self, value, param, ctx):
try:
pids.parse_persistent_identifier(value)
return value # return as string, as we need just that
return CoreSWHID.from_string(value)
except ValidationError as e:
self.fail('%s is not a valid PID. %s.' % (value, e), param, ctx)
self.fail(f'"{value}" is not a valid core SWHID: {e}', param, ctx)
def swhid_of_file(path) -> CoreSWHID:
from swh.model.from_disk import Content
object = Content.from_file(path=path)
return object.swhid()
def swhid_of_file_content(data) -> CoreSWHID:
from swh.model.from_disk import Content
object = Content.from_bytes(mode=644, data=data)
return object.swhid()
def model_of_dir(
path: bytes,
exclude_patterns: Optional[Iterable[bytes]] = None,
update_info: Optional[Callable[[int], None]] = None,
) -> Directory:
from swh.model.from_disk import accept_all_paths, ignore_directories_patterns
path_filter = (
ignore_directories_patterns(path, exclude_patterns)
if exclude_patterns
else accept_all_paths
)
return Directory.from_disk(
path=path, path_filter=path_filter, progress_callback=update_info
)
def swhid_of_dir(
path: bytes, exclude_patterns: Optional[Iterable[bytes]] = None
) -> CoreSWHID:
obj = model_of_dir(path, exclude_patterns)
return obj.swhid()
def swhid_of_origin(url):
from swh.model.model import Origin
return Origin(url).swhid()
def swhid_of_git_repo(path) -> CoreSWHID:
try:
import dulwich.repo
except ImportError:
raise click.ClickException(
"Cannot compute snapshot identifier; the Dulwich package is not installed. "
"Please install 'swh.model[cli]' for full functionality.",
)
def pid_of_file(path):
object = Content.from_file(path=path).get_data()
return pids.persistent_identifier(pids.CONTENT, object)
from swh.model import hashutil
from swh.model.model import Snapshot
repo = dulwich.repo.Repo(path)
def pid_of_dir(path):
object = Directory.from_disk(path=path).get_data()
return pids.persistent_identifier(pids.DIRECTORY, object)
branches: Dict[bytes, Optional[Dict]] = {}
for ref, target in repo.refs.as_dict().items():
obj = repo[target]
if obj:
branches[ref] = {
"target": hashutil.bytehex_to_hash(target),
"target_type": _DULWICH_TYPES[obj.type_name],
}
else:
branches[ref] = None
for ref, target in repo.refs.get_symrefs().items():
branches[ref] = {
"target": target,
"target_type": "alias",
}
snapshot = {"branches": branches}
return Snapshot.from_dict(snapshot).swhid()
def identify_object(
obj_type: str, follow_symlinks: bool, exclude_patterns: Iterable[bytes], obj
) -> str:
from urllib.parse import urlparse
if obj_type == "auto":
if obj == "-" or os.path.isfile(obj):
obj_type = "content"
elif os.path.isdir(obj):
obj_type = "directory"
else:
try: # URL parsing
if urlparse(obj).scheme:
obj_type = "origin"
else:
raise ValueError
except ValueError:
raise click.BadParameter("cannot detect object type for %s" % obj)
if obj == "-":
content = sys.stdin.buffer.read()
swhid = str(swhid_of_file_content(content))
elif obj_type in ["content", "directory"]:
path = obj.encode(sys.getfilesystemencoding())
if follow_symlinks and os.path.islink(obj):
path = os.path.realpath(obj)
if obj_type == "content":
swhid = str(swhid_of_file(path))
elif obj_type == "directory":
swhid = str(swhid_of_dir(path, exclude_patterns))
elif obj_type == "origin":
swhid = str(swhid_of_origin(obj))
elif obj_type == "snapshot":
swhid = str(swhid_of_git_repo(obj))
else: # shouldn't happen, due to option validation
raise click.BadParameter("invalid object type: " + obj_type)
@click.command()
@click.option('--type', '-t', default='auto',
type=click.Choice(['auto', 'content', 'directory']),
help='type of object to identify (default: auto)')
@click.option('--verify', '-v', metavar='PID', type=PidParamType(),
help='reference identifier to be compared with computed one')
@click.argument('object',
type=click.Path(exists=True, readable=True,
allow_dash=True, path_type=bytes))
def identify(type, verify, object):
"""Compute the Software Heritage persistent identifier (PID) for a given
source code object.
# note: we return original obj instead of path here, to preserve user-given
# file name in output
return swhid
For more details about Software Heritage PIDs see:
\b
@cli_command(context_settings=CONTEXT_SETTINGS)
@click.option(
"--dereference/--no-dereference",
"follow_symlinks",
default=True,
help="follow (or not) symlinks for OBJECTS passed as arguments "
+ "(default: follow)",
)
@click.option(
"--filename/--no-filename",
"show_filename",
default=True,
help="show/hide file name (default: show)",
)
@click.option(
"--type",
"-t",
"obj_type",
default="auto",
type=click.Choice(["auto", "content", "directory", "origin", "snapshot"]),
help="type of object to identify (default: auto)",
)
@click.option(
"--exclude",
"-x",
"exclude_patterns",
metavar="PATTERN",
multiple=True,
help="Exclude directories using glob patterns \
(e.g., ``*.git`` to exclude all .git directories)",
)
@click.option(
"--verify",
"-v",
metavar="SWHID",
type=CoreSWHIDParamType(),
help="reference identifier to be compared with computed one",
)
@click.option(
"-r",
"--recursive",
is_flag=True,
help="compute SWHID recursively",
)
@click.argument("objects", nargs=-1, required=True)
def identify(
obj_type,
verify,
show_filename,
follow_symlinks,
objects,
exclude_patterns,
recursive,
):
"""Compute the Software Heritage persistent identifier (SWHID) for the given
source code object(s).
For more details about SWHIDs see:
https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
\b
Examples:
Tip: you can pass "-" to identify the content of standard input.
Examples::
\b
$ swh-identify /usr/src/linux/kernel/
$ swh identify fork.c kmod.c sched/deadline.c
swh:1:cnt:2e391c754ae730bd2d8520c2ab497c403220c6e3 fork.c
swh:1:cnt:0277d1216f80ae1adeed84a686ed34c9b2931fc2 kmod.c
swh:1:cnt:57b939c81bce5d06fa587df8915f05affbe22b82 sched/deadline.c
$ swh identify --no-filename /usr/src/linux/kernel/
swh:1:dir:f9f858a48d663b3809c9e2f336412717496202ab
\b
$ swh-identify /usr/src/linux/kernel/sched/deadline.c
swh:1:cnt:57b939c81bce5d06fa587df8915f05affbe22b82
$ git clone --mirror https://forge.softwareheritage.org/source/helloworld.git
$ swh identify --type snapshot helloworld.git/
swh:1:snp:510aa88bdc517345d258c1fc2babcd0e1f905e93 helloworld.git
"""
if type == 'auto':
if os.path.isfile(object):
type = 'content'
elif os.path.isdir(object):
type = 'directory'
else: # shouldn't happen, due to path validation
raise click.BadParameter('%s is neither a file nor a directory' %
object)
pid = None
if type == 'content':
pid = pid_of_file(object)
elif type == 'directory':
pid = pid_of_dir(object)
else: # shouldn't happen, due to option validation
raise click.BadParameter('invalid object type: ' + type)
from functools import partial
import logging
if verify:
if verify == pid:
click.echo('PID match: %s' % pid)
sys.exit(0)
else:
click.echo('PID mismatch: %s != %s' % (verify, pid))
sys.exit(1)
if exclude_patterns:
exclude_patterns = set(pattern.encode() for pattern in exclude_patterns)
if verify and len(objects) != 1:
raise click.BadParameter("verification requires a single object")
if recursive and not os.path.isdir(objects[0]):
recursive = False
logging.warning("recursive option disabled, input is not a directory object")
if recursive:
if verify:
raise click.BadParameter(
"verification of recursive object identification is not supported"
)
if obj_type not in ("auto", "directory"):
raise click.BadParameter(
"recursive identification is supported only for directories"
)
path = os.fsencode(objects[0])
dir_obj = model_of_dir(path, exclude_patterns)
for sub_obj in dir_obj.iter_tree():
path_name = "path" if "path" in sub_obj.data.keys() else "data"
path = os.fsdecode(sub_obj.data[path_name])
swhid = str(sub_obj.swhid())
msg = f"{swhid}\t{path}" if show_filename else f"{swhid}"
click.echo(msg)
else:
click.echo(pid)
results = zip(
objects,
map(
partial(identify_object, obj_type, follow_symlinks, exclude_patterns),
objects,
),
)
if verify:
swhid = next(results)[1]
if str(verify) == swhid:
click.echo("SWHID match: %s" % swhid)
sys.exit(0)
else:
click.echo("SWHID mismatch: %s != %s" % (verify, swhid))
sys.exit(1)
else:
for obj, swhid in results:
msg = swhid
if show_filename:
msg = "%s\t%s" % (swhid, os.fsdecode(obj))
click.echo(msg)
if __name__ == '__main__':
if __name__ == "__main__":
identify()
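A quick programmatic sketch of the content helper defined above; the expected output assumes content SWHIDs are computed like git blob hashes (as the SWHID specification documents), so the digest below is the well-known git hash of b"hello\n":

# Sketch: using swhid_of_file_content outside the CLI.
from swh.model.cli import swhid_of_file_content

print(swhid_of_file_content(b"hello\n"))
# expected: swh:1:cnt:ce013625030ba8dba906f756967f9e9ca394464a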
# Copyright (C) 2020-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from __future__ import annotations
"""Utility data structures."""
from collections.abc import Mapping
import copy
from typing import Dict, Generic, Iterable, Optional, Tuple, TypeVar, Union
KT = TypeVar("KT")
VT = TypeVar("VT")
class ImmutableDict(Mapping, Generic[KT, VT]):
"""A frozen dictionary.
This class behaves like a dictionary, but internally stores objects in a tuple,
so it is both immutable and hashable."""
_data: Dict[KT, VT]
def __init__(
self,
data: Union[Iterable[Tuple[KT, VT]], ImmutableDict[KT, VT], Dict[KT, VT]] = {},
):
if isinstance(data, dict):
self._data = data
elif isinstance(data, ImmutableDict):
self._data = data._data
else:
self._data = {k: v for k, v in data}
@property
def data(self):
return tuple(self._data.items())
def __repr__(self):
return f"ImmutableDict({dict(self.data)!r})"
def __getitem__(self, key):
return self._data[key]
def __iter__(self):
for k, v in self.data:
yield k
def __len__(self):
return len(self._data)
def items(self):
yield from self.data
def __hash__(self):
return hash(tuple(sorted(self.data)))
def copy_pop(self, popped_key) -> Tuple[Optional[VT], ImmutableDict[KT, VT]]:
"""Returns a copy of this ImmutableDict without the given key,
as well as the value associated to the key."""
new_items = copy.deepcopy(self._data)
popped_value: Optional[VT] = new_items.pop(popped_key, None)
return (popped_value, ImmutableDict(new_items))
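A short usage sketch of ImmutableDict as defined above:

# Usage sketch for ImmutableDict.
d = ImmutableDict({"a": 1, "b": 2})
assert d["a"] == 1
# __hash__ sorts the items first, so insertion order does not matter:
assert hash(d) == hash(ImmutableDict((("b", 2), ("a", 1))))
value, rest = d.copy_pop("a")
assert value == 1 and dict(rest) == {"b": 2}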
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Primitives for finding unknown content efficiently."""
from __future__ import annotations
from collections import namedtuple
import itertools
import logging
import random
from typing import (
Any,
Callable,
Iterable,
List,
Mapping,
NamedTuple,
Optional,
Set,
Union,
)
from typing_extensions import Protocol, runtime_checkable
from .from_disk import model
from .model import Sha1Git
logger = logging.getLogger(__name__)
# Maximum amount when sampling from the undecided set of directory entries
SAMPLE_SIZE = 1000
# Sets of sha1 of contents, skipped contents and directories respectively
Sample: NamedTuple = namedtuple(
"Sample", ["contents", "skipped_contents", "directories"]
)
@runtime_checkable
class ArchiveDiscoveryInterface(Protocol):
"""Interface used in discovery code to abstract over ways of connecting to
the SWH archive (direct storage, web API, etc.) for all methods needed by
discovery algorithms."""
contents: List[model.Content]
skipped_contents: List[model.SkippedContent]
directories: List[model.Directory]
def __init__(
self,
contents: List[model.Content],
skipped_contents: List[model.SkippedContent],
directories: List[model.Directory],
) -> None:
self.contents = contents
self.skipped_contents = skipped_contents
self.directories = directories
def content_missing(self, contents: List[Sha1Git]) -> Iterable[Sha1Git]:
"""List content missing from the archive by sha1"""
def skipped_content_missing(
self, skipped_contents: List[Sha1Git]
) -> Iterable[Sha1Git]:
"""List skipped content missing from the archive by sha1"""
def directory_missing(self, directories: List[Sha1Git]) -> Iterable[Sha1Git]:
"""List directories missing from the archive by sha1"""
class BaseDiscoveryGraph:
"""Creates the base structures and methods needed for discovery algorithms.
Subclasses should override ``get_sample`` to affect how the discovery is made.
The `update_info_callback` is an optional argument that will get called for
each new piece of information we get. The callback arguments are `(content,
known)`.
- content: the relevant model.Content object,
- known: a boolean, True if the file is known to the archive, False otherwise.
"""
def __init__(
self,
contents,
skipped_contents,
directories,
update_info_callback: Optional[Callable[[Any, bool], None]] = None,
):
self._all_contents: Mapping[
Sha1Git, Union[model.Content, model.SkippedContent]
] = {}
self._undecided_directories: Set[Sha1Git] = set()
self._children: Mapping[Sha1Git, Set[Sha1Git]] = {}
self._parents: Mapping[model.DirectoryEntry, Set[Any]] = {}
self.undecided: Set[Sha1Git] = set()
for content in itertools.chain(contents, skipped_contents):
self.undecided.add(content.sha1_git)
self._all_contents[content.sha1_git] = content
for directory in directories:
self.undecided.add(directory.id)
self._undecided_directories.add(directory.id)
self._children[directory.id] = {c.target for c in directory.entries}
for child in directory.entries:
self._parents.setdefault(child.target, set()).add(directory.id)
self.undecided |= self._undecided_directories
self.known: Set[Sha1Git] = set()
self.unknown: Set[Sha1Git] = set()
self._update_info_callback = update_info_callback
self._sha1_to_obj = {}
for content in itertools.chain(contents, skipped_contents):
self._sha1_to_obj[content.sha1_git] = content
for directory in directories:
self._sha1_to_obj[directory.id] = directory
def mark_known(self, entries: Iterable[Sha1Git]):
"""Mark ``entries`` and those they imply as known in the SWH archive"""
self._mark_entries(entries, self._children, self.known)
def mark_unknown(self, entries: Iterable[Sha1Git]):
"""Mark ``entries`` and those they imply as unknown in the SWH archive"""
self._mark_entries(entries, self._parents, self.unknown)
def _mark_entries(
self,
entries: Iterable[Sha1Git],
transitive_mapping: Mapping[Any, Any],
target_set: Set[Any],
):
"""Use Merkle graph properties to mark a directory entry as known or unknown.
If an entry is known, then all of its descendants are known. If it's
unknown, then all of its ancestors are unknown.
- ``entries``: directory entries to mark along with their ancestors/descendants
where applicable.
- ``transitive_mapping``: mapping from an entry to the next entries to mark
in the hierarchy, if any.
- ``target_set``: set where marked entries will be added.
"""
callback = self._update_info_callback
to_process = set(entries)
while to_process:
current = to_process.pop()
target_set.add(current)
new = current in self.undecided
self.undecided.discard(current)
self._undecided_directories.discard(current)
next_entries = transitive_mapping.get(current, set()) & self.undecided
to_process.update(next_entries)
if new and callback is not None:
obj = self._sha1_to_obj[current]
callback(obj, current in self.known)
def get_sample(
self,
) -> Sample:
"""Return a three-tuple of samples from the undecided sets of contents,
skipped contents and directories respectively.
These samples will be queried against the storage which will tell us
which are known."""
raise NotImplementedError()
def do_query(self, archive: ArchiveDiscoveryInterface, sample: Sample) -> None:
"""Given a three-tuple of samples, ask the archive which are known or
unknown and mark them as such."""
methods = (
archive.content_missing,
archive.skipped_content_missing,
archive.directory_missing,
)
for sample_per_type, method in zip(sample, methods):
if not sample_per_type:
continue
known = set(sample_per_type)
unknown = set(method(list(sample_per_type)))
known -= unknown
self.mark_known(known)
self.mark_unknown(unknown)
class RandomDirSamplingDiscoveryGraph(BaseDiscoveryGraph):
"""Use a random sampling using only directories.
This allows us to find a statistically good spread of entries in the graph
with a smaller population than using all types of entries. When there are
no more directories, only contents or skipped contents are undecided if any
are left: we send them directly to the storage since they should be few and
their structure flat."""
def get_sample(self) -> Sample:
if self._undecided_directories:
if len(self._undecided_directories) <= SAMPLE_SIZE:
return Sample(
contents=set(),
skipped_contents=set(),
directories=set(self._undecided_directories),
)
sample = random.sample(tuple(self._undecided_directories), SAMPLE_SIZE)
directories = {o for o in sample}
return Sample(
contents=set(), skipped_contents=set(), directories=directories
)
contents = set()
skipped_contents = set()
for sha1 in self.undecided:
obj = self._all_contents[sha1]
obj_type = obj.object_type
if obj_type == model.Content.object_type:
contents.add(sha1)
elif obj_type == model.SkippedContent.object_type:
skipped_contents.add(sha1)
else:
raise TypeError(f"Unexpected object type {obj_type}")
return Sample(
contents=contents, skipped_contents=skipped_contents, directories=set()
)
def filter_known_objects(
archive: ArchiveDiscoveryInterface,
update_info_callback: Optional[Callable[[Any, bool], None]] = None,
):
"""Filter ``archive``'s ``contents``, ``skipped_contents`` and ``directories``
to only return those that are unknown to the SWH archive using a discovery
algorithm.
The `update_info_callback` is an optional argument that will get called for
each new piece of information we get. The callback arguments are `(content,
known)`.
- content: the relevant model.Content object,
- known: a boolean, True if the file is known to the archive, False otherwise.
"""
contents = archive.contents
skipped_contents = archive.skipped_contents
directories = archive.directories
contents_count = len(contents)
skipped_contents_count = len(skipped_contents)
directories_count = len(directories)
graph = RandomDirSamplingDiscoveryGraph(
contents,
skipped_contents,
directories,
update_info_callback=update_info_callback,
)
while graph.undecided:
sample = graph.get_sample()
graph.do_query(archive, sample)
contents = [c for c in contents if c.sha1_git in graph.unknown]
skipped_contents = [c for c in skipped_contents if c.sha1_git in graph.unknown]
directories = [c for c in directories if c.id in graph.unknown]
logger.debug(
"Filtered out %d contents, %d skipped contents and %d directories",
contents_count - len(contents),
skipped_contents_count - len(skipped_contents),
directories_count - len(directories),
)
return (contents, skipped_contents, directories)
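To make the control flow concrete, here is a sketch running the filter against a stub archive that reports every object as missing (FakeArchive is hypothetical; it only provides the attributes and *_missing methods that ArchiveDiscoveryInterface requires):

# Sketch: exercising filter_known_objects with a stub archive.
from swh.model.model import Content

class FakeArchive:
    """Stub archive that knows nothing: every queried id is reported missing."""

    def __init__(self, contents, skipped_contents, directories):
        self.contents = contents
        self.skipped_contents = skipped_contents
        self.directories = directories

    def content_missing(self, contents):
        return iter(contents)

    def skipped_content_missing(self, skipped_contents):
        return iter(skipped_contents)

    def directory_missing(self, directories):
        return iter(directories)

archive = FakeArchive([Content.from_data(b"hello\n")], [], [])
contents, skipped_contents, directories = filter_known_objects(archive)
assert len(contents) == 1  # nothing is known, so nothing gets filtered out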
@@ -33,11 +33,12 @@
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
NON_FIELD_ERRORS = '__all__'
NON_FIELD_ERRORS = "__all__"
class ValidationError(Exception):
"""An error while validating data."""
def __init__(self, message, code=None, params=None):
"""
The `message` argument can be a single error, a list of errors, or a
@@ -54,16 +55,15 @@ class ValidationError(Exception):
message = message[0]
if isinstance(message, ValidationError):
if hasattr(message, 'error_dict'):
if hasattr(message, "error_dict"):
message = message.error_dict
# PY2 has a `message` property which is always there so we can't
# duck-type on it. It was introduced in Python 2.5 and already
# deprecated in Python 2.6.
elif not hasattr(message, 'message'):
elif not hasattr(message, "message"):
message = message.error_list
else:
message, code, params = (message.message, message.code,
message.params)
message, code, params = (message.message, message.code, message.params)
if isinstance(message, dict):
self.error_dict = {}
@@ -78,9 +78,8 @@ class ValidationError(Exception):
# Normalize plain strings to instances of ValidationError.
if not isinstance(message, ValidationError):
message = ValidationError(message)
if hasattr(message, 'error_dict'):
self.error_list.extend(sum(message.error_dict.values(),
[]))
if hasattr(message, "error_dict"):
self.error_list.extend(sum(message.error_dict.values(), []))
else:
self.error_list.extend(message.error_list)
@@ -94,18 +93,18 @@ class ValidationError(Exception):
def message_dict(self):
# Trigger an AttributeError if this ValidationError
# doesn't have an error_dict.
getattr(self, 'error_dict')
getattr(self, "error_dict")
return dict(self)
@property
def messages(self):
if hasattr(self, 'error_dict'):
if hasattr(self, "error_dict"):
return sum(dict(self).values(), [])
return list(self)
def update_error_dict(self, error_dict):
if hasattr(self, 'error_dict'):
if hasattr(self, "error_dict"):
for field, error_list in self.error_dict.items():
error_dict.setdefault(field, []).extend(error_list)
else:
@@ -113,7 +112,7 @@ class ValidationError(Exception):
return error_dict
def __iter__(self):
if hasattr(self, 'error_dict'):
if hasattr(self, "error_dict"):
for field, errors in self.error_dict.items():
yield field, list(ValidationError(errors))
else:
@@ -124,9 +123,13 @@ class ValidationError(Exception):
yield message
def __str__(self):
if hasattr(self, 'error_dict'):
if hasattr(self, "error_dict"):
return repr(dict(self))
return repr(list(self))
def __repr__(self):
return 'ValidationError(%s)' % self
return "ValidationError(%s)" % self
class InvalidDirectoryPath(Exception):
pass
@@ -6,8 +6,13 @@
# We do our imports here but we don't use them, so flake8 complains
# flake8: noqa
from .simple import (validate_type, validate_int, validate_str, validate_bytes,
validate_datetime, validate_enum)
from .hashes import (validate_sha1, validate_sha1_git, validate_sha256)
from .compound import (validate_against_schema, validate_all_keys,
validate_any_key)
from .compound import validate_against_schema, validate_all_keys, validate_any_key
from .hashes import validate_sha1, validate_sha1_git, validate_sha256
from .simple import (
validate_bytes,
validate_datetime,
validate_enum,
validate_int,
validate_str,
validate_type,
)
@@ -6,7 +6,7 @@
from collections import defaultdict
import itertools
from ..exceptions import ValidationError, NON_FIELD_ERRORS
from ..exceptions import NON_FIELD_ERRORS, ValidationError
def validate_against_schema(model, schema, value):
@@ -26,19 +26,19 @@ def validate_against_schema(model, schema, value):
if not isinstance(value, dict):
raise ValidationError(
'Unexpected type %(type)s for %(model)s, expected dict',
"Unexpected type %(type)s for %(model)s, expected dict",
params={
'model': model,
'type': value.__class__.__name__,
"model": model,
"type": value.__class__.__name__,
},
code='model-unexpected-type',
code="model-unexpected-type",
)
errors = defaultdict(list)
for key, (mandatory, validators) in itertools.chain(
((k, v) for k, v in schema.items() if k != NON_FIELD_ERRORS),
[(NON_FIELD_ERRORS, (False, schema.get(NON_FIELD_ERRORS, [])))]
[(NON_FIELD_ERRORS, (False, schema.get(NON_FIELD_ERRORS, [])))],
):
if not validators:
continue
@@ -54,9 +54,9 @@ def validate_against_schema(model, schema, value):
if mandatory:
errors[key].append(
ValidationError(
'Field %(field)s is mandatory',
params={'field': key},
code='model-field-mandatory',
"Field %(field)s is mandatory",
params={"field": key},
code="model-field-mandatory",
)
)
@@ -74,19 +74,21 @@ def validate_against_schema(model, schema, value):
else:
if not valid:
errdata = {
'validator': validator.__name__,
"validator": validator.__name__,
}
if key == NON_FIELD_ERRORS:
errmsg = 'Validation of model %(model)s failed in ' \
'%(validator)s'
errdata['model'] = model
errcode = 'model-validation-failed'
errmsg = (
"Validation of model %(model)s failed in " "%(validator)s"
)
errdata["model"] = model
errcode = "model-validation-failed"
else:
errmsg = 'Validation of field %(field)s failed in ' \
'%(validator)s'
errdata['field'] = key
errcode = 'field-validation-failed'
errmsg = (
"Validation of field %(field)s failed in " "%(validator)s"
)
errdata["field"] = key
errcode = "field-validation-failed"
errors[key].append(
ValidationError(errmsg, params=errdata, code=errcode)
@@ -102,11 +104,11 @@ def validate_all_keys(value, keys):
"""Validate that all the given keys are present in value"""
missing_keys = set(keys) - set(value)
if missing_keys:
missing_fields = ', '.join(sorted(missing_keys))
missing_fields = ", ".join(sorted(missing_keys))
raise ValidationError(
'Missing mandatory fields %(missing_fields)s',
params={'missing_fields': missing_fields},
code='missing-mandatory-field'
"Missing mandatory fields %(missing_fields)s",
params={"missing_fields": missing_fields},
code="missing-mandatory-field",
)
return True
@@ -116,11 +118,11 @@ def validate_any_key(value, keys):
"""Validate that any of the given keys is present in value"""
present_keys = set(keys) & set(value)
if not present_keys:
missing_fields = ', '.join(sorted(keys))
missing_fields = ", ".join(sorted(keys))
raise ValidationError(
'Must contain one of the alternative fields %(missing_fields)s',
params={'missing_fields': missing_fields},
code='missing-alternative-field',
"Must contain one of the alternative fields %(missing_fields)s",
params={"missing_fields": missing_fields},
code="missing-alternative-field",
)
return True
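From the loop at the top of validate_against_schema above, a schema maps each field name to a (mandatory, validators) pair. A hedged sketch, assuming each validator takes the field value as its only argument (as the single-value validators in this package do):

# Hypothetical sketch of the schema shape consumed by validate_against_schema.
from swh.model.fields import validate_against_schema, validate_int, validate_str

schema = {
    "name": (True, [validate_str]),
    "age": (False, [validate_int]),
}
validate_against_schema("person", schema, {"name": "Ada", "age": 36})  # no error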
@@ -4,6 +4,7 @@
# See top-level LICENSE file for more information
import string
from ..exceptions import ValidationError
@@ -22,22 +23,22 @@ def validate_hash(value, hash_type):
"""
hash_lengths = {
'sha1': 20,
'sha1_git': 20,
'sha256': 32,
"sha1": 20,
"sha1_git": 20,
"sha256": 32,
}
hex_digits = set(string.hexdigits)
if hash_type not in hash_lengths:
raise ValidationError(
'Unexpected hash type %(hash_type)s, expected one of'
' %(hash_types)s',
"Unexpected hash type %(hash_type)s, expected one of" " %(hash_types)s",
params={
'hash_type': hash_type,
'hash_types': ', '.join(sorted(hash_lengths)),
"hash_type": hash_type,
"hash_types": ", ".join(sorted(hash_lengths)),
},
code='unexpected-hash-type')
code="unexpected-hash-type",
)
if isinstance(value, str):
errors = []
@@ -48,10 +49,10 @@ def validate_hash(value, hash_type):
"Unexpected characters `%(unexpected_chars)s' for hash "
"type %(hash_type)s",
params={
'unexpected_chars': ', '.join(sorted(extra_chars)),
'hash_type': hash_type,
"unexpected_chars": ", ".join(sorted(extra_chars)),
"hash_type": hash_type,
},
code='unexpected-hash-contents',
code="unexpected-hash-contents",
)
)
@@ -60,14 +61,14 @@ def validate_hash(value, hash_type):
if length != expected_length:
errors.append(
ValidationError(
'Unexpected length %(length)d for hash type '
'%(hash_type)s, expected %(expected_length)d',
"Unexpected length %(length)d for hash type "
"%(hash_type)s, expected %(expected_length)d",
params={
'length': length,
'expected_length': expected_length,
'hash_type': hash_type,
"length": length,
"expected_length": expected_length,
"hash_type": hash_type,
},
code='unexpected-hash-length',
code="unexpected-hash-length",
)
)
@@ -81,37 +82,37 @@ def validate_hash(value, hash_type):
expected_length = hash_lengths[hash_type]
if length != expected_length:
raise ValidationError(
'Unexpected length %(length)d for hash type '
'%(hash_type)s, expected %(expected_length)d',
"Unexpected length %(length)d for hash type "
"%(hash_type)s, expected %(expected_length)d",
params={
'length': length,
'expected_length': expected_length,
'hash_type': hash_type,
"length": length,
"expected_length": expected_length,
"hash_type": hash_type,
},
code='unexpected-hash-length',
code="unexpected-hash-length",
)
return True
raise ValidationError(
'Unexpected type %(type)s for hash, expected str or bytes',
"Unexpected type %(type)s for hash, expected str or bytes",
params={
'type': value.__class__.__name__,
"type": value.__class__.__name__,
},
code='unexpected-hash-value-type',
code="unexpected-hash-value-type",
)
def validate_sha1(sha1):
"""Validate that sha1 is a valid sha1 hash"""
return validate_hash(sha1, 'sha1')
return validate_hash(sha1, "sha1")
def validate_sha1_git(sha1_git):
"""Validate that sha1_git is a valid sha1_git hash"""
return validate_hash(sha1_git, 'sha1_git')
return validate_hash(sha1_git, "sha1_git")
def validate_sha256(sha256):
"""Validate that sha256 is a valid sha256 hash"""
return validate_hash(sha256, 'sha256')
return validate_hash(sha256, "sha256")
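A short sketch of the validators above; the sample digest is the sha1 of the empty string, i.e. 40 hex characters for a 20-byte hash:

# Sketch: validate_sha1 accepts well-formed digests and raises otherwise.
from swh.model.exceptions import ValidationError
from swh.model.fields import validate_sha1

assert validate_sha1("da39a3ee5e6b4b0d3255bfef95601890afd80709")
try:
    validate_sha1("not-a-hash")
except ValidationError as e:
    print("rejected:", e)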
@@ -13,16 +13,16 @@ def validate_type(value, type):
"""Validate that value is an integer"""
if not isinstance(value, type):
if isinstance(type, tuple):
typestr = 'one of %s' % ', '.join(typ.__name__ for typ in type)
typestr = "one of %s" % ", ".join(typ.__name__ for typ in type)
else:
typestr = type.__name__
raise ValidationError(
'Unexpected type %(type)s, expected %(expected_type)s',
"Unexpected type %(type)s, expected %(expected_type)s",
params={
'type': value.__class__.__name__,
'expected_type': typestr,
"type": value.__class__.__name__,
"expected_type": typestr,
},
code='unexpected-type'
code="unexpected-type",
)
return True
@@ -54,10 +54,12 @@ def validate_datetime(value):
errors.append(e)
if isinstance(value, datetime.datetime) and value.tzinfo is None:
errors.append(ValidationError(
'Datetimes must be timezone-aware in swh',
code='datetime-without-tzinfo',
))
errors.append(
ValidationError(
"Datetimes must be timezone-aware in swh",
code="datetime-without-tzinfo",
)
)
if errors:
raise ValidationError(errors)
@@ -69,12 +71,12 @@ def validate_enum(value, expected_values):
"""Validate that value is contained in expected_values"""
if value not in expected_values:
raise ValidationError(
'Unexpected value %(value)s, expected one of %(expected_values)s',
"Unexpected value %(value)s, expected one of %(expected_values)s",
params={
'value': value,
'expected_values': ', '.join(sorted(expected_values)),
"value": value,
"expected_values": ", ".join(sorted(expected_values)),
},
code='unexpected-value',
code="unexpected-value",
)
return True
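And a one-line sketch for the enum validator above:

# validate_enum returns True on success and raises ValidationError otherwise.
from swh.model.fields import validate_enum

assert validate_enum("visible", {"visible", "hidden"})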