Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Showing 4851 additions and 989 deletions.
swh.core >= 0.3
Click
dulwich
pytest
aiohttp
click
pytest >= 8.1
pytz
types-click
types-python-dateutil
types-pytz
types-deprecated
# Add here external Python modules dependencies, one per line. Module names
# should match https://pypi.python.org/pypi names. For the full spec of
# dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html
vcversioner
attrs
attrs != 21.1.0 # https://github.com/python-attrs/attrs/issues/804
attrs_strict >= 0.0.7
deprecated
hypothesis
iso8601
python-dateutil
typing_extensions
#!/usr/bin/env python3
# Copyright (C) 2015-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from setuptools import setup, find_packages
from os import path
from io import open
here = path.abspath(path.dirname(__file__))
# Get the long description from the README file
with open(path.join(here, 'README.md'), encoding='utf-8') as f:
long_description = f.read()
def parse_requirements(name=None):
if name:
reqf = 'requirements-%s.txt' % name
else:
reqf = 'requirements.txt'
requirements = []
if not path.exists(reqf):
return requirements
with open(reqf) as f:
for line in f.readlines():
line = line.strip()
if not line or line.startswith('#'):
continue
requirements.append(line)
return requirements
blake2_requirements = ['pyblake2;python_version<"3.6"']
setup(
name='swh.model',
description='Software Heritage data model',
long_description=long_description,
long_description_content_type='text/markdown',
author='Software Heritage developers',
author_email='swh-devel@inria.fr',
url='https://forge.softwareheritage.org/diffusion/DMOD/',
packages=find_packages(),
setup_requires=['vcversioner'],
install_requires=(parse_requirements() + parse_requirements('swh') +
blake2_requirements),
extras_require={
'cli': parse_requirements('cli'),
'testing': parse_requirements('test'),
},
vcversioner={},
include_package_data=True,
entry_points='''
[console_scripts]
swh-identify=swh.model.cli:identify
[swh.cli.subcommands]
identify=swh.model.cli:identify
''',
classifiers=[
"Programming Language :: Python :: 3",
"Intended Audience :: Developers",
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
"Operating System :: OS Independent",
"Development Status :: 5 - Production/Stable",
],
project_urls={
'Bug Reports': 'https://forge.softwareheritage.org/maniphest',
'Funding': 'https://www.softwareheritage.org/donate',
'Source': 'https://forge.softwareheritage.org/source/swh-model',
},
)
from pkgutil import extend_path
from typing import Iterable
__path__ = extend_path(__path__, __name__) # type: Iterable[str]
# Copyright (C) 2018-2019 The Software Heritage developers
# Copyright (C) 2018-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import click
import dulwich.repo
import os
import sys
from typing import Callable, Dict, Iterable, Optional
from functools import partial
from urllib.parse import urlparse
# WARNING: do not import unnecessary things here to keep cli startup time under
# control
try:
import click
except ImportError:
print(
"Cannot run swh-identify; the Click package is not installed."
"Please install 'swh.model[cli]' for full functionality.",
file=sys.stderr,
)
sys.exit(1)
from swh.model import hashutil
from swh.model import identifiers as pids
from swh.model.exceptions import ValidationError
from swh.model.from_disk import Content, Directory
try:
import swh.core.cli
cli_command = swh.core.cli.swh.command
except ImportError:
# stub so that swh-identify can be used when swh-core isn't installed
cli_command = click.command
CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])
from swh.model.from_disk import Directory
from swh.model.swhids import CoreSWHID
CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
# Mapping between dulwich types and Software Heritage ones. Used by snapshot ID
# computation.
_DULWICH_TYPES = {
b'blob': 'content',
b'tree': 'directory',
b'commit': 'revision',
b'tag': 'release',
b"blob": "content",
b"tree": "directory",
b"commit": "revision",
b"tag": "release",
}
class PidParamType(click.ParamType):
name = 'persistent identifier'
class CoreSWHIDParamType(click.ParamType):
"""Click argument that accepts a core SWHID and returns them as
:class:`swh.model.swhids.CoreSWHID` instances"""
name = "SWHID"
def convert(self, value, param, ctx) -> CoreSWHID:
from swh.model.exceptions import ValidationError
def convert(self, value, param, ctx):
try:
pids.parse_persistent_identifier(value)
return value # return as string, as we need just that
return CoreSWHID.from_string(value)
except ValidationError as e:
self.fail('%s is not a valid PID. %s.' % (value, e), param, ctx)
self.fail(f'"{value}" is not a valid core SWHID: {e}', param, ctx)
def swhid_of_file(path) -> CoreSWHID:
from swh.model.from_disk import Content
object = Content.from_file(path=path)
return object.swhid()
def swhid_of_file_content(data) -> CoreSWHID:
from swh.model.from_disk import Content
object = Content.from_bytes(mode=644, data=data)
return object.swhid()
def model_of_dir(
path: bytes,
exclude_patterns: Optional[Iterable[bytes]] = None,
update_info: Optional[Callable[[int], None]] = None,
) -> Directory:
from swh.model.from_disk import accept_all_paths, ignore_directories_patterns
path_filter = (
ignore_directories_patterns(path, exclude_patterns)
if exclude_patterns
else accept_all_paths
)
return Directory.from_disk(
path=path, path_filter=path_filter, progress_callback=update_info
)
def pid_of_file(path):
object = Content.from_file(path=path).get_data()
return pids.persistent_identifier(pids.CONTENT, object)
def swhid_of_dir(
path: bytes, exclude_patterns: Optional[Iterable[bytes]] = None
) -> CoreSWHID:
obj = model_of_dir(path, exclude_patterns)
return obj.swhid()
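# Illustrative usage (not part of the diff) of the SWHID helpers above; the
# directory path is hypothetical.
from swh.model.from_disk import Content, Directory

print(Content.from_bytes(mode=644, data=b"hello\n").swhid())
# -> swh:1:cnt:ce013625030ba8dba906f756967f9e9ca394464a (git blob sha1 of b"hello\n")
print(Directory.from_disk(path=b"/tmp/project").swhid())
# -> swh:1:dir:... (depends on the directory contents)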
def pid_of_file_content(data):
object = Content.from_bytes(mode=644, data=data).get_data()
return pids.persistent_identifier(pids.CONTENT, object)
def swhid_of_origin(url):
from swh.model.model import Origin
def pid_of_dir(path):
object = Directory.from_disk(path=path).get_data()
return pids.persistent_identifier(pids.DIRECTORY, object)
return Origin(url).swhid()
def pid_of_origin(url):
pid = pids.PersistentId(object_type='origin',
object_id=pids.origin_identifier({'url': url}))
return str(pid)
def swhid_of_git_repo(path) -> CoreSWHID:
try:
import dulwich.repo
except ImportError:
raise click.ClickException(
"Cannot compute snapshot identifier; the Dulwich package is not installed. "
"Please install 'swh.model[cli]' for full functionality.",
)
from swh.model import hashutil
from swh.model.model import Snapshot
def pid_of_git_repo(path):
repo = dulwich.repo.Repo(path)
branches = {}
branches: Dict[bytes, Optional[Dict]] = {}
for ref, target in repo.refs.as_dict().items():
obj = repo[target]
if obj:
branches[ref] = {
'target': hashutil.bytehex_to_hash(target),
'target_type': _DULWICH_TYPES[obj.type_name],
"target": hashutil.bytehex_to_hash(target),
"target_type": _DULWICH_TYPES[obj.type_name],
}
else:
branches[ref] = None
for ref, target in repo.refs.get_symrefs().items():
branches[ref] = {
'target': target,
'target_type': 'alias',
"target": target,
"target_type": "alias",
}
snapshot = {'branches': branches}
snapshot = {"branches": branches}
pid = pids.PersistentId(object_type='snapshot',
object_id=pids.snapshot_identifier(snapshot))
return str(pid)
return Snapshot.from_dict(snapshot).swhid()
def identify_object(obj_type, follow_symlinks, obj):
if obj_type == 'auto':
if obj == '-' or os.path.isfile(obj):
obj_type = 'content'
def identify_object(
obj_type: str, follow_symlinks: bool, exclude_patterns: Iterable[bytes], obj
) -> str:
from urllib.parse import urlparse
if obj_type == "auto":
if obj == "-" or os.path.isfile(obj):
obj_type = "content"
elif os.path.isdir(obj):
obj_type = 'directory'
obj_type = "directory"
else:
try: # URL parsing
if urlparse(obj).scheme:
obj_type = 'origin'
obj_type = "origin"
else:
raise ValueError
except ValueError:
raise click.BadParameter('cannot detect object type for %s' %
obj)
pid = None
raise click.BadParameter("cannot detect object type for %s" % obj)
if obj == '-':
if obj == "-":
content = sys.stdin.buffer.read()
pid = pid_of_file_content(content)
elif obj_type in ['content', 'directory']:
swhid = str(swhid_of_file_content(content))
elif obj_type in ["content", "directory"]:
path = obj.encode(sys.getfilesystemencoding())
if follow_symlinks and os.path.islink(obj):
path = os.path.realpath(obj)
if obj_type == 'content':
pid = pid_of_file(path)
elif obj_type == 'directory':
pid = pid_of_dir(path)
elif obj_type == 'origin':
pid = pid_of_origin(obj)
elif obj_type == 'snapshot':
pid = pid_of_git_repo(obj)
if obj_type == "content":
swhid = str(swhid_of_file(path))
elif obj_type == "directory":
swhid = str(swhid_of_dir(path, exclude_patterns))
elif obj_type == "origin":
swhid = str(swhid_of_origin(obj))
elif obj_type == "snapshot":
swhid = str(swhid_of_git_repo(obj))
else: # shouldn't happen, due to option validation
raise click.BadParameter('invalid object type: ' + obj_type)
raise click.BadParameter("invalid object type: " + obj_type)
# note: we return original obj instead of path here, to preserve user-given
# file name in output
return (obj, pid)
@click.command(context_settings=CONTEXT_SETTINGS)
@click.option('--dereference/--no-dereference', 'follow_symlinks',
default=True,
help='follow (or not) symlinks for OBJECTS passed as arguments '
+ '(default: follow)')
@click.option('--filename/--no-filename', 'show_filename', default=True,
help='show/hide file name (default: show)')
@click.option('--type', '-t', 'obj_type', default='auto',
type=click.Choice(['auto', 'content', 'directory', 'origin',
'snapshot']),
help='type of object to identify (default: auto)')
@click.option('--verify', '-v', metavar='PID', type=PidParamType(),
help='reference identifier to be compared with computed one')
@click.argument('objects', nargs=-1)
def identify(obj_type, verify, show_filename, follow_symlinks, objects):
"""Compute the Software Heritage persistent identifier (PID) for the given
return swhid
@cli_command(context_settings=CONTEXT_SETTINGS)
@click.option(
"--dereference/--no-dereference",
"follow_symlinks",
default=True,
help="follow (or not) symlinks for OBJECTS passed as arguments "
+ "(default: follow)",
)
@click.option(
"--filename/--no-filename",
"show_filename",
default=True,
help="show/hide file name (default: show)",
)
@click.option(
"--type",
"-t",
"obj_type",
default="auto",
type=click.Choice(["auto", "content", "directory", "origin", "snapshot"]),
help="type of object to identify (default: auto)",
)
@click.option(
"--exclude",
"-x",
"exclude_patterns",
metavar="PATTERN",
multiple=True,
help="Exclude directories using glob patterns \
(e.g., ``*.git`` to exclude all .git directories)",
)
@click.option(
"--verify",
"-v",
metavar="SWHID",
type=CoreSWHIDParamType(),
help="reference identifier to be compared with computed one",
)
@click.option(
"-r",
"--recursive",
is_flag=True,
help="compute SWHID recursively",
)
@click.argument("objects", nargs=-1, required=True)
def identify(
obj_type,
verify,
show_filename,
follow_symlinks,
objects,
exclude_patterns,
recursive,
):
"""Compute the Software Heritage persistent identifier (SWHID) for the given
source code object(s).
For more details about Software Heritage PIDs see:
For more details about SWHIDs see:
\b
https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
\b
Examples:
Tip: you can pass "-" to identify the content of standard input.
Examples::
\b
$ swh identify fork.c kmod.c sched/deadline.c
swh:1:cnt:2e391c754ae730bd2d8520c2ab497c403220c6e3 fork.c
swh:1:cnt:0277d1216f80ae1adeed84a686ed34c9b2931fc2 kmod.c
swh:1:cnt:57b939c81bce5d06fa587df8915f05affbe22b82 sched/deadline.c
\b
$ swh identify --no-filename /usr/src/linux/kernel/
swh:1:dir:f9f858a48d663b3809c9e2f336412717496202ab
\b
$ git clone --mirror https://forge.softwareheritage.org/source/helloworld.git
$ swh identify --type snapshot helloworld.git/
swh:1:snp:510aa88bdc517345d258c1fc2babcd0e1f905e93 helloworld.git
""" # NoQA # overlong lines in shell examples are fine
if not objects:
objects = ['-']
"""
if verify and len(objects) != 1:
raise click.BadParameter('verification requires a single object')
from functools import partial
import logging
results = map(partial(identify_object, obj_type, follow_symlinks), objects)
if exclude_patterns:
exclude_patterns = set(pattern.encode() for pattern in exclude_patterns)
if verify:
pid = next(results)[1]
if verify == pid:
click.echo('PID match: %s' % pid)
sys.exit(0)
else:
click.echo('PID mismatch: %s != %s' % (verify, pid))
sys.exit(1)
else:
for (obj, pid) in results:
msg = pid
if show_filename:
msg = '%s\t%s' % (pid, os.fsdecode(obj))
if verify and len(objects) != 1:
raise click.BadParameter("verification requires a single object")
if recursive and not os.path.isdir(objects[0]):
recursive = False
logging.warning("recursive option disabled, input is not a directory object")
if recursive:
if verify:
raise click.BadParameter(
"verification of recursive object identification is not supported"
)
if obj_type not in ("auto", "directory"):
raise click.BadParameter(
"recursive identification is supported only for directories"
)
path = os.fsencode(objects[0])
dir_obj = model_of_dir(path, exclude_patterns)
for sub_obj in dir_obj.iter_tree():
path_name = "path" if "path" in sub_obj.data.keys() else "data"
path = os.fsdecode(sub_obj.data[path_name])
swhid = str(sub_obj.swhid())
msg = f"{swhid}\t{path}" if show_filename else f"{swhid}"
click.echo(msg)
else:
results = zip(
objects,
map(
partial(identify_object, obj_type, follow_symlinks, exclude_patterns),
objects,
),
)
if verify:
swhid = next(results)[1]
if str(verify) == swhid:
click.echo("SWHID match: %s" % swhid)
sys.exit(0)
else:
click.echo("SWHID mismatch: %s != %s" % (verify, swhid))
sys.exit(1)
else:
for obj, swhid in results:
msg = swhid
if show_filename:
msg = "%s\t%s" % (swhid, os.fsdecode(obj))
click.echo(msg)
if __name__ == '__main__':
if __name__ == "__main__":
identify()
# Copyright (C) 2020-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Utility data structures."""

from __future__ import annotations
from collections.abc import Mapping
import copy
from typing import Dict, Generic, Iterable, Optional, Tuple, TypeVar, Union
KT = TypeVar("KT")
VT = TypeVar("VT")
class ImmutableDict(Mapping, Generic[KT, VT]):
"""A frozen dictionary.
This class behaves like a dictionary, but internally stores objects in a tuple,
so it is both immutable and hashable."""
_data: Dict[KT, VT]
def __init__(
self,
data: Union[Iterable[Tuple[KT, VT]], ImmutableDict[KT, VT], Dict[KT, VT]] = {},
):
if isinstance(data, dict):
self._data = data
elif isinstance(data, ImmutableDict):
self._data = data._data
else:
self._data = {k: v for k, v in data}
@property
def data(self):
return tuple(self._data.items())
def __repr__(self):
return f"ImmutableDict({dict(self.data)!r})"
def __getitem__(self, key):
return self._data[key]
def __iter__(self):
for k, v in self.data:
yield k
def __len__(self):
return len(self._data)
def items(self):
yield from self.data
def __hash__(self):
return hash(tuple(sorted(self.data)))
def copy_pop(self, popped_key) -> Tuple[Optional[VT], ImmutableDict[KT, VT]]:
"""Returns a copy of this ImmutableDict without the given key,
as well as the value associated to the key."""
new_items = copy.deepcopy(self._data)
popped_value: Optional[VT] = new_items.pop(popped_key, None)
return (popped_value, ImmutableDict(new_items))
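# Illustrative usage (not part of the diff) of ImmutableDict:
d = ImmutableDict({"a": 1, "b": 2})
assert d["a"] == 1 and len(d) == 2
# hashing sorts the items, so insertion order does not matter:
assert hash(d) == hash(ImmutableDict((("b", 2), ("a", 1))))
# copy_pop returns the popped value plus a new ImmutableDict without the key:
value, rest = d.copy_pop("a")
assert value == 1 and "a" not in rest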
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Primitives for finding unknown content efficiently."""
from __future__ import annotations
from collections import namedtuple
import itertools
import logging
import random
from typing import (
Any,
Callable,
Iterable,
List,
Mapping,
NamedTuple,
Optional,
Set,
Union,
)
from typing_extensions import Protocol, runtime_checkable
from .from_disk import model
from .model import Sha1Git
logger = logging.getLogger(__name__)
# Maximum number of entries to sample from the undecided set of directory entries
SAMPLE_SIZE = 1000
# Sets of sha1 of contents, skipped contents and directories respectively
Sample: NamedTuple = namedtuple(
"Sample", ["contents", "skipped_contents", "directories"]
)
@runtime_checkable
class ArchiveDiscoveryInterface(Protocol):
"""Interface used in discovery code to abstract over ways of connecting to
the SWH archive (direct storage, web API, etc.) for all methods needed by
discovery algorithms."""
contents: List[model.Content]
skipped_contents: List[model.SkippedContent]
directories: List[model.Directory]
def __init__(
self,
contents: List[model.Content],
skipped_contents: List[model.SkippedContent],
directories: List[model.Directory],
) -> None:
self.contents = contents
self.skipped_contents = skipped_contents
self.directories = directories
def content_missing(self, contents: List[Sha1Git]) -> Iterable[Sha1Git]:
"""List content missing from the archive by sha1"""
def skipped_content_missing(
self, skipped_contents: List[Sha1Git]
) -> Iterable[Sha1Git]:
"""List skipped content missing from the archive by sha1"""
def directory_missing(self, directories: List[Sha1Git]) -> Iterable[Sha1Git]:
"""List directories missing from the archive by sha1"""
class BaseDiscoveryGraph:
"""Creates the base structures and methods needed for discovery algorithms.
Subclasses should override ``get_sample`` to affect how the discovery is made.
The `update_info_callback` is an optional argument that will get called for
each new piece of information we get. The callback arguments are `(content,
known)`.
- content: the relevant model.Content object,
- known: a boolean, True if the file is known to the archive, False otherwise.
"""
def __init__(
self,
contents,
skipped_contents,
directories,
update_info_callback: Optional[Callable[[Any, bool], None]] = None,
):
self._all_contents: Mapping[
Sha1Git, Union[model.Content, model.SkippedContent]
] = {}
self._undecided_directories: Set[Sha1Git] = set()
self._children: Mapping[Sha1Git, Set[Sha1Git]] = {}
self._parents: Mapping[model.DirectoryEntry, Set[Any]] = {}
self.undecided: Set[Sha1Git] = set()
for content in itertools.chain(contents, skipped_contents):
self.undecided.add(content.sha1_git)
self._all_contents[content.sha1_git] = content
for directory in directories:
self.undecided.add(directory.id)
self._undecided_directories.add(directory.id)
self._children[directory.id] = {c.target for c in directory.entries}
for child in directory.entries:
self._parents.setdefault(child.target, set()).add(directory.id)
self.undecided |= self._undecided_directories
self.known: Set[Sha1Git] = set()
self.unknown: Set[Sha1Git] = set()
self._update_info_callback = update_info_callback
self._sha1_to_obj = {}
for content in itertools.chain(contents, skipped_contents):
self._sha1_to_obj[content.sha1_git] = content
for directory in directories:
self._sha1_to_obj[directory.id] = directory
def mark_known(self, entries: Iterable[Sha1Git]):
"""Mark ``entries`` and those they imply as known in the SWH archive"""
self._mark_entries(entries, self._children, self.known)
def mark_unknown(self, entries: Iterable[Sha1Git]):
"""Mark ``entries`` and those they imply as unknown in the SWH archive"""
self._mark_entries(entries, self._parents, self.unknown)
def _mark_entries(
self,
entries: Iterable[Sha1Git],
transitive_mapping: Mapping[Any, Any],
target_set: Set[Any],
):
"""Use Merkle graph properties to mark a directory entry as known or unknown.
If an entry is known, then all of its descendants are known. If it's
unknown, then all of its ancestors are unknown.
- ``entries``: directory entries to mark along with their ancestors/descendants
where applicable.
- ``transitive_mapping``: mapping from an entry to the next entries to mark
in the hierarchy, if any.
- ``target_set``: set where marked entries will be added.
"""
callback = self._update_info_callback
to_process = set(entries)
while to_process:
current = to_process.pop()
target_set.add(current)
new = current in self.undecided
self.undecided.discard(current)
self._undecided_directories.discard(current)
next_entries = transitive_mapping.get(current, set()) & self.undecided
to_process.update(next_entries)
if new and callback is not None:
obj = self._sha1_to_obj[current]
callback(obj, current in self.known)
def get_sample(
self,
) -> Sample:
"""Return a three-tuple of samples from the undecided sets of contents,
skipped contents and directories respectively.
These samples will be queried against the storage which will tell us
which are known."""
raise NotImplementedError()
def do_query(self, archive: ArchiveDiscoveryInterface, sample: Sample) -> None:
"""Given a three-tuple of samples, ask the archive which are known or
unknown and mark them as such."""
methods = (
archive.content_missing,
archive.skipped_content_missing,
archive.directory_missing,
)
for sample_per_type, method in zip(sample, methods):
if not sample_per_type:
continue
known = set(sample_per_type)
unknown = set(method(list(sample_per_type)))
known -= unknown
self.mark_known(known)
self.mark_unknown(unknown)
class RandomDirSamplingDiscoveryGraph(BaseDiscoveryGraph):
"""Use a random sampling using only directories.
This allows us to find a statistically good spread of entries in the graph
with a smaller population than using all types of entries. When no
directories remain, only contents or skipped contents (if any) are left
undecided; we send them directly to the storage since they should be few
and their structure flat."""
def get_sample(self) -> Sample:
if self._undecided_directories:
if len(self._undecided_directories) <= SAMPLE_SIZE:
return Sample(
contents=set(),
skipped_contents=set(),
directories=set(self._undecided_directories),
)
sample = random.sample(tuple(self._undecided_directories), SAMPLE_SIZE)
directories = {o for o in sample}
return Sample(
contents=set(), skipped_contents=set(), directories=directories
)
contents = set()
skipped_contents = set()
for sha1 in self.undecided:
obj = self._all_contents[sha1]
obj_type = obj.object_type
if obj_type == model.Content.object_type:
contents.add(sha1)
elif obj_type == model.SkippedContent.object_type:
skipped_contents.add(sha1)
else:
raise TypeError(f"Unexpected object type {obj_type}")
return Sample(
contents=contents, skipped_contents=skipped_contents, directories=set()
)
def filter_known_objects(
archive: ArchiveDiscoveryInterface,
update_info_callback: Optional[Callable[[Any, bool], None]] = None,
):
"""Filter ``archive``'s ``contents``, ``skipped_contents`` and ``directories``
to only return those that are unknown to the SWH archive using a discovery
algorithm.
The `update_info_callback` is an optional argument that will get called for
each new piece of information we get. The callback arguments are `(content,
known)`.
- content: the relevant model.Content object,
- known: a boolean, True if the file is known to the archive, False otherwise.
"""
contents = archive.contents
skipped_contents = archive.skipped_contents
directories = archive.directories
contents_count = len(contents)
skipped_contents_count = len(skipped_contents)
directories_count = len(directories)
graph = RandomDirSamplingDiscoveryGraph(
contents,
skipped_contents,
directories,
update_info_callback=update_info_callback,
)
while graph.undecided:
sample = graph.get_sample()
graph.do_query(archive, sample)
contents = [c for c in contents if c.sha1_git in graph.unknown]
skipped_contents = [c for c in skipped_contents if c.sha1_git in graph.unknown]
directories = [c for c in directories if c.id in graph.unknown]
logger.debug(
"Filtered out %d contents, %d skipped contents and %d directories",
contents_count - len(contents),
skipped_contents_count - len(skipped_contents),
directories_count - len(directories),
)
return (contents, skipped_contents, directories)
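# Illustrative end-to-end sketch (not part of the diff), combining the
# discovery API with the FakeArchive stub above; the path is hypothetical.
from swh.model.from_disk import Directory, iter_directory

contents, skipped_contents, directories = iter_directory(
    Directory.from_disk(path=b"/tmp/project")
)
archive = FakeArchive(contents, skipped_contents, directories, known=set())
# Against an empty archive, everything comes back as unknown:
assert filter_known_objects(archive) == (contents, skipped_contents, directories)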
@@ -33,11 +33,12 @@
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
NON_FIELD_ERRORS = '__all__'
NON_FIELD_ERRORS = "__all__"
class ValidationError(Exception):
"""An error while validating data."""
def __init__(self, message, code=None, params=None):
"""
The `message` argument can be a single error, a list of errors, or a
@@ -54,16 +55,15 @@ class ValidationError(Exception):
message = message[0]
if isinstance(message, ValidationError):
if hasattr(message, 'error_dict'):
if hasattr(message, "error_dict"):
message = message.error_dict
# PY2 has a `message` property which is always there so we can't
# duck-type on it. It was introduced in Python 2.5 and already
# deprecated in Python 2.6.
elif not hasattr(message, 'message'):
elif not hasattr(message, "message"):
message = message.error_list
else:
message, code, params = (message.message, message.code,
message.params)
message, code, params = (message.message, message.code, message.params)
if isinstance(message, dict):
self.error_dict = {}
@@ -78,9 +78,8 @@ class ValidationError(Exception):
# Normalize plain strings to instances of ValidationError.
if not isinstance(message, ValidationError):
message = ValidationError(message)
if hasattr(message, 'error_dict'):
self.error_list.extend(sum(message.error_dict.values(),
[]))
if hasattr(message, "error_dict"):
self.error_list.extend(sum(message.error_dict.values(), []))
else:
self.error_list.extend(message.error_list)
@@ -94,18 +93,18 @@ class ValidationError(Exception):
def message_dict(self):
# Trigger an AttributeError if this ValidationError
# doesn't have an error_dict.
getattr(self, 'error_dict')
getattr(self, "error_dict")
return dict(self)
@property
def messages(self):
if hasattr(self, 'error_dict'):
if hasattr(self, "error_dict"):
return sum(dict(self).values(), [])
return list(self)
def update_error_dict(self, error_dict):
if hasattr(self, 'error_dict'):
if hasattr(self, "error_dict"):
for field, error_list in self.error_dict.items():
error_dict.setdefault(field, []).extend(error_list)
else:
@@ -113,7 +112,7 @@ class ValidationError(Exception):
return error_dict
def __iter__(self):
if hasattr(self, 'error_dict'):
if hasattr(self, "error_dict"):
for field, errors in self.error_dict.items():
yield field, list(ValidationError(errors))
else:
@@ -124,9 +123,13 @@ class ValidationError(Exception):
yield message
def __str__(self):
if hasattr(self, 'error_dict'):
if hasattr(self, "error_dict"):
return repr(dict(self))
return repr(list(self))
def __repr__(self):
return 'ValidationError(%s)' % self
return "ValidationError(%s)" % self
class InvalidDirectoryPath(Exception):
pass
@@ -6,8 +6,13 @@
# We do our imports here but we don't use them, so flake8 complains
# flake8: noqa
from .simple import (validate_type, validate_int, validate_str, validate_bytes,
validate_datetime, validate_enum)
from .hashes import (validate_sha1, validate_sha1_git, validate_sha256)
from .compound import (validate_against_schema, validate_all_keys,
validate_any_key)
from .compound import validate_against_schema, validate_all_keys, validate_any_key
from .hashes import validate_sha1, validate_sha1_git, validate_sha256
from .simple import (
validate_bytes,
validate_datetime,
validate_enum,
validate_int,
validate_str,
validate_type,
)
@@ -6,7 +6,7 @@
from collections import defaultdict
import itertools
from ..exceptions import ValidationError, NON_FIELD_ERRORS
from ..exceptions import NON_FIELD_ERRORS, ValidationError
def validate_against_schema(model, schema, value):
@@ -26,19 +26,19 @@ def validate_against_schema(model, schema, value):
if not isinstance(value, dict):
raise ValidationError(
'Unexpected type %(type)s for %(model)s, expected dict',
"Unexpected type %(type)s for %(model)s, expected dict",
params={
'model': model,
'type': value.__class__.__name__,
"model": model,
"type": value.__class__.__name__,
},
code='model-unexpected-type',
code="model-unexpected-type",
)
errors = defaultdict(list)
for key, (mandatory, validators) in itertools.chain(
((k, v) for k, v in schema.items() if k != NON_FIELD_ERRORS),
[(NON_FIELD_ERRORS, (False, schema.get(NON_FIELD_ERRORS, [])))]
[(NON_FIELD_ERRORS, (False, schema.get(NON_FIELD_ERRORS, [])))],
):
if not validators:
continue
@@ -54,9 +54,9 @@ def validate_against_schema(model, schema, value):
if mandatory:
errors[key].append(
ValidationError(
'Field %(field)s is mandatory',
params={'field': key},
code='model-field-mandatory',
"Field %(field)s is mandatory",
params={"field": key},
code="model-field-mandatory",
)
)
@@ -74,19 +74,21 @@ def validate_against_schema(model, schema, value):
else:
if not valid:
errdata = {
'validator': validator.__name__,
"validator": validator.__name__,
}
if key == NON_FIELD_ERRORS:
errmsg = 'Validation of model %(model)s failed in ' \
'%(validator)s'
errdata['model'] = model
errcode = 'model-validation-failed'
errmsg = (
"Validation of model %(model)s failed in " "%(validator)s"
)
errdata["model"] = model
errcode = "model-validation-failed"
else:
errmsg = 'Validation of field %(field)s failed in ' \
'%(validator)s'
errdata['field'] = key
errcode = 'field-validation-failed'
errmsg = (
"Validation of field %(field)s failed in " "%(validator)s"
)
errdata["field"] = key
errcode = "field-validation-failed"
errors[key].append(
ValidationError(errmsg, params=errdata, code=errcode)
@@ -102,11 +104,11 @@ def validate_all_keys(value, keys):
"""Validate that all the given keys are present in value"""
missing_keys = set(keys) - set(value)
if missing_keys:
missing_fields = ', '.join(sorted(missing_keys))
missing_fields = ", ".join(sorted(missing_keys))
raise ValidationError(
'Missing mandatory fields %(missing_fields)s',
params={'missing_fields': missing_fields},
code='missing-mandatory-field'
"Missing mandatory fields %(missing_fields)s",
params={"missing_fields": missing_fields},
code="missing-mandatory-field",
)
return True
@@ -116,11 +118,11 @@ def validate_any_key(value, keys):
"""Validate that any of the given keys is present in value"""
present_keys = set(keys) & set(value)
if not present_keys:
missing_fields = ', '.join(sorted(keys))
missing_fields = ", ".join(sorted(keys))
raise ValidationError(
'Must contain one of the alternative fields %(missing_fields)s',
params={'missing_fields': missing_fields},
code='missing-alternative-field',
"Must contain one of the alternative fields %(missing_fields)s",
params={"missing_fields": missing_fields},
code="missing-alternative-field",
)
return True
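# Illustrative check (not part of the diff); the "origin" schema below is
# hypothetical. Each schema value is a (mandatory, [validators]) pair.
from swh.model.fields import validate_against_schema, validate_str

schema = {"url": (True, [validate_str])}
assert validate_against_schema("origin", schema, {"url": "https://example.org"})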
@@ -4,6 +4,7 @@
# See top-level LICENSE file for more information
import string
from ..exceptions import ValidationError
@@ -22,22 +23,22 @@ def validate_hash(value, hash_type):
"""
hash_lengths = {
'sha1': 20,
'sha1_git': 20,
'sha256': 32,
"sha1": 20,
"sha1_git": 20,
"sha256": 32,
}
hex_digits = set(string.hexdigits)
if hash_type not in hash_lengths:
raise ValidationError(
'Unexpected hash type %(hash_type)s, expected one of'
' %(hash_types)s',
"Unexpected hash type %(hash_type)s, expected one of" " %(hash_types)s",
params={
'hash_type': hash_type,
'hash_types': ', '.join(sorted(hash_lengths)),
"hash_type": hash_type,
"hash_types": ", ".join(sorted(hash_lengths)),
},
code='unexpected-hash-type')
code="unexpected-hash-type",
)
if isinstance(value, str):
errors = []
@@ -48,10 +49,10 @@ def validate_hash(value, hash_type):
"Unexpected characters `%(unexpected_chars)s' for hash "
"type %(hash_type)s",
params={
'unexpected_chars': ', '.join(sorted(extra_chars)),
'hash_type': hash_type,
"unexpected_chars": ", ".join(sorted(extra_chars)),
"hash_type": hash_type,
},
code='unexpected-hash-contents',
code="unexpected-hash-contents",
)
)
@@ -60,14 +61,14 @@ def validate_hash(value, hash_type):
if length != expected_length:
errors.append(
ValidationError(
'Unexpected length %(length)d for hash type '
'%(hash_type)s, expected %(expected_length)d',
"Unexpected length %(length)d for hash type "
"%(hash_type)s, expected %(expected_length)d",
params={
'length': length,
'expected_length': expected_length,
'hash_type': hash_type,
"length": length,
"expected_length": expected_length,
"hash_type": hash_type,
},
code='unexpected-hash-length',
code="unexpected-hash-length",
)
)
@@ -81,37 +82,37 @@ def validate_hash(value, hash_type):
expected_length = hash_lengths[hash_type]
if length != expected_length:
raise ValidationError(
'Unexpected length %(length)d for hash type '
'%(hash_type)s, expected %(expected_length)d',
"Unexpected length %(length)d for hash type "
"%(hash_type)s, expected %(expected_length)d",
params={
'length': length,
'expected_length': expected_length,
'hash_type': hash_type,
"length": length,
"expected_length": expected_length,
"hash_type": hash_type,
},
code='unexpected-hash-length',
code="unexpected-hash-length",
)
return True
raise ValidationError(
'Unexpected type %(type)s for hash, expected str or bytes',
"Unexpected type %(type)s for hash, expected str or bytes",
params={
'type': value.__class__.__name__,
"type": value.__class__.__name__,
},
code='unexpected-hash-value-type',
code="unexpected-hash-value-type",
)
def validate_sha1(sha1):
"""Validate that sha1 is a valid sha1 hash"""
return validate_hash(sha1, 'sha1')
return validate_hash(sha1, "sha1")
def validate_sha1_git(sha1_git):
"""Validate that sha1_git is a valid sha1_git hash"""
return validate_hash(sha1_git, 'sha1_git')
return validate_hash(sha1_git, "sha1_git")
def validate_sha256(sha256):
"""Validate that sha256 is a valid sha256 hash"""
return validate_hash(sha256, 'sha256')
return validate_hash(sha256, "sha256")
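# Illustrative checks (not part of the diff) of the hash validators:
from swh.model.exceptions import ValidationError
from swh.model.fields import validate_sha1

assert validate_sha1("da39a3ee5e6b4b0d3255bfef95601890afd80709")  # 40 hex chars
assert validate_sha1(b"\x00" * 20)  # or 20 raw bytes
try:
    validate_sha1("deadbeef")  # wrong length
except ValidationError as e:
    print(e.messages)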
@@ -13,16 +13,16 @@ def validate_type(value, type):
"""Validate that value is an integer"""
if not isinstance(value, type):
if isinstance(type, tuple):
typestr = 'one of %s' % ', '.join(typ.__name__ for typ in type)
typestr = "one of %s" % ", ".join(typ.__name__ for typ in type)
else:
typestr = type.__name__
raise ValidationError(
'Unexpected type %(type)s, expected %(expected_type)s',
"Unexpected type %(type)s, expected %(expected_type)s",
params={
'type': value.__class__.__name__,
'expected_type': typestr,
"type": value.__class__.__name__,
"expected_type": typestr,
},
code='unexpected-type'
code="unexpected-type",
)
return True
@@ -54,10 +54,12 @@ def validate_datetime(value):
errors.append(e)
if isinstance(value, datetime.datetime) and value.tzinfo is None:
errors.append(ValidationError(
'Datetimes must be timezone-aware in swh',
code='datetime-without-tzinfo',
))
errors.append(
ValidationError(
"Datetimes must be timezone-aware in swh",
code="datetime-without-tzinfo",
)
)
if errors:
raise ValidationError(errors)
@@ -69,12 +71,12 @@ def validate_enum(value, expected_values):
"""Validate that value is contained in expected_values"""
if value not in expected_values:
raise ValidationError(
'Unexpected value %(value)s, expected one of %(expected_values)s',
"Unexpected value %(value)s, expected one of %(expected_values)s",
params={
'value': value,
'expected_values': ', '.join(sorted(expected_values)),
"value": value,
"expected_values": ", ".join(sorted(expected_values)),
},
code='unexpected-value',
code="unexpected-value",
)
return True
# Copyright (C) 2017-2018 The Software Heritage developers
# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Conversion from filesystem tree to SWH objects.
This module allows reading a tree of directories and files from a local
filesystem, and converting them to in-memory data structures, which can then
be exported to SWH data model objects, as defined in :mod:`swh.model.model`.
"""
import enum
import fnmatch
import glob
import os
import re
import stat
from typing import (
Any,
Callable,
Dict,
Iterable,
Iterator,
List,
Optional,
Pattern,
Tuple,
Union,
cast,
)
import warnings
from typing import List
import attr
from deprecated import deprecated
from typing_extensions import Final
from .hashutil import MultiHash, HASH_BLOCK_SIZE
from . import model
from .exceptions import InvalidDirectoryPath
from .git_objects import directory_entry_sort_key
from .hashutil import MultiHash, hash_to_hex
from .merkle import MerkleLeaf, MerkleNode
from .identifiers import (
directory_identifier,
identifier_to_bytes as id_to_bytes,
identifier_to_str as id_to_str,
)
from .swhids import CoreSWHID
from .swhids import ObjectType as SWHIDType
class FromDiskType(model._StringCompatibleEnum):
"""Possible object types for "from disk" object."""
CONTENT = "content"
DIRECTORY = "directory"
def __eq__(self, other):
# stay compatible with legacy string comparison (for now)
if isinstance(other, str):
# note: we should issue deprecation warning at some point
return self.value == other
return super().__eq__(other)
def __str__(self):
# preserve interpolation property (for now)
return self.value
def __hash__(self):
# make sure we don't confuse dictionary key matching (for now)
return hash(str(self.value))
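# Illustrative checks (not part of the diff) of the compatibility shims above:
assert FromDiskType.CONTENT == "content"              # legacy string comparison
assert str(FromDiskType.DIRECTORY) == "directory"     # string interpolation
assert hash(FromDiskType.CONTENT) == hash("content")  # dict-key compatibility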
# A handful of other modules test for DiskBackedContent.object_type in
# conjunction with Content.object_type; give them a hand to let them handle
# compatibility in a smoother way.
#
# Remove this compatibility trick once these users have been migrated.
@deprecated(version="v6.13.0", reason="Use model.Content.object_type instead")
def DiskBackedContent(*args, **kwargs):
return model.Content(*args, **kwargs)
@attr.s(frozen=True, slots=True)
class DiskBackedData:
path = attr.ib(type=bytes)
def __call__(self) -> bytes:
with open(self.path, "rb") as fd:
return fd.read()
class DentryPerms(enum.IntEnum):
"""Admissible permissions for directory entries."""
content = 0o100644
"""Content"""
executable_content = 0o100755
@@ -68,8 +136,9 @@ class Content(MerkleLeaf):
computation.
"""
__slots__ = [] # type: List[str]
type = 'content'
__slots__: List[str] = []
object_type: Final = FromDiskType.CONTENT
@classmethod
def from_bytes(cls, *, mode, data):
@@ -80,19 +149,22 @@ class Content(MerkleLeaf):
data (bytes): raw contents of the file
"""
ret = MultiHash.from_data(data).digest()
ret['length'] = len(data)
ret['perms'] = mode_to_perms(mode)
ret['data'] = data
ret["length"] = len(data)
ret["perms"] = mode_to_perms(mode)
ret["data"] = data
ret["status"] = "visible"
return cls(ret)
@classmethod
def from_symlink(cls, *, path, mode):
"""Convert a symbolic link to a Software Heritage content entry"""
return cls.from_bytes(mode=mode, data=os.readlink(path))
content = cls.from_bytes(mode=mode, data=os.readlink(path))
content.data["path"] = path
return content
@classmethod
def from_file(cls, *, path, data=False, save_path=False):
def from_file(cls, *, path, max_content_length=None):
"""Compute the Software Heritage content entry corresponding to an
on-disk file.
@@ -101,56 +173,83 @@ class Content(MerkleLeaf):
- using the content as a directory entry in a directory
Args:
path (bytes): path to the file for which we're computing the
content entry
data (bool): add the file data to the entry
save_path (bool): add the file path to the entry
max_content_length (Optional[int]): if given, all contents larger
than this will be skipped.
"""
file_stat = os.lstat(path)
mode = file_stat.st_mode
length = file_stat.st_size
too_large = max_content_length is not None and length > max_content_length
if stat.S_ISLNK(mode):
# Symbolic link: return a file whose contents are the link target
if too_large:
# Unlike large contents, we can't stream symlinks to
# MultiHash, and we don't want to fit them in memory if
# they exceed max_content_length either.
# Thankfully, this should not happen for reasonable values of
# max_content_length because of OS/filesystem limitations,
# so let's just raise an error.
raise Exception(f"Symlink too large ({length} bytes)")
return cls.from_symlink(path=path, mode=mode)
elif not stat.S_ISREG(mode):
# not a regular file: return the empty file instead
return cls.from_bytes(mode=mode, data=b'')
return cls.from_bytes(mode=mode, data=b"")
length = file_stat.st_size
if not data:
ret = MultiHash.from_path(path).digest()
if too_large:
skip_reason = "Content too large"
else:
skip_reason = None
hashes = MultiHash.from_path(path).digest()
if skip_reason:
ret = {
**hashes,
"status": "absent",
"reason": skip_reason,
}
else:
h = MultiHash(length=length)
chunks = []
with open(path, 'rb') as fobj:
while True:
chunk = fobj.read(HASH_BLOCK_SIZE)
if not chunk:
break
h.update(chunk)
chunks.append(chunk)
ret = h.digest()
ret['data'] = b''.join(chunks)
if save_path:
ret['path'] = path
ret['perms'] = mode_to_perms(mode)
ret['length'] = length
ret = {
**hashes,
"status": "visible",
}
ret["path"] = path
ret["perms"] = mode_to_perms(mode)
ret["length"] = length
obj = cls(ret)
return obj
def swhid(self) -> CoreSWHID:
"""Return node identifier as a SWHID"""
return CoreSWHID(object_type=SWHIDType.CONTENT, object_id=self.hash)
def __repr__(self):
return 'Content(id=%s)' % id_to_str(self.hash)
return "Content(id=%s)" % hash_to_hex(self.hash)
def compute_hash(self):
return self.data['sha1_git']
def accept_all_directories(dirname, entries):
return self.data["sha1_git"]
def to_model(self) -> model.BaseContent:
"""Builds a `model.BaseContent` object based on this leaf."""
data = self.get_data().copy()
data.pop("perms", None)
path = data.pop("path", None)
if data["status"] == "absent":
return model.SkippedContent.from_dict(data)
elif "data" not in data:
data["get_data"] = DiskBackedData(path=path)
return model.Content.from_dict(data)
def accept_all_directories(
dirpath: bytes, dirname: bytes, entries: Optional[Iterable[Any]]
) -> bool:
"""Default filter for :func:`Directory.from_disk` accepting all
directories
@@ -158,10 +257,23 @@ def accept_all_directories(dirname, entries):
dirname (bytes): directory name
entries (list): directory entries
"""
warnings.warn(
"`accept_all_directories` is deprecated, use `accept_all_paths`",
DeprecationWarning,
)
return True
def accept_all_paths(
path: bytes, name: bytes, entries: Optional[Iterable[Any]]
) -> bool:
"""Default filter for :func:`Directory.from_disk` accepting all paths"""
return True
def ignore_empty_directories(dirname, entries):
def ignore_empty_directories(
dirpath: bytes, dirname: bytes, entries: Optional[Iterable[Any]]
) -> bool:
"""Filter for :func:`directory_to_objects` ignoring empty directories
Args:
@@ -170,6 +282,9 @@ def ignore_empty_directories(dirname, entries):
Returns:
True if the directory is not empty, false if the directory is empty
"""
if entries is None:
# Files are not ignored
return True
return bool(entries)
@@ -187,8 +302,16 @@ def ignore_named_directories(names, *, case_sensitive=True):
if not case_sensitive:
names = [name.lower() for name in names]
def named_filter(dirname, entries,
names=names, case_sensitive=case_sensitive):
def named_filter(
dirpath: str,
dirname: str,
entries: Iterable[Any],
names: Iterable[Any] = names,
case_sensitive: bool = case_sensitive,
):
if entries is None:
# Files are not ignored
return True
if case_sensitive:
return dirname not in names
else:
@@ -197,6 +320,102 @@ def ignore_named_directories(names, *, case_sensitive=True):
return named_filter
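# Illustrative usage (not part of the diff): ignore VCS metadata directories
# regardless of case (assuming the lowercased comparison in the elided branch).
vcs_filter = ignore_named_directories([b".git", b".hg"], case_sensitive=False)
assert vcs_filter(b"/repo", b"src", [b"main.py"])      # regular directory kept
assert not vcs_filter(b"/repo", b".GIT", [b"config"])  # ignored, case-insensitively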
# TODO: `extract_regex_objs` has been copied and adapted from `swh.scanner`.
# In the future `swh.scanner` should use the `swh.model` version and remove its own.
def extract_regex_objs(
root_path: bytes, patterns: Iterable[bytes]
) -> Iterator[Pattern[bytes]]:
"""Generates a regex object for each pattern given in input and checks if
the path is a subdirectory or relative to the root path.
Args:
root_path (bytes): path to the root directory
patterns (list of byte): shell patterns to match
Yields:
an SRE_Pattern object
"""
absolute_root_path = os.path.abspath(root_path)
for pattern in patterns:
if os.path.isabs(pattern):
pattern = os.path.relpath(pattern, root_path)
# python 3.10 has a `root_dir` argument for glob, but earlier versions do
# not, so we adjust the pattern instead
test_pattern = os.path.join(absolute_root_path, pattern)
for path in glob.glob(test_pattern):
if os.path.isabs(path) and not path.startswith(absolute_root_path):
error_msg = (
b'The path "' + path + b'" is not a subdirectory or relative '
b'to the root directory path: "' + root_path + b'"'
)
raise InvalidDirectoryPath(error_msg)
regex = fnmatch.translate((pattern.decode()))
yield re.compile(regex.encode())
def ignore_directories_patterns(root_path: bytes, patterns: Iterable[bytes]):
"""Filter for :func:`directory_to_objects` to ignore directories
matching certain patterns.
Args:
root_path (bytes): path of the root directory
patterns (list of bytes): patterns to ignore
Returns:
a directory filter for :func:`directory_to_objects`
"""
sre_patterns = set(extract_regex_objs(root_path, patterns))
def pattern_filter(
dirpath: bytes,
dirname: bytes,
entries: Iterable[Any],
patterns: Iterable[Any] = sre_patterns,
root_path: bytes = os.path.abspath(root_path),
):
full_path = os.path.abspath(os.path.join(dirpath, dirname))
relative_path = os.path.relpath(full_path, root_path)
return not any([pattern.match(relative_path) for pattern in patterns])
return pattern_filter
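# Illustrative usage (not part of the diff): build a from-disk tree while
# skipping directories that match shell patterns; the root path is hypothetical.
from swh.model.from_disk import Directory, ignore_directories_patterns

root = b"/tmp/project"
tree = Directory.from_disk(
    path=root,
    path_filter=ignore_directories_patterns(root, [b"*.git", b"*node_modules"]),
)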
def iter_directory(
directory: "Directory",
) -> Tuple[List[model.Content], List[model.SkippedContent], List[model.Directory]]:
"""Return the directory listing from a disk-memory directory instance.
Raises:
TypeError in case an unexpected object type is listed.
Returns:
Tuple of respectively iterable of content, skipped content and directories.
"""
contents: List[model.Content] = []
skipped_contents: List[model.SkippedContent] = []
directories: List[model.Directory] = []
for i_obj in directory.iter_tree():
if isinstance(i_obj, Directory):
directories.append(i_obj.to_model())
elif isinstance(i_obj, Content):
obj = i_obj.to_model()
if isinstance(obj, model.SkippedContent):
skipped_contents.append(obj)
else:
# FIXME: read the data from disk later (when the
# storage buffer is flushed).
#
c_obj = cast(model.Content, obj)
contents.append(c_obj.with_data())
else:
raise TypeError(f"Unexpected object type from disk: {obj}")
return contents, skipped_contents, directories
class Directory(MerkleNode):
"""Representation of a Software Heritage directory as a node in a Merkle Tree.
@@ -216,136 +435,259 @@ class Directory(MerkleNode):
the same method. This enables the efficient collection of updated nodes,
for instance when the client is applying diffs.
"""
__slots__ = ['__entries']
type = 'directory'
__slots__ = ["__entries", "__model_object"]
object_type: Final = FromDiskType.DIRECTORY
@classmethod
def from_disk(cls, *, path, data=False, save_path=False,
dir_filter=accept_all_directories):
def from_disk(
cls,
*,
path: bytes,
path_filter: Callable[
[bytes, bytes, Optional[List[bytes]]], bool
] = accept_all_paths,
max_content_length: Optional[int] = None,
progress_callback: Optional[Callable[[int], None]] = None,
) -> "Directory":
"""Compute the Software Heritage objects for a given directory tree
Args:
path (bytes): the directory to traverse
data (bool): whether to add the data to the content objects
save_path (bool): whether to add the path to the content objects
dir_filter (function): a filter to ignore some directories by
name or contents. Takes two arguments: dirname and entries, and
returns True if the directory should be added, False if the
directory should be ignored.
path_filter (function): a filter to ignore some paths.
Takes three arguments: `path`, `name` and `entries`.
`entries` is `None` for files, and a (possibly empty) list of names
for directories.
Returns True if the path should be added, False if the
path should be ignored.
max_content_length (Optional[int]): if given, all contents larger
than this will be skipped.
progress_callback (Optional function): if given, it is called with the
number of computed entries for each non-empty directory traversed.
"""
# The top path might have been specified with a trailing slash, which
# would confuse various code down the line.
#
# The bare '/' path must be left as is, however.
if 1 < len(path) and path[-1:] == b"/":
path = path[0:1] + path[1:].rstrip(b"/")
assert len(path) <= 1 or path[-1:] != b"/"
top_path = path
dirs = {}
top_path_prefix_size = len(top_path) + 1
dirs: Dict[bytes, Directory] = {}
dirs[top_path] = cls({"name": os.path.basename(top_path), "path": top_path})
filtered = []
to_visit = [path]
while to_visit:
root = to_visit.pop()
path, name = os.path.split(root)
with os.scandir(root) as it:
entries_list = list(it)
if root != top_path and not path_filter(
path, name, [entry.path for entry in entries_list]
):
# we should not traverse the current directory, so stop right now,
# but also mark it as removed (for later cleanup)
filtered.append(root)
continue
for root, dentries, fentries in os.walk(top_path, topdown=False):
entries = {}
# Join fentries and dentries in the same processing, as symbolic
# links to directories appear in dentries...
for name in fentries + dentries:
path = os.path.join(root, name)
if not os.path.isdir(path) or os.path.islink(path):
content = Content.from_file(path=path, data=data,
save_path=save_path)
entries[name] = content
for entry in entries_list:
if not entry.is_dir(follow_symlinks=False):
if not path_filter(root, entry.name, None):
continue
content = Content.from_file(
path=entry.path, max_content_length=max_content_length
)
entries[entry.name] = content
else:
if dir_filter(name, dirs[path].entries):
entries[name] = dirs[path]
dirs[root] = cls({'name': os.path.basename(root)})
entries[entry.name] = cls({"name": entry.name, "path": entry.path})
dirs[entry.path] = entries[entry.name]
to_visit.append(entry.path)
dirs[root].update(entries)
return dirs[top_path]
if progress_callback is not None:
if len(entries) > 0:
progress_callback(len(entries))
top_dir = dirs[top_path]
for path in reversed(filtered):
path = path[top_path_prefix_size:]
del top_dir[path]
# a bit sad but now we have to traverse the gathered tree structure to
# filter it again (e.g. for the ignore_empty_directory filter to work
# recursively)
todo: List[Tuple[bytes, Directory]] = [(b"", top_dir)]
traversal = []
while todo:
cpath, cdir = todo.pop(0)
traversal.append(cpath)
for dirname, subdir in cdir.items():
if subdir.object_type == FromDiskType.DIRECTORY:
spath = cpath + b"/" + dirname
todo.append((spath, subdir))
for dirpath in reversed(traversal):
node = top_dir[dirpath]
assert node.object_type == FromDiskType.DIRECTORY
path, name = os.path.split(dirpath)
if dirpath and not path_filter(path, name, list(node.keys())):
# should be filtered
del top_dir[dirpath]
top_dir.update_hash(force=True)
return top_dir
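# Illustrative usage (not part of the diff) of the new keyword arguments; the
# path, the size cap and the callback are hypothetical.
def on_progress(n_entries: int) -> None:
    print(f"hashed {n_entries} directory entries")

tree = Directory.from_disk(
    path=b"/tmp/project",
    max_content_length=10 * 1024 * 1024,  # record larger contents as skipped
    progress_callback=on_progress,
)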
def __init__(self, data=None):
super().__init__(data=data)
self.__entries = None
self.__model_object = None
# note: this override could probably be done by parametrizing the
# MerkleNode type, but that is a much bigger rework than the series
# introducing this change.
def iter_tree(self, dedup=True) -> Iterator[Union["Directory", "Content"]]:
"""Yields all children nodes, recursively. Common nodes are deduplicated
by default (deduplication can be turned off by setting the given argument
'dedup' to False).
"""
tree = super().iter_tree(dedup=dedup)
yield from cast(Iterator[Union["Directory", "Content"]], tree)
def invalidate_hash(self):
self.__entries = None
self.__model_object = None
super().invalidate_hash()
@staticmethod
def child_to_directory_entry(name, child):
if isinstance(child, Directory):
if child.object_type == FromDiskType.DIRECTORY:
return {
'type': 'dir',
'perms': DentryPerms.directory,
'target': child.hash,
'name': name,
"type": "dir",
"perms": DentryPerms.directory,
"target": child.hash,
"name": name,
}
elif isinstance(child, Content):
elif child.object_type == FromDiskType.CONTENT:
return {
'type': 'file',
'perms': child.data['perms'],
'target': child.hash,
'name': name,
"type": "file",
"perms": child.data["perms"],
"target": child.hash,
"name": name,
}
else:
raise ValueError('unknown child')
raise ValueError(f"unknown child {child}")
def get_data(self, **kwargs):
return {
'id': self.hash,
'entries': self.entries,
"id": self.hash,
"entries": self.entries,
}
@property
def entries(self):
"""Child nodes, sorted by name in the same way
:func:`swh.model.git_objects.directory_git_object` does."""
if self.__entries is None:
self.__entries = [
self.child_to_directory_entry(name, child)
for name, child in self.items()
]
self.__entries = sorted(
(
self.child_to_directory_entry(name, child)
for name, child in self.items()
),
key=directory_entry_sort_key,
)
return self.__entries
def swhid(self) -> CoreSWHID:
"""Return node identifier as a SWHID"""
return CoreSWHID(object_type=SWHIDType.DIRECTORY, object_id=self.hash)
def compute_hash(self):
return id_to_bytes(directory_identifier({'entries': self.entries}))
return self.to_model().id
def to_model(self) -> model.Directory:
"""Builds a `model.Directory` object based on this node;
ignoring its children."""
if self.__model_object is None:
DirectoryEntry = model.DirectoryEntry
entries = []
for name, child in self.items():
if child.object_type == FromDiskType.DIRECTORY:
e = DirectoryEntry(
type="dir",
perms=DentryPerms.directory,
target=child.hash,
name=name,
)
elif child.object_type == FromDiskType.CONTENT:
e = DirectoryEntry(
type="file",
perms=child.data["perms"],
target=child.hash,
name=name,
)
else:
raise ValueError(f"unknown child {child}")
entries.append(e)
entries.sort(key=directory_entry_sort_key)
self.__model_object = model.Directory(entries=tuple(entries))
return self.__model_object
def __getitem__(self, key):
if not isinstance(key, bytes):
raise ValueError('Can only get a bytes from Directory')
raise ValueError("Can only get a bytes from Directory")
# Convenience shortcut
if key == b'':
if key == b"":
return self
if b'/' not in key:
if b"/" not in key:
return super().__getitem__(key)
else:
key1, key2 = key.split(b'/', 1)
key1, key2 = key.split(b"/", 1)
return self.__getitem__(key1)[key2]
def __setitem__(self, key, value):
if not isinstance(key, bytes):
raise ValueError('Can only set a bytes Directory entry')
raise ValueError("Can only set a bytes Directory entry")
if not isinstance(value, (Content, Directory)):
raise ValueError('Can only set a Directory entry to a Content or '
'Directory')
raise ValueError(
"Can only set a Directory entry to a Content or " "Directory"
)
if key == b'':
raise ValueError('Directory entry must have a name')
if b'\x00' in key:
raise ValueError('Directory entry name must not contain nul bytes')
if key == b"":
raise ValueError("Directory entry must have a name")
if b"\x00" in key:
raise ValueError("Directory entry name must not contain nul bytes")
if b'/' not in key:
if b"/" not in key:
return super().__setitem__(key, value)
else:
key1, key2 = key.rsplit(b'/', 1)
key1, key2 = key.rsplit(b"/", 1)
self[key1].__setitem__(key2, value)
def __delitem__(self, key):
if not isinstance(key, bytes):
raise ValueError('Can only delete a bytes Directory entry')
raise ValueError("Can only delete a bytes Directory entry")
if b'/' not in key:
if b"/" not in key:
super().__delitem__(key)
else:
key1, key2 = key.rsplit(b'/', 1)
key1, key2 = key.rsplit(b"/", 1)
del self[key1][key2]
def __contains__(self, key):
if b"/" not in key:
return super().__contains__(key)
else:
key1, key2 = key.split(b"/", 1)
return super().__contains__(key1) and self[key1].__contains__(key2)
def __repr__(self):
return 'Directory(id=%s, entries=[%s])' % (
id_to_str(self.hash),
', '.join(str(entry) for entry in self),
return "Directory(id=%s, entries=[%s])" % (
hash_to_hex(self.hash),
", ".join(str(entry) for entry in self),
)
# Copyright (C) 2015-2019 The Software Heritage developers
# Copyright (C) 2015-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import binascii
import datetime
import hashlib
"""
Converts SWH model objects to git(-like) objects
from functools import lru_cache
from typing import Any, Dict, NamedTuple
Most of the functions in this module take as argument an object from
:mod:`swh.model.model`, and format it like a git object.
from .exceptions import ValidationError
from .fields.hashes import validate_sha1
from .hashutil import hash_git_data, hash_to_hex, MultiHash
They are the inverse functions of those in :mod:`swh.loader.git.converters`,
but with extensions, as SWH's model is a superset of Git's:
* extensions of existing types (e.g. revision/commit and release/tag dates
can be expressed with precision up to milliseconds, to support formatting
Mercurial objects)
* new types, for SWH's specific needs (:class:`swh.model.model.RawExtrinsicMetadata`
and :class:`swh.model.model.ExtID`)
* support for somewhat corrupted git objects that we need to reproduce
ORIGIN = 'origin'
SNAPSHOT = 'snapshot'
REVISION = 'revision'
RELEASE = 'release'
DIRECTORY = 'directory'
CONTENT = 'content'
This is used for two purposes:
PID_NAMESPACE = 'swh'
PID_VERSION = 1
PID_TYPES = ['ori', 'snp', 'rel', 'rev', 'dir', 'cnt']
PID_SEP = ':'
PID_CTXT_SEP = ';'
* Format manifests that can be hashed to produce :ref:`intrinsic identifiers
<persistent-identifiers>`
* Write git objects to reproduce git repositories that were ingested in the archive.
"""
@lru_cache()
def identifier_to_bytes(identifier):
"""Convert a text identifier to bytes.
from __future__ import annotations
import datetime
from functools import lru_cache
from typing import Dict, Iterable, List, Optional, Tuple, Union, cast
import warnings
from . import model
from .collections import ImmutableDict
from .hashutil import git_object_header, hash_to_bytehex
Args:
identifier: an identifier, either a 40-char hexadecimal string or a
bytes object of length 20
Returns:
The length 20 bytestring corresponding to the given identifier
Raises:
ValueError: if the identifier is of an unexpected type or length.
def content_git_object(content: model.Content) -> bytes:
"""Formats a content as a git blob.
A content's identifier is the blob sha1 à la git of its data.
"""
content = cast(model.Content, content)
if isinstance(identifier, bytes):
if len(identifier) != 20:
raise ValueError(
'Wrong length for bytes identifier %s, expected 20' %
len(identifier))
return identifier
if content.data is None:
raise model.MissingData("Content data is None, cannot format.")
if isinstance(identifier, str):
if len(identifier) != 40:
raise ValueError(
'Wrong length for str identifier %s, expected 40' %
len(identifier))
return bytes.fromhex(identifier)
return git_object_header("blob", len(content.data)) + content.data
raise ValueError('Wrong type for identifier %s, expected bytes or str' %
identifier.__class__.__name__)
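As a quick check of the blob format (sample data made up; `Content.from_data` is the model helper that computes the hashes):

```
from swh.model.model import Content

content = Content.from_data(b"hello\n")
assert content_git_object(content) == b"blob 6\x00hello\n"
# the sha1 of this manifest is content.sha1_git, the same ID `git hash-object`
# prints for a file with this data
```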
def directory_entry_sort_key(entry: model.DirectoryEntry):
"""The sorting key for tree entries"""
if isinstance(entry, dict):
type_ = entry["type"]
name = entry["name"]
else:
type_ = entry.type
name = entry.name
@lru_cache()
def identifier_to_str(identifier):
"""Convert an identifier to an hexadecimal string.
if type_ == "dir":
return name + b"/"
else:
return name
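For example (entries are made up), a directory named `foo` compares as `foo/`, so it sorts after a sibling file `foo.bar`, matching git's tree ordering:

```
entries = [
    {"type": "dir", "name": b"foo", "perms": 0o040000, "target": b"\x00" * 20},
    {"type": "file", "name": b"foo.bar", "perms": 0o100644, "target": b"\x00" * 20},
]
names = [e["name"] for e in sorted(entries, key=directory_entry_sort_key)]
assert names == [b"foo.bar", b"foo"]  # b"foo.bar" < b"foo/" since 0x2e < 0x2f
```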
Args:
identifier: an identifier, either a 40-char hexadecimal string or a
bytes object of length 20
Returns:
The length 40 string corresponding to the given identifier, hex encoded
@lru_cache()
def _perms_to_bytes(perms):
"""Convert the perms value to its canonical bytes representation"""
oc = oct(perms)[2:]
return oc.encode("ascii")
Raises:
ValueError: if the identifier is of an unexpected type or length.
"""
if isinstance(identifier, str):
if len(identifier) != 40:
raise ValueError(
'Wrong length for str identifier %s, expected 40' %
len(identifier))
return identifier
def escape_newlines(snippet):
"""Escape the newlines present in snippet according to git rules.
if isinstance(identifier, bytes):
if len(identifier) != 20:
raise ValueError(
'Wrong length for bytes identifier %s, expected 20' %
len(identifier))
return binascii.hexlify(identifier).decode()
New lines in git manifests are escaped by indenting the next line by one
space.
raise ValueError('Wrong type for identifier %s, expected bytes or str' %
identifier.__class__.__name__)
"""
if b"\n" in snippet:
return b"\n ".join(snippet.split(b"\n"))
else:
return snippet
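A couple of illustrative cases:

```
assert escape_newlines(b"summary\n\ndetails") == b"summary\n \n details"
assert escape_newlines(b"no newline") == b"no newline"
```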
def content_identifier(content):
"""Return the intrinsic identifier for a content.
A content's identifier is the sha1, sha1_git and sha256 checksums of its
data.
def format_date(date: model.Timestamp) -> bytes:
"""Convert a date object into an UTC timestamp encoded as ascii bytes.
Args:
content: a content conforming to the Software Heritage schema
Git stores timestamps as an integer number of seconds since the UNIX epoch.
Returns:
A dictionary with all the hashes for the data
However, Software Heritage stores timestamps as an integer number of
microseconds (postgres type "timestamp with time zone").
Raises:
KeyError: if the content doesn't have a data member.
Therefore, we print timestamps with no microseconds as integers, and
timestamps with microseconds as floating point values. We elide the
trailing zeroes from microsecond values, to "future-proof" our
representation if we ever need more precision in timestamps.
"""
if isinstance(date, dict):
# For backward compatibility
date = model.Timestamp.from_dict(date)
return MultiHash.from_data(content['data']).digest()
if not date.microseconds:
return str(date.seconds).encode()
else:
float_value = "%d.%06d" % (date.seconds, date.microseconds)
return float_value.rstrip("0").encode()
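For instance (timestamps made up):

```
from swh.model.model import Timestamp

assert format_date(Timestamp(seconds=1234567890, microseconds=0)) == b"1234567890"
# trailing zeroes of the fractional part are elided:
assert (
    format_date(Timestamp(seconds=1234567890, microseconds=250000))
    == b"1234567890.25"
)
```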
def _sort_key(entry):
"""The sorting key for tree entries"""
if entry['type'] == 'dir':
return entry['name'] + b'/'
else:
return entry['name']
def normalize_timestamp(time_representation):
"""Normalize a time representation for processing by Software Heritage
This function supports a numeric timestamp (representing a number of
seconds since the UNIX epoch, 1970-01-01 at 00:00 UTC), a
:obj:`datetime.datetime` object (with timezone information), or a
normalized Software Heritage time representation (idempotency).
@lru_cache()
def _perms_to_bytes(perms):
"""Convert the perms value to its bytes representation"""
oc = oct(perms)[2:]
return oc.encode('ascii')
Args:
time_representation: the representation of a timestamp
Returns:
dict: a normalized dictionary with three keys:
def escape_newlines(snippet):
"""Escape the newlines present in snippet according to git rules.
- timestamp: a dict with two optional keys:
New lines in git manifests are escaped by indenting the next line by one
space.
- seconds: the integral number of seconds since the UNIX epoch
- microseconds: the integral number of microseconds
- offset: the timezone offset as a number of minutes relative to
UTC
- negative_utc: a boolean representing whether the offset is -0000
when offset = 0.
"""
if b'\n' in snippet:
return b'\n '.join(snippet.split(b'\n'))
if time_representation is None:
return None
else:
return snippet
return model.TimestampWithTimezone.from_dict(time_representation).to_dict()
def directory_identifier(directory):
"""Return the intrinsic identifier for a directory.
def directory_git_object(directory: Union[Dict, model.Directory]) -> bytes:
"""Formats a directory as a git tree.
A directory's identifier is the tree sha1 à la git of a directory listing,
using the following algorithm, which is equivalent to the git algorithm for
@@ -179,229 +179,123 @@ def directory_identifier(directory):
(Note that there is no separator between entries)
"""
if isinstance(directory, dict):
# For backward compatibility
warnings.warn(
"directory_git_object's argument should be a swh.model.model.Directory "
"object.",
DeprecationWarning,
stacklevel=2,
)
directory = model.Directory.from_dict(directory)
directory = cast(model.Directory, directory)
components = []
for entry in sorted(directory['entries'], key=_sort_key):
components.extend([
_perms_to_bytes(entry['perms']),
b'\x20',
entry['name'],
b'\x00',
identifier_to_bytes(entry['target']),
])
return identifier_to_str(hash_git_data(b''.join(components), 'tree'))
def format_date(date):
"""Convert a date object into an UTC timestamp encoded as ascii bytes.
Git stores timestamps as an integer number of seconds since the UNIX epoch.
However, Software Heritage stores timestamps as an integer number of
microseconds (postgres type "timestamp with time zone").
Therefore, we print timestamps with no microseconds as integers, and
timestamps with microseconds as floating point values. We elide the
trailing zeroes from microsecond values, to "future-proof" our
representation if we ever need more precision in timestamps.
"""
if not isinstance(date, dict):
raise ValueError('format_date only supports dicts, %r received' % date)
seconds = date.get('seconds', 0)
microseconds = date.get('microseconds', 0)
if not microseconds:
return str(seconds).encode()
else:
float_value = ('%d.%06d' % (seconds, microseconds))
return float_value.rstrip('0').encode()
for entry in sorted(directory.entries, key=directory_entry_sort_key):
components.extend(
[
_perms_to_bytes(entry.perms),
b"\x20",
entry.name,
b"\x00",
entry.target,
]
)
@lru_cache()
def format_offset(offset, negative_utc=None):
"""Convert an integer number of minutes into an offset representation.
return format_git_object_from_parts("tree", components)
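A minimal sketch of the resulting tree manifest (entry name and target are placeholders); the directory's intrinsic identifier is the sha1 of this byte string:

```
import hashlib

from swh.model.model import Directory, DirectoryEntry

d = Directory(
    entries=(
        DirectoryEntry(name=b"README", type="file", perms=0o100644, target=b"\x01" * 20),
    )
)
manifest = directory_git_object(d)
assert manifest == b"tree 34\x00" b"100644 README\x00" + b"\x01" * 20
assert hashlib.sha1(manifest).digest() == d.id
```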
The offset representation is [+-]hhmm where:
- hh is the number of hours;
- mm is the number of minutes.
def format_git_object_from_headers(
git_type: str,
headers: Iterable[Tuple[bytes, bytes]],
message: Optional[bytes] = None,
) -> bytes:
"""Format a git_object comprised of a git header and a manifest,
which is itself a sequence of `headers`, and an optional `message`.
A null offset is represented as +0000.
"""
if offset < 0 or offset == 0 and negative_utc:
sign = '-'
else:
sign = '+'
The git_object format, compatible with the git format for tag and commit
objects, is as follows:
hours = abs(offset) // 60
minutes = abs(offset) % 60
- for each `key`, `value` in `headers`, emit:
t = '%s%02d%02d' % (sign, hours, minutes)
return t.encode()
- the `key`, literally
- an ascii space (``\\x20``)
- the `value`, with newlines escaped using :func:`escape_newlines`,
- an ascii newline (``\\x0a``)
- if the `message` is not None, emit:
def normalize_timestamp(time_representation):
"""Normalize a time representation for processing by Software Heritage
This function supports a numeric timestamp (representing a number of
seconds since the UNIX epoch, 1970-01-01 at 00:00 UTC), a
:obj:`datetime.datetime` object (with timezone information), or a
normalized Software Heritage time representation (idempotency).
- an ascii newline (``\\x0a``)
- the `message`, literally
Args:
time_representation: the representation of a timestamp
headers: a sequence of key/value headers stored in the manifest;
message: an optional message used to trail the manifest.
Returns:
dict: a normalized dictionary with three keys:
- timestamp: a dict with two optional keys:
- seconds: the integral number of seconds since the UNIX epoch
- microseconds: the integral number of microseconds
- offset: the timezone offset as a number of minutes relative to
UTC
- negative_utc: a boolean representing whether the offset is -0000
when offset = 0.
the formatted git_object as bytes
"""
entries: List[bytes] = []
if time_representation is None:
return None
for key, value in headers:
entries.extend((key, b" ", escape_newlines(value), b"\n"))
negative_utc = False
if message is not None:
entries.extend((b"\n", message))
if isinstance(time_representation, dict):
ts = time_representation['timestamp']
if isinstance(ts, dict):
seconds = ts.get('seconds', 0)
microseconds = ts.get('microseconds', 0)
elif isinstance(ts, int):
seconds = ts
microseconds = 0
else:
raise ValueError(
'normalize_timestamp received non-integer timestamp member:'
' %r' % ts)
offset = time_representation['offset']
if 'negative_utc' in time_representation:
negative_utc = time_representation['negative_utc']
elif isinstance(time_representation, datetime.datetime):
seconds = int(time_representation.timestamp())
microseconds = time_representation.microsecond
utcoffset = time_representation.utcoffset()
if utcoffset is None:
raise ValueError(
'normalize_timestamp received datetime without timezone: %s' %
time_representation)
# utcoffset is an integer number of minutes
seconds_offset = utcoffset.total_seconds()
offset = int(seconds_offset) // 60
elif isinstance(time_representation, int):
seconds = time_representation
microseconds = 0
offset = 0
else:
raise ValueError(
'normalize_timestamp received non-integer timestamp:'
' %r' % time_representation)
return format_git_object_from_parts(git_type, entries)
return {
'timestamp': {
'seconds': seconds,
'microseconds': microseconds,
},
'offset': offset,
'negative_utc': negative_utc,
}
def format_git_object_from_parts(git_type: str, parts: Iterable[bytes]) -> bytes:
"""Similar to :func:`format_git_object_from_headers`, but for manifests made of
a flat list of entries, instead of key-value + message, ie. trees and snapshots."""
concatenated_parts = b"".join(parts)
def format_author(author):
"""Format the specification of an author.
header = git_object_header(git_type, len(concatenated_parts))
return header + concatenated_parts
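Putting the two together, a sketch with made-up header values; the blank line separating headers from the message comes from the `message` branch above:

```
git_object = format_git_object_from_headers(
    "tag",
    [(b"object", b"0" * 40), (b"type", b"commit"), (b"tag", b"v1.0")],
    message=b"release v1.0\n",
)
assert git_object == (
    b"tag 83\x00"  # git header: type, payload length, NUL
    b"object " + b"0" * 40 + b"\n"
    b"type commit\n"
    b"tag v1.0\n"
    b"\n"
    b"release v1.0\n"
)
```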
An author is either a byte string (passed unchanged), or a dict with three
keys, fullname, name and email.
If the fullname exists, return it; if it doesn't, we construct a fullname
using the following heuristics: if the name value is None, we return the
email in angle brackets, else, we return the name, a space, and the email
in angle brackets.
def format_author_data(
author: model.Person, date_offset: Optional[model.TimestampWithTimezone]
) -> bytes:
"""Format authorship data according to git standards.
"""
if isinstance(author, bytes) or author is None:
return author
Git authorship data has two components:
if 'fullname' in author:
return author['fullname']
- an author specification, usually a name and email, but in practice an
arbitrary bytestring
- optionally, a timestamp with a UTC offset specification
ret = []
if author['name'] is not None:
ret.append(author['name'])
if author['email'] is not None:
ret.append(b''.join([b'<', author['email'], b'>']))
The authorship data is formatted thus::
return b' '.join(ret)
def format_author_line(header, author, date_offset):
"""Format a an author line according to git standards.
An author line has three components:
- a header, describing the type of author (author, committer, tagger)
- a name and email, which is an arbitrary bytestring
- optionally, a timestamp with UTC offset specification
The author line is formatted thus::
`header` `name and email`[ `timestamp` `utc_offset`]
`name and email`[ `timestamp` `utc_offset`]
The timestamp is encoded as a (decimal) number of seconds since the UNIX
epoch (1970-01-01 at 00:00 UTC). As an extension to the git format, we
support fractional timestamps, using a dot as the separator for the decimal
part.
The utc offset is a number of minutes encoded as '[+-]HHMM'. Note some
The utc offset is a number of minutes encoded as '[+-]HHMM'. Note that some
tools can pass a negative offset corresponding to the UTC timezone
('-0000'), which is valid and is encoded as such.
For convenience, this function returns the whole line with its trailing
newline.
Args:
header: the header of the author line (one of 'author', 'committer',
'tagger')
author: an author specification (dict with two bytes values: name and
email, or byte value)
date_offset: a normalized date/time representation as returned by
:func:`normalize_timestamp`.
Returns:
the newline-terminated byte string containing the author line
the byte string containing the authorship data
"""
ret = [header.encode(), b' ', escape_newlines(format_author(author))]
date_offset = normalize_timestamp(date_offset)
ret = [author.fullname]
if date_offset is not None:
date_f = format_date(date_offset['timestamp'])
offset_f = format_offset(date_offset['offset'],
date_offset['negative_utc'])
date_f = format_date(date_offset.timestamp)
ret.extend([b' ', date_f, b' ', offset_f])
ret.extend([b" ", date_f, b" ", date_offset.offset_bytes])
ret.append(b'\n')
return b''.join(ret)
return b"".join(ret)
def revision_identifier(revision):
"""Return the intrinsic identifier for a revision.
def revision_git_object(revision: Union[Dict, model.Revision]) -> bytes:
"""Formats a revision as a git tree.
The fields used for the revision identifier computation are:
@@ -411,7 +305,7 @@ def revision_identifier(revision):
- author_date
- committer
- committer_date
- metadata -> extra_headers
- extra_headers or metadata -> extra_headers
- message
A revision's identifier is the 'git'-checksum of a commit manifest
@@ -432,7 +326,7 @@ def revision_identifier(revision):
The directory identifier is the ascii representation of its hexadecimal
encoding.
Author and committer are formatted with the :func:`format_author` function.
Author and committer are formatted using the :attr:`Person.fullname` attribute only.
Dates are formatted with the :func:`format_offset` function.
Extra headers are an ordered list of [key, value] pairs. Keys are strings
@@ -450,79 +344,82 @@ def revision_identifier(revision):
type.
"""
components = [
b'tree ', identifier_to_str(revision['directory']).encode(), b'\n',
]
for parent in revision['parents']:
if parent:
components.extend([
b'parent ', identifier_to_str(parent).encode(), b'\n',
])
if isinstance(revision, dict):
# For backward compatibility
warnings.warn(
"revision_git_object's argument should be a swh.model.model.Revision "
"object.",
DeprecationWarning,
stacklevel=2,
)
revision = model.Revision.from_dict(revision)
revision = cast(model.Revision, revision)
components.extend([
format_author_line('author', revision['author'], revision['date']),
format_author_line('committer', revision['committer'],
revision['committer_date']),
])
headers = [(b"tree", hash_to_bytehex(revision.directory))]
for parent in revision.parents:
if parent:
headers.append((b"parent", hash_to_bytehex(parent)))
if revision.author is not None:
headers.append((b"author", format_author_data(revision.author, revision.date)))
if revision.committer is not None:
headers.append(
(
b"committer",
format_author_data(revision.committer, revision.committer_date),
)
)
# Handle extra headers
metadata = revision.get('metadata')
if not metadata:
metadata = {}
metadata = revision.metadata or ImmutableDict()
extra_headers = revision.extra_headers or ()
if not extra_headers and "extra_headers" in metadata:
extra_headers = metadata["extra_headers"]
for key, value in metadata.get('extra_headers', []):
headers.extend(extra_headers)
# Integer values: decimal representation
if isinstance(value, int):
value = str(value).encode('utf-8')
return format_git_object_from_headers("commit", headers, revision.message)
# Unicode string values: utf-8 encoding
if isinstance(value, str):
value = value.encode('utf-8')
# encode the key to utf-8
components.extend([key.encode('utf-8'), b' ',
escape_newlines(value), b'\n'])
if revision['message'] is not None:
components.extend([b'\n', revision['message']])
commit_raw = b''.join(components)
return identifier_to_str(hash_git_data(commit_raw, 'commit'))
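A sketch of the resulting commit manifest for a minimal parentless revision (all values are placeholders):

```
from swh.model.model import (
    Person,
    Revision,
    RevisionType,
    TimestampWithTimezone,
)

author = Person.from_fullname(b"Ada <ada@example.org>")
date = TimestampWithTimezone.from_dict(
    {"timestamp": {"seconds": 1234567890, "microseconds": 0},
     "offset": 0, "negative_utc": False}
)
rev = Revision(
    directory=b"\x01" * 20,
    parents=(),
    author=author,
    committer=author,
    date=date,
    committer_date=date,
    type=RevisionType.GIT,
    message=b"initial commit\n",
    synthetic=False,
)
# strip the "commit <len>\x00" header to see the bare manifest:
assert revision_git_object(rev).split(b"\x00", 1)[1] == (
    b"tree " + b"01" * 20 + b"\n"
    b"author Ada <ada@example.org> 1234567890 +0000\n"
    b"committer Ada <ada@example.org> 1234567890 +0000\n"
    b"\n"
    b"initial commit\n"
)
```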
def target_type_to_git(target_type):
def target_type_to_git(target_type: model.ReleaseTargetType) -> bytes:
"""Convert a software heritage target type to a git object type"""
return {
'content': b'blob',
'directory': b'tree',
'revision': b'commit',
'release': b'tag',
'snapshot': b'refs'
model.ReleaseTargetType.CONTENT: b"blob",
model.ReleaseTargetType.DIRECTORY: b"tree",
model.ReleaseTargetType.REVISION: b"commit",
model.ReleaseTargetType.RELEASE: b"tag",
model.ReleaseTargetType.SNAPSHOT: b"refs",
}[target_type]
def release_identifier(release):
"""Return the intrinsic identifier for a release."""
components = [
b'object ', identifier_to_str(release['target']).encode(), b'\n',
b'type ', target_type_to_git(release['target_type']), b'\n',
b'tag ', release['name'], b'\n',
]
if 'author' in release and release['author']:
components.append(
format_author_line('tagger', release['author'], release['date'])
def release_git_object(release: Union[Dict, model.Release]) -> bytes:
if isinstance(release, dict):
# For backward compatibility
warnings.warn(
"release_git_object's argument should be a swh.model.model.Directory "
"object.",
DeprecationWarning,
stacklevel=2,
)
release = model.Release.from_dict(release)
release = cast(model.Release, release)
headers = [
(b"object", hash_to_bytehex(release.target)),
(b"type", target_type_to_git(release.target_type)),
(b"tag", release.name),
]
if release['message'] is not None:
components.extend([b'\n', release['message']])
if release.author is not None:
headers.append((b"tagger", format_author_data(release.author, release.date)))
return identifier_to_str(hash_git_data(b''.join(components), 'tag'))
return format_git_object_from_headers("tag", headers, release.message)
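Similarly, a sketch for a bare tag object (placeholder target, no tagger):

```
from swh.model.model import Release, ReleaseTargetType

rel = Release(
    name=b"v1.0",
    target=b"\x02" * 20,
    target_type=ReleaseTargetType.REVISION,
    message=b"first release\n",
    author=None,
    date=None,
    synthetic=False,
)
assert release_git_object(rel).split(b"\x00", 1)[1] == (
    b"object " + b"02" * 20 + b"\n"
    b"type commit\n"
    b"tag v1.0\n"
    b"\n"
    b"first release\n"
)
```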
def snapshot_identifier(snapshot, *, ignore_unresolved=False):
"""Return the intrinsic identifier for a snapshot.
def snapshot_git_object(
snapshot: Union[Dict, model.Snapshot], *, ignore_unresolved: bool = False
) -> bytes:
"""Formats a snapshot as a git-like object.
Snapshots are a set of named branches, which are pointers to objects at any
level of the Software Heritage DAG.
@@ -567,242 +464,209 @@ def snapshot_identifier(snapshot, *, ignore_unresolved=False):
length but are length-encoded to avoid ambiguity.
Args:
snapshot (dict): the snapshot of which to compute the identifier. A
single entry is needed, ``'branches'``, which is itself a :class:`dict`
mapping each branch to its target
ignore_unresolved (bool): if `True`, ignore unresolved branch aliases.
Returns:
str: the intrinsic identifier for `snapshot`
ignore_unresolved: if False (the default), raises an exception when
alias branches point to non-existing branches
"""
if isinstance(snapshot, dict):
# For backward compatibility
warnings.warn(
"snapshot_git_object's argument should be a swh.model.model.Snapshot "
"object.",
DeprecationWarning,
stacklevel=2,
)
snapshot = model.Snapshot.from_dict(snapshot)
snapshot = cast(model.Snapshot, snapshot)
unresolved = []
lines = []
for name, target in sorted(snapshot['branches'].items()):
for name, target in sorted(snapshot.branches.items()):
if not target:
target_type = b'dangling'
target_id = b''
elif target['target_type'] == 'alias':
target_type = b'alias'
target_id = target['target']
if target_id not in snapshot['branches'] or target_id == name:
target_type = b"dangling"
target_id = b""
elif target.target_type == model.SnapshotTargetType.ALIAS:
target_type = b"alias"
target_id = target.target
if target_id not in snapshot.branches or target_id == name:
unresolved.append((name, target_id))
else:
target_type = target['target_type'].encode()
target_id = identifier_to_bytes(target['target'])
lines.extend([
target_type, b'\x20', name, b'\x00',
('%d:' % len(target_id)).encode(), target_id,
])
target_type = target.target_type.value.encode()
target_id = target.target
lines.extend(
[
target_type,
b"\x20",
name,
b"\x00",
("%d:" % len(target_id)).encode(),
target_id,
]
)
if unresolved and not ignore_unresolved:
raise ValueError('Branch aliases unresolved: %s' %
', '.join('%s -> %s' % x for x in unresolved),
unresolved)
raise ValueError(
"Branch aliases unresolved: %s"
% ", ".join("%r -> %r" % x for x in unresolved),
unresolved,
)
return identifier_to_str(hash_git_data(b''.join(lines), 'snapshot'))
return format_git_object_from_parts("snapshot", lines)
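A sketch with one alias and one revision branch (placeholder target); note the length-prefixed targets and the name-sorted order:

```
from swh.model.model import Snapshot, SnapshotBranch, SnapshotTargetType

snp = Snapshot(
    branches={
        b"HEAD": SnapshotBranch(
            target=b"refs/heads/main", target_type=SnapshotTargetType.ALIAS
        ),
        b"refs/heads/main": SnapshotBranch(
            target=b"\x03" * 20, target_type=SnapshotTargetType.REVISION
        ),
    }
)
assert snapshot_git_object(snp).split(b"\x00", 1)[1] == (
    b"alias HEAD\x00" b"15:refs/heads/main"
    b"revision refs/heads/main\x00" b"20:" + b"\x03" * 20
)
```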
def origin_identifier(origin):
"""Return the intrinsic identifier for an origin.
def raw_extrinsic_metadata_git_object(
metadata: Union[Dict, model.RawExtrinsicMetadata],
) -> bytes:
"""Formats RawExtrinsicMetadata as a git-like object.
An origin's identifier is the sha1 checksum of the entire origin URL
A raw_extrinsic_metadata identifier is a salted sha1 (using the git
hashing algorithm with the ``raw_extrinsic_metadata`` object type) of
a manifest following the format::
"""
return hashlib.sha1(origin['url'].encode('ascii')).hexdigest()
_object_type_map = {
ORIGIN: {
'short_name': 'ori',
'key_id': 'id'
},
SNAPSHOT: {
'short_name': 'snp',
'key_id': 'id'
},
RELEASE: {
'short_name': 'rel',
'key_id': 'id'
},
REVISION: {
'short_name': 'rev',
'key_id': 'id'
},
DIRECTORY: {
'short_name': 'dir',
'key_id': 'id'
},
CONTENT: {
'short_name': 'cnt',
'key_id': 'sha1_git'
}
}
_PersistentId = NamedTuple(
'PersistentId', [
('namespace', str),
('scheme_version', int),
('object_type', str),
('object_id', str),
('metadata', Dict[str, Any]),
])
class PersistentId(_PersistentId):
"""
Named tuple holding the relevant info associated to a Software Heritage
persistent identifier.
target $ExtendedSwhid
discovery_date $Timestamp
authority $StrWithoutSpaces $IRI
fetcher $Str $Version
format $StrWithoutSpaces
origin $IRI <- optional
visit $IntInDecimal <- optional
snapshot $CoreSwhid <- optional
release $CoreSwhid <- optional
revision $CoreSwhid <- optional
path $Bytes <- optional
directory $CoreSwhid <- optional
Args:
namespace (str): the namespace of the identifier, defaults to 'swh'
scheme_version (int): the scheme version of the identifier,
defaults to 1
object_type (str): the type of object the identifier points to,
either 'content', 'directory', 'release', 'revision' or 'snapshot'
object_id (dict/bytes/str): object's dict representation or
object identifier
metadata (dict): optional dict filled with metadata related to
pointed object
Raises:
swh.model.exceptions.ValidationError: In case of invalid object type
or id
Once created, it contains the following attributes:
Attributes:
namespace (str): the namespace of the identifier
scheme_version (int): the scheme version of the identifier
object_type (str): the type of object the identifier points to
object_id (str): hexadecimal representation of the object hash
metadata (dict): metadata related to the pointed object
To get the raw persistent identifier string from an instance of
this named tuple, use the :func:`str` function::
pid = PersistentId(
object_type='content',
object_id='8ff44f081d43176474b267de5451f2c2e88089d0'
)
pid_str = str(pid)
# 'swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0'
"""
__slots__ = ()
def __new__(cls, namespace=PID_NAMESPACE, scheme_version=PID_VERSION,
object_type='', object_id='', metadata={}):
o = _object_type_map.get(object_type)
if not o:
raise ValidationError('Wrong input: Supported types are %s' % (
list(_object_type_map.keys())))
if namespace != PID_NAMESPACE:
raise ValidationError(
"Wrong format: only supported namespace is '%s'"
% PID_NAMESPACE)
if scheme_version != PID_VERSION:
raise ValidationError(
'Wrong format: only supported version is %d' % PID_VERSION)
# internal swh representation resolution
if isinstance(object_id, dict):
object_id = object_id[o['key_id']]
validate_sha1(object_id) # can raise if invalid hash
object_id = hash_to_hex(object_id)
return super(cls, PersistentId).__new__(
cls, namespace, scheme_version, object_type, object_id, metadata)
def __str__(self):
o = _object_type_map.get(self.object_type)
pid = PID_SEP.join([self.namespace, str(self.scheme_version),
o['short_name'], self.object_id])
if self.metadata:
for k, v in self.metadata.items():
pid += '%s%s=%s' % (PID_CTXT_SEP, k, v)
return pid
def persistent_identifier(object_type, object_id, scheme_version=1,
metadata={}):
"""Compute persistent identifier (stable over time) as per
documentation.
Documentation:
https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html # noqa
$MetadataBytes
Args:
object_type (str): object's type, either 'content', 'directory',
'release', 'revision' or 'snapshot'
object_id (dict/bytes/str): object's dict representation or object
identifier
scheme_version (int): persistent identifier scheme version,
defaults to 1
metadata (dict): metadata related to the pointed object
Raises:
swh.model.exceptions.ValidationError: In case of invalid object type
or id
$IRI must be RFC 3987 IRIs (so they may contain newlines, which are escaped
as described below)
Returns:
str: the persistent identifier
$StrWithoutSpaces and $Version are ASCII strings, and may not contain spaces.
$Str is a UTF-8 string.
$CoreSwhid are core SWHIDs, as defined in :ref:`persistent-identifiers`.
$ExtendedSwhid is a core SWHID, with extra types allowed ('ori' for
origins and 'emd' for raw extrinsic metadata)
$Timestamp is a decimal representation of the rounded-down integer number of
seconds since the UNIX epoch (1970-01-01 00:00:00 UTC),
with no leading '0' (unless the timestamp value is zero) and no timezone.
It may be negative by prefixing it with a '-', which must not be followed
by a '0'.
Newlines in $Bytes, $Str, and $IRI are escaped as with other git fields,
i.e. by adding a space after them.
"""
pid = PersistentId(scheme_version=scheme_version, object_type=object_type,
object_id=object_id, metadata=metadata)
return str(pid)
if isinstance(metadata, dict):
# For backward compatibility
warnings.warn(
"raw_extrinsic_metadata_git_object's argument should be a "
"swh.model.model.RawExtrinsicMetadata object.",
DeprecationWarning,
stacklevel=2,
)
metadata = model.RawExtrinsicMetadata.from_dict(metadata)
metadata = cast(model.RawExtrinsicMetadata, metadata)
# equivalent to using math.floor(dt.timestamp()) to round down,
# as int(dt.timestamp()) rounds toward zero,
# which would map two distinct seconds onto the 0 timestamp.
#
# This should never be an issue in practice as Software Heritage didn't
# start collecting metadata before 2015.
timestamp = (
metadata.discovery_date.astimezone(datetime.timezone.utc)
.replace(microsecond=0)
.timestamp()
)
assert timestamp.is_integer()
headers = [
(b"target", str(metadata.target).encode()),
(b"discovery_date", str(int(timestamp)).encode("ascii")),
(
b"authority",
f"{metadata.authority.type.value} {metadata.authority.url}".encode(),
),
(
b"fetcher",
f"{metadata.fetcher.name} {metadata.fetcher.version}".encode(),
),
(b"format", metadata.format.encode()),
]
for key in (
"origin",
"visit",
"snapshot",
"release",
"revision",
"path",
"directory",
):
if getattr(metadata, key, None) is not None:
value: bytes
if key == "path":
value = getattr(metadata, key)
else:
value = str(getattr(metadata, key)).encode()
def parse_persistent_identifier(persistent_id):
"""Parse swh's :ref:`persistent-identifiers` scheme.
headers.append((key.encode("ascii"), value))
Args:
persistent_id (str): A persistent identifier
return format_git_object_from_headers(
"raw_extrinsic_metadata", headers, metadata.metadata
)
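A sketch assembling the manifest above from placeholder values:

```
import datetime

from swh.model.model import (
    MetadataAuthority,
    MetadataAuthorityType,
    MetadataFetcher,
    RawExtrinsicMetadata,
)
from swh.model.swhids import ExtendedSWHID

emd = RawExtrinsicMetadata(
    target=ExtendedSWHID.from_string("swh:1:cnt:" + "00" * 20),
    discovery_date=datetime.datetime(2021, 1, 1, tzinfo=datetime.timezone.utc),
    authority=MetadataAuthority(
        type=MetadataAuthorityType.FORGE, url="https://example.org/"
    ),
    fetcher=MetadataFetcher(name="example-loader", version="1.0.0"),
    format="json",
    metadata=b'{"origin": "https://example.org/repo.git"}',
)
assert raw_extrinsic_metadata_git_object(emd).split(b"\x00", 1)[1] == (
    b"target swh:1:cnt:" + b"00" * 20 + b"\n"
    b"discovery_date 1609459200\n"
    b"authority forge https://example.org/\n"
    b"fetcher example-loader 1.0.0\n"
    b"format json\n"
    b"\n"
    b'{"origin": "https://example.org/repo.git"}'
)
```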
Raises:
swh.model.exceptions.ValidationError: in case of:
* missing mandatory values (4)
* invalid namespace supplied
* invalid version supplied
* invalid type supplied
* missing hash
* invalid hash identifier supplied
def extid_git_object(extid: model.ExtID) -> bytes:
"""Formats an extid as a gi-like object.
Returns:
PersistentId: a named tuple holding the parsing result
An ExtID identifier is a salted sha1 (using the git hashing algorithm with
the ``extid`` object type) of a manifest following the format:
```
extid_type $StrWithoutSpaces
[extid_version $Str]
extid $Bytes
target $CoreSwhid
[payload_type $StrWithoutSpaces]
[payload $ContentIdentifier]
```
$StrWithoutSpaces is an ASCII string, and may not contain spaces.
Newlines in $Bytes are escaped as with other git fields, i.e. by adding a
space after them.
The extid_version line is only generated if the version is non-zero.
The payload_type and payload lines are only generated if they are not
:const:`None`. $ContentIdentifier is the object ID of a content object.
"""
# <pid>;<contextual-information>
persistent_id_parts = persistent_id.split(PID_CTXT_SEP)
pid_data = persistent_id_parts.pop(0).split(':')
if len(pid_data) != 4:
raise ValidationError(
'Wrong format: There should be 4 mandatory values')
# Checking for parsing errors
_ns, _version, _type, _id = pid_data
pid_data[1] = int(pid_data[1])
for otype, data in _object_type_map.items():
if _type == data['short_name']:
pid_data[2] = otype
break
if not _id:
raise ValidationError(
'Wrong format: Identifier should be present')
persistent_id_metadata = {}
for part in persistent_id_parts:
try:
key, val = part.split('=')
persistent_id_metadata[key] = val
except Exception:
msg = 'Contextual data is badly formatted, form key=val expected'
raise ValidationError(msg)
pid_data.append(persistent_id_metadata)
return PersistentId(*pid_data)
headers = [
(b"extid_type", extid.extid_type.encode("ascii")),
]
extid_version = extid.extid_version
if extid_version != 0:
headers.append((b"extid_version", str(extid_version).encode("ascii")))
headers.extend(
[
(b"extid", extid.extid),
(b"target", str(extid.target).encode("ascii")),
]
)
payload_type = extid.payload_type
if payload_type is not None:
headers.append((b"payload_type", payload_type.encode("ascii")))
payload = extid.payload
if payload is not None:
headers.append((b"payload", payload))
return format_git_object_from_headers("extid", headers)
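A sketch with the default version and no payload, so only the three mandatory headers appear:

```
from swh.model.model import ExtID
from swh.model.swhids import CoreSWHID

extid = ExtID(
    extid_type="git",
    extid=b"\x04" * 20,
    target=CoreSWHID.from_string("swh:1:rev:" + "05" * 20),
)
assert extid_git_object(extid).split(b"\x00", 1)[1] == (
    b"extid_type git\n"
    b"extid " + b"\x04" * 20 + b"\n"
    b"target swh:1:rev:" + b"05" * 20 + b"\n"
)
```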
@@ -54,15 +54,16 @@ Basic usage examples:
import binascii
import functools
import hashlib
import os
from io import BytesIO
from typing import Callable, Dict
import os
from typing import Callable, Dict, Optional, Union
ALGORITHMS = set(['sha1', 'sha256', 'sha1_git', 'blake2s256', 'blake2b512'])
ALGORITHMS = set(
["sha1", "sha256", "sha1_git", "blake2s256", "blake2b512", "md5", "sha512"]
)
"""Hashing algorithms supported by this module"""
DEFAULT_ALGORITHMS = set(['sha1', 'sha256', 'sha1_git', 'blake2s256'])
DEFAULT_ALGORITHMS = set(["sha1", "sha256", "sha1_git", "blake2s256"])
"""Algorithms computed by default when calling the functions from this module.
Subset of :const:`ALGORITHMS`.
@@ -71,7 +72,7 @@ Subset of :const:`ALGORITHMS`.
HASH_BLOCK_SIZE = 32768
"""Block size for streaming hash computations made in this module"""
_blake2_hash_cache = {} # type: Dict[str, Callable]
_blake2_hash_cache: Dict[str, Callable] = {}
class MultiHash:
@@ -87,12 +88,13 @@ class MultiHash:
computed and returned.
"""
def __init__(self, hash_names=DEFAULT_ALGORITHMS, length=None):
self.state = {}
self.track_length = False
for name in hash_names:
if name == 'length':
self.state['length'] = 0
if name == "length":
self.state["length"] = 0
self.track_length = True
else:
self.state[name] = _new_hash(name, length)
@@ -116,7 +118,7 @@ class MultiHash:
@classmethod
def from_path(cls, path, hash_names=DEFAULT_ALGORITHMS):
length = os.path.getsize(path)
with open(path, 'rb') as f:
with open(path, "rb") as f:
ret = cls.from_file(f, hash_names=hash_names, length=length)
return ret
@@ -128,48 +130,45 @@ class MultiHash:
def update(self, chunk):
for name, h in self.state.items():
if name == 'length':
if name == "length":
continue
h.update(chunk)
if self.track_length:
self.state['length'] += len(chunk)
self.state["length"] += len(chunk)
def digest(self):
return {
name: h.digest() if name != 'length' else h
name: h.digest() if name != "length" else h
for name, h in self.state.items()
}
def hexdigest(self):
return {
name: h.hexdigest() if name != 'length' else h
name: h.hexdigest() if name != "length" else h
for name, h in self.state.items()
}
def bytehexdigest(self):
return {
name: hash_to_bytehex(h.digest()) if name != 'length' else h
name: hash_to_bytehex(h.digest()) if name != "length" else h
for name, h in self.state.items()
}
def copy(self):
copied_state = {
name: h.copy() if name != 'length' else h
for name, h in self.state.items()
name: h.copy() if name != "length" else h for name, h in self.state.items()
}
return self.from_state(copied_state, self.track_length)
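For instance, combining a real hash with the pseudo-hash ``length`` (sample data made up; `from_data` is the constructor referenced elsewhere in this diff):

```
h = MultiHash.from_data(b"hello\n", hash_names={"sha1_git", "length"})
digests = h.hexdigest()
assert digests["length"] == 6  # "length" is returned as a plain integer
assert digests["sha1_git"] == "ce013625030ba8dba906f756967f9e9ca394464a"
```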
def _new_blake2_hash(algo):
"""Return a function that initializes a blake2 hash.
"""
"""Return a function that initializes a blake2 hash."""
if algo in _blake2_hash_cache:
return _blake2_hash_cache[algo]()
lalgo = algo.lower()
if not lalgo.startswith('blake2'):
raise ValueError('Algorithm %s is not a blake2 hash' % algo)
if not lalgo.startswith("blake2"):
raise ValueError("Algorithm %s is not a blake2 hash" % algo)
blake_family = lalgo[:7]
@@ -178,27 +177,14 @@ def _new_blake2_hash(algo):
try:
digest_size, remainder = divmod(int(lalgo[7:]), 8)
except ValueError:
raise ValueError(
'Unknown digest size for algo %s' % algo
) from None
raise ValueError("Unknown digest size for algo %s" % algo) from None
if remainder:
raise ValueError(
'Digest size for algorithm %s must be a multiple of 8' % algo
"Digest size for algorithm %s must be a multiple of 8" % algo
)
if lalgo in hashlib.algorithms_available:
# Handle the case where OpenSSL ships the given algorithm
# (e.g. Python 3.5 on Debian 9 stretch)
_blake2_hash_cache[algo] = lambda: hashlib.new(lalgo)
else:
# Try using the built-in implementation for Python 3.6+
if blake_family in hashlib.algorithms_available:
blake2 = getattr(hashlib, blake_family)
else:
import pyblake2
blake2 = getattr(pyblake2, blake_family)
_blake2_hash_cache[algo] = lambda: blake2(digest_size=digest_size)
blake2 = getattr(hashlib, blake_family)
_blake2_hash_cache[algo] = lambda: blake2(digest_size=digest_size)
return _blake2_hash_cache[algo]()
@@ -208,18 +194,16 @@ def _new_hashlib_hash(algo):
Handle the swh-specific names for the blake2-related algorithms
"""
if algo.startswith('blake2'):
if algo.startswith("blake2"):
return _new_blake2_hash(algo)
else:
return hashlib.new(algo)
def _new_git_hash(base_algo, git_type, length):
"""Initialize a digest object (as returned by python's hashlib) for the
requested algorithm, and feed it with the header for a git object of the
given type and length.
def git_object_header(git_type: str, length: int) -> bytes:
"""Returns the header for a git object of the given type and length.
The header for hashing a git object consists of:
The header of a git object consists of:
- The type of the object (encoded in ASCII)
- One ASCII space (\x20)
- The length of the object (decimal encoded in ASCII)
@@ -234,15 +218,26 @@ def _new_git_hash(base_algo, git_type, length):
Returns:
the git object header, as bytes
"""
git_object_types = {
"blob",
"tree",
"commit",
"tag",
"snapshot",
"raw_extrinsic_metadata",
"extid",
}
h = _new_hashlib_hash(base_algo)
git_header = '%s %d\0' % (git_type, length)
h.update(git_header.encode('ascii'))
if git_type not in git_object_types:
raise ValueError(
"Unexpected git object type %s, expected one of %s"
% (git_type, ", ".join(sorted(git_object_types)))
)
return h
return ("%s %d\0" % (git_type, length)).encode("ascii")
def _new_hash(algo, length=None):
def _new_hash(algo: str, length: Optional[int] = None):
"""Initialize a digest object (as returned by python's hashlib) for
the requested algorithm. See the constant ALGORITHMS for the list
of supported algorithms. If a git-specific hashing algorithm is
@@ -264,19 +259,22 @@ def _new_hash(algo, length=None):
"""
if algo not in ALGORITHMS:
raise ValueError(
'Unexpected hashing algorithm %s, expected one of %s' %
(algo, ', '.join(sorted(ALGORITHMS))))
"Unexpected hashing algorithm %s, expected one of %s"
% (algo, ", ".join(sorted(ALGORITHMS)))
)
if algo.endswith('_git'):
if algo.endswith("_git"):
if length is None:
raise ValueError('Missing length for git hashing algorithm')
raise ValueError("Missing length for git hashing algorithm")
base_algo = algo[:-4]
return _new_git_hash(base_algo, 'blob', length)
h = _new_hashlib_hash(base_algo)
h.update(git_object_header("blob", length))
return h
return _new_hashlib_hash(algo)
def hash_git_data(data, git_type, base_algo='sha1'):
def hash_git_data(data, git_type, base_algo="sha1"):
"""Hash the given data as a git object of type git_type.
Args:
@@ -289,21 +287,15 @@ def hash_git_data(data, git_type, base_algo='sha1'):
Raises:
ValueError if the git_type is unexpected.
"""
git_object_types = {'blob', 'tree', 'commit', 'tag', 'snapshot'}
if git_type not in git_object_types:
raise ValueError('Unexpected git object type %s, expected one of %s' %
(git_type, ', '.join(sorted(git_object_types))))
h = _new_git_hash(base_algo, git_type, len(data))
h = _new_hashlib_hash(base_algo)
h.update(git_object_header(git_type, len(data)))
h.update(data)
return h.digest()
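For example, this reproduces the well-known git blob ID of ``hello\n``:

```
assert hash_git_data(b"hello\n", "blob") == bytes.fromhex(
    "ce013625030ba8dba906f756967f9e9ca394464a"
)  # same as: echo hello | git hash-object --stdin
```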
@functools.lru_cache()
def hash_to_hex(hash):
def hash_to_hex(hash: Union[str, bytes]) -> str:
"""Converts a hash (in hex or bytes form) to its hexadecimal ascii form
Args:
@@ -315,11 +307,11 @@ def hash_to_hex(hash):
"""
if isinstance(hash, str):
return hash
return binascii.hexlify(hash).decode('ascii')
return binascii.hexlify(hash).decode("ascii")
@functools.lru_cache()
def hash_to_bytehex(hash):
def hash_to_bytehex(hash: bytes) -> bytes:
"""Converts a hash to its hexadecimal bytes representation
Args:
@@ -332,7 +324,7 @@ def hash_to_bytehex(hash):
@functools.lru_cache()
def hash_to_bytes(hash):
def hash_to_bytes(hash: Union[str, bytes]) -> bytes:
"""Converts a hash (in hex or bytes form) to its raw bytes form
Args:
@@ -348,7 +340,7 @@ def hash_to_bytes(hash):
@functools.lru_cache()
def bytehex_to_hash(hex):
def bytehex_to_hash(hex: bytes) -> bytes:
"""Converts a hexadecimal bytes representation of a hash to that hash
Args:
# Copyright (C) 2019 The Software Heritage developers
# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import attr
import datetime
import functools
import string
from typing import Any, Callable, List, Sequence, Set, Tuple, Union
from deprecated import deprecated
from hypothesis import assume
from hypothesis.extra.dateutil import timezones
from hypothesis.strategies import (
binary, builds, characters, composite, dictionaries, from_regex,
integers, just, lists, none, one_of, sampled_from, text, tuples,
SearchStrategy,
binary,
booleans,
builds,
characters,
composite,
datetimes,
dictionaries,
from_regex,
integers,
just,
lists,
none,
one_of,
sampled_from,
sets,
text,
tuples,
)
from .from_disk import DentryPerms
from .model import (
Person, Timestamp, TimestampWithTimezone, Origin, OriginVisit,
Snapshot, SnapshotBranch, TargetType, Release, Revision,
Directory, DirectoryEntry, Content, SkippedContent
BaseContent,
BaseModel,
Content,
Directory,
DirectoryEntry,
MetadataAuthority,
MetadataFetcher,
ModelObjectType,
Origin,
OriginVisit,
OriginVisitStatus,
Person,
RawExtrinsicMetadata,
Release,
ReleaseTargetType,
Revision,
RevisionType,
SkippedContent,
Snapshot,
SnapshotBranch,
SnapshotTargetType,
Timestamp,
TimestampWithTimezone,
)
from .identifiers import snapshot_identifier, identifier_to_bytes
from .swhids import ExtendedObjectType, ExtendedSWHID
pgsql_alphabet = characters(
blacklist_categories=('Cs', ),
blacklist_characters=['\u0000']) # postgresql does not like these
blacklist_categories=["Cs"],
blacklist_characters=["\u0000"],
) # postgresql does not like these
def optional(strategy):
@@ -42,208 +82,532 @@ def sha1():
return binary(min_size=20, max_size=20)
def binaries_without_bytes(blacklist: Sequence[int]):
"""Like hypothesis.strategies.binary, but takes a sequence of bytes that
should not be included."""
return lists(sampled_from([i for i in range(256) if i not in blacklist])).map(bytes)
@composite
def extended_swhids(draw):
object_type = draw(sampled_from(ExtendedObjectType))
object_id = draw(sha1_git())
return ExtendedSWHID(object_type=object_type, object_id=object_id)
def aware_datetimes():
# datetimes in Software Heritage are not used for software artifacts
# (which may be much older than 2000), but only for objects like scheduler
# task runs, and origin visits, which were created by Software Heritage,
# so at least in 2015.
# We're forbidding old datetimes, because until 1956, many timezones had seconds
# in their "UTC offsets" (see
# <https://en.wikipedia.org/wiki/Time_zone#Worldwide_time_zones>), which is not
# encodable in ISO8601; and we need our datetimes to be ISO8601-encodable in the
# RPC protocol
min_value = datetime.datetime(2000, 1, 1, 0, 0, 0)
return datetimes(min_value=min_value, timezones=timezones())
@composite
def urls(draw):
protocol = draw(sampled_from(['git', 'http', 'https', 'deb']))
domain = draw(from_regex(r'\A([a-z]([a-z0-9-]*)\.){1,3}[a-z0-9]+\Z'))
def iris(draw):
protocol = draw(sampled_from(["git", "http", "https", "deb"]))
domain = draw(from_regex(r"\A([a-z]([a-z0-9é🏛️-]*)\.){1,3}([a-z0-9é])+\Z"))
return '%s://%s' % (protocol, domain)
return "%s://%s" % (protocol, domain)
def persons():
return builds(Person)
@composite
def persons_d(draw):
fullname = draw(binary())
email = draw(optional(binary()))
name = draw(optional(binary()))
assume(not (len(fullname) == 32 and email is None and name is None))
return dict(fullname=fullname, name=name, email=email)
def persons(**kwargs):
return persons_d(**kwargs).map(Person.from_dict)
def timestamps_d(**kwargs):
defaults = dict(
seconds=integers(Timestamp.MIN_SECONDS, Timestamp.MAX_SECONDS),
microseconds=integers(Timestamp.MIN_MICROSECONDS, Timestamp.MAX_MICROSECONDS),
)
return builds(dict, **{**defaults, **kwargs})
def timestamps():
max_seconds = datetime.datetime.max.replace(
tzinfo=datetime.timezone.utc).timestamp()
min_seconds = datetime.datetime.min.replace(
tzinfo=datetime.timezone.utc).timestamp()
return builds(
Timestamp,
seconds=integers(min_seconds, max_seconds),
microseconds=integers(0, 1000000))
return timestamps_d().map(Timestamp.from_dict)
def timestamps_with_timezone():
return builds(
TimestampWithTimezone,
timestamp=timestamps(),
offset=integers(min_value=-14*60, max_value=14*60))
@composite
def timestamps_with_timezone_d(
draw,
*,
timestamp=timestamps_d(),
offset=integers(min_value=-14 * 60, max_value=14 * 60),
negative_utc=booleans(),
):
timestamp = draw(timestamp)
offset = draw(offset)
negative_utc = draw(negative_utc)
assume(not (negative_utc and offset))
return dict(timestamp=timestamp, offset=offset, negative_utc=negative_utc)
timestamps_with_timezone = timestamps_with_timezone_d().map(
TimestampWithTimezone.from_dict
)
def origins():
return builds(
Origin,
type=sampled_from(['git', 'hg', 'svn', 'pypi', 'deb']),
url=urls())
def origins_d(*, url=iris().filter(lambda iri: len(iri.encode()) < 2048)):
return builds(dict, url=url)
def origin_visits():
return builds(
OriginVisit,
visit=integers(0, 1000),
origin=urls(),
status=sampled_from(['ongoing', 'full', 'partial']),
def origins(**kwargs):
return origins_d(**kwargs).map(Origin.from_dict)
def origin_visits_d(**kwargs):
defaults = dict(
visit=integers(1, 1000),
origin=iris(),
date=aware_datetimes(),
type=pgsql_text(),
snapshot=optional(sha1_git()))
)
return builds(dict, **{**defaults, **kwargs})
@composite
def releases(draw):
(date, author) = draw(one_of(
tuples(none(), none()),
tuples(timestamps_with_timezone(), persons())))
rel = draw(builds(
Release,
author=none(),
date=none(),
target=sha1_git()))
return attr.evolve(
rel,
date=date,
author=author)
def revision_metadata():
def origin_visits(**kwargs):
return origin_visits_d(**kwargs).map(OriginVisit.from_dict)
def metadata_dicts():
return dictionaries(pgsql_text(), pgsql_text())
def revisions():
return builds(
Revision,
date=timestamps_with_timezone(),
committer_date=timestamps_with_timezone(),
parents=lists(sha1_git()),
def origin_visit_statuses_d(**kwargs):
defaults = dict(
visit=integers(1, 1000),
origin=iris(),
type=optional(sampled_from(["git", "svn", "pypi", "debian"])),
status=sampled_from(
["created", "ongoing", "full", "partial", "not_found", "failed"]
),
date=aware_datetimes(),
snapshot=optional(sha1_git()),
metadata=optional(metadata_dicts()),
)
return builds(dict, **{**defaults, **kwargs})
def origin_visit_statuses(**kwargs):
return origin_visit_statuses_d(**kwargs).map(OriginVisitStatus.from_dict)
@composite
def releases_d(draw, **kwargs):
defaults = dict(
target_type=sampled_from([x.value for x in ReleaseTargetType]),
name=binary(),
message=optional(binary()),
synthetic=booleans(),
target=sha1_git(),
metadata=optional(revision_metadata()),
raw_manifest=optional(binary()),
)
d = draw(
one_of(
# None author/date:
builds(dict, author=none(), date=none(), **{**defaults, **kwargs}),
# non-None author/date:
builds(
dict,
date=timestamps_with_timezone_d(),
author=persons_d(),
**{**defaults, **kwargs},
),
# it is also possible for date to be None but not author, but let's not
# overwhelm hypothesis with this edge case
)
)
if d["raw_manifest"] is None:
del d["raw_manifest"]
return d
def releases(**kwargs):
return releases_d(**kwargs).map(Release.from_dict)
revision_metadata = metadata_dicts
def extra_headers():
return lists(
tuples(binary(min_size=0, max_size=50), binary(min_size=0, max_size=500))
).map(tuple)
@composite
def revisions_d(draw, **kwargs):
defaults = dict(
message=optional(binary()),
synthetic=booleans(),
parents=tuples(sha1_git()),
directory=sha1_git(),
metadata=one_of(none(), revision_metadata()))
type=sampled_from([x.value for x in RevisionType]),
metadata=optional(revision_metadata()),
extra_headers=extra_headers(),
raw_manifest=optional(binary()),
)
d = draw(
one_of(
# None author/committer/date/committer_date
builds(
dict,
author=none(),
committer=none(),
date=none(),
committer_date=none(),
**{**defaults, **kwargs},
),
# non-None author/committer/date/committer_date
builds(
dict,
author=persons_d(),
committer=persons_d(),
date=timestamps_with_timezone_d(),
committer_date=timestamps_with_timezone_d(),
**{**defaults, **kwargs},
),
# There are many other combinations, but let's not overwhelm hypothesis
# with these edge cases
)
)
# TODO: metadata['extra_headers'] can have binary keys and values
if d["raw_manifest"] is None:
del d["raw_manifest"]
return d
def directory_entries():
return builds(
DirectoryEntry,
def revisions(**kwargs):
return revisions_d(**kwargs).map(Revision.from_dict)
def directory_entries_d(**kwargs):
defaults = dict(
name=binaries_without_bytes(b"/"),
target=sha1_git(),
perms=sampled_from([perm.value for perm in DentryPerms]))
)
return one_of(
builds(
dict,
type=just("file"),
perms=one_of(
integers(min_value=0o100000, max_value=0o100777), # regular file
integers(min_value=0o120000, max_value=0o120777), # symlink
),
**{**defaults, **kwargs},
),
builds(
dict,
type=just("dir"),
perms=integers(
min_value=DentryPerms.directory,
max_value=DentryPerms.directory + 0o777,
),
**{**defaults, **kwargs},
),
builds(
dict,
type=just("rev"),
perms=integers(
min_value=DentryPerms.revision,
max_value=DentryPerms.revision + 0o777,
),
**{**defaults, **kwargs},
),
)
def directories():
return builds(
Directory,
entries=lists(directory_entries()))
def directory_entries(**kwargs):
return directory_entries_d(**kwargs).map(DirectoryEntry)
@composite
def directories_d(draw, raw_manifest=optional(binary())):
d = draw(builds(dict, entries=tuples(directory_entries_d())))
d["raw_manifest"] = draw(raw_manifest)
if d["raw_manifest"] is None:
del d["raw_manifest"]
return d
def directories(**kwargs):
return directories_d(**kwargs).map(Directory.from_dict)
def contents_d():
return one_of(present_contents_d(), skipped_contents_d())
def contents():
return one_of(present_contents(), skipped_contents())
@composite
def present_contents(draw):
return draw(builds(
Content,
length=integers(min_value=0, max_value=2**63-1),
sha1=sha1(),
sha1_git=sha1_git(),
sha256=binary(min_size=32, max_size=32),
blake2s256=binary(min_size=32, max_size=32),
status=one_of(just('visible'), just('hidden')),
data=binary(),
))
def present_contents_d(**kwargs):
defaults = dict(
data=binary(max_size=4096),
ctime=optional(aware_datetimes()),
status=one_of(just("visible"), just("hidden")),
)
return builds(dict, **{**defaults, **kwargs})
def present_contents(**kwargs):
return present_contents_d(**kwargs).map(lambda d: Content.from_data(**d))
@composite
def skipped_contents(draw):
return draw(builds(
SkippedContent,
length=integers(min_value=-1, max_value=2**63-1),
sha1=optional(sha1()),
sha1_git=optional(sha1_git()),
sha256=optional(binary(min_size=32, max_size=32)),
blake2s256=optional(binary(min_size=32, max_size=32)),
status=just('absent'),
reason=pgsql_text(),
))
def skipped_contents_d(
draw, reason=pgsql_text(), status=just("absent"), ctime=optional(aware_datetimes())
):
result = BaseContent._hash_data(draw(binary(max_size=4096)))
result.pop("data")
nullify_attrs = draw(
sets(sampled_from(["sha1", "sha1_git", "sha256", "blake2s256"]))
)
for k in nullify_attrs:
result[k] = None
result["reason"] = draw(reason)
result["status"] = draw(status)
result["ctime"] = draw(ctime)
return result
def skipped_contents(**kwargs):
return skipped_contents_d(**kwargs).map(SkippedContent.from_dict)
def branch_names():
return binary(min_size=1)
def branch_targets_object():
def snapshot_targets_object_d():
return builds(
SnapshotBranch,
dict,
target=sha1_git(),
target_type=sampled_from([
TargetType.CONTENT, TargetType.DIRECTORY, TargetType.REVISION,
TargetType.RELEASE, TargetType.SNAPSHOT]))
target_type=sampled_from(
[x.value for x in SnapshotTargetType if x.value not in ("alias",)]
),
)
branch_targets_object_d = deprecated(
version="v6.13.0", reason="use snapshot_targets_object_d"
)(snapshot_targets_object_d)
def branch_targets_alias():
def snapshot_targets_alias_d():
return builds(
SnapshotBranch,
target_type=just(TargetType.ALIAS))
dict, target=sha1_git(), target_type=just("alias")
)  # i.e. SnapshotTargetType.ALIAS.value
branch_targets_alias_d = deprecated(
version="v6.13.0", reason="use snapshot_targets_alias_d"
)(snapshot_targets_alias_d)
def branch_targets(*, only_objects=False):
def snapshot_targets_d(*, only_objects=False):
if only_objects:
return branch_targets_object()
return snapshot_targets_object_d()
else:
return one_of(branch_targets_alias(), branch_targets_object())
return one_of(snapshot_targets_alias_d(), snapshot_targets_object_d())
branch_targets_d = deprecated(version="v6.13.0", reason="use snapshot_targets_d")(
snapshot_targets_d
)
def snapshot_targets(*, only_objects=False):
return builds(
SnapshotBranch.from_dict, snapshot_targets_d(only_objects=only_objects)
)
@composite
def snapshots(draw, *, min_size=0, max_size=100, only_objects=False):
branches = draw(dictionaries(
keys=branch_names(),
values=one_of(
none(),
branch_targets(only_objects=only_objects)
),
min_size=min_size,
max_size=max_size,
))
def snapshots_d(draw, *, min_size=0, max_size=100, only_objects=False):
branches = draw(
dictionaries(
keys=branch_names(),
values=optional(snapshot_targets_d(only_objects=only_objects)),
min_size=min_size,
max_size=max_size,
)
)
if not only_objects:
# Make sure aliases point to actual branches
unresolved_aliases = {
target.target
for target in branches.values()
if (target
and target.target_type == 'alias'
and target.target not in branches)
}
for alias in unresolved_aliases:
branches[alias] = draw(branch_targets(only_objects=True))
branch: target["target"]
for branch, target in branches.items()
if (
target
and target["target_type"] == "alias"
and target["target"] not in branches
)
}
for alias_name, alias_target in unresolved_aliases.items():
# Override alias branch with one pointing to a real object
# if max_size constraint is reached
alias = alias_target if len(branches) < max_size else alias_name
branches[alias] = draw(snapshot_targets_d(only_objects=True))
# Ensure no cycles between aliases
while True:
try:
id_ = snapshot_identifier({
'branches': {
name: branch.to_dict() if branch else None
for (name, branch) in branches.items()}})
snapshot = Snapshot.from_dict(
{
"branches": {
name: branch or None for (name, branch) in branches.items()
}
}
)
except ValueError as e:
for (source, target) in e.args[1]:
branches[source] = draw(branch_targets(only_objects=True))
for source, target in e.args[1]:
branches[source] = draw(snapshot_targets_d(only_objects=True))
else:
break
return Snapshot(
id=identifier_to_bytes(id_),
branches=branches)
return snapshot.to_dict()
def objects():
return one_of(
origins().map(lambda x: ('origin', x)),
origin_visits().map(lambda x: ('origin_visit', x)),
snapshots().map(lambda x: ('snapshot', x)),
releases().map(lambda x: ('release', x)),
revisions().map(lambda x: ('revision', x)),
directories().map(lambda x: ('directory', x)),
contents().map(lambda x: ('content', x)),
def snapshots(*, min_size=0, max_size=100, only_objects=False):
return snapshots_d(
min_size=min_size, max_size=max_size, only_objects=only_objects
).map(Snapshot.from_dict)
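A hypothetical property-based test using this strategy (module path `swh.model.hypothesis_strategies` assumed); the alias resolution above guarantees `Snapshot.from_dict` never raises on unresolved aliases:

```
from hypothesis import given

from swh.model.hypothesis_strategies import snapshots
from swh.model.model import Snapshot


@given(snapshots(min_size=1, only_objects=True))
def test_snapshot_dict_roundtrip(snapshot):
    # every generated snapshot survives a dict round-trip unchanged
    assert Snapshot.from_dict(snapshot.to_dict()) == snapshot
```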
def metadata_authorities(url=iris()):
return builds(MetadataAuthority, url=url, metadata=just(None))
def metadata_fetchers(**kwargs):
defaults = dict(
name=text(min_size=1, alphabet=string.printable),
version=text(
min_size=1,
alphabet=string.ascii_letters + string.digits + string.punctuation,
),
)
return builds(
MetadataFetcher,
metadata=just(None),
**{**defaults, **kwargs},
)
def object_dicts():
return objects().map(lambda x: (x[0], x[1].to_dict()))
def raw_extrinsic_metadata(**kwargs):
defaults = dict(
target=extended_swhids(),
discovery_date=aware_datetimes(),
authority=metadata_authorities(),
fetcher=metadata_fetchers(),
format=text(min_size=1, alphabet=string.printable),
)
return builds(RawExtrinsicMetadata, **{**defaults, **kwargs})
def raw_extrinsic_metadata_d(**kwargs):
return raw_extrinsic_metadata(**kwargs).map(RawExtrinsicMetadata.to_dict)
def _tuplify(object_type: ModelObjectType, obj: BaseModel):
return (object_type, obj)
def objects(
    # remove the Union once deprecated usages have been migrated
    blacklist_types: Union[Set[ModelObjectType], Any] = {
ModelObjectType.ORIGIN_VISIT_STATUS,
},
split_content: bool = False,
):
"""generates a random couple (type, obj)
which obj is an instance of the Model class corresponding to obj_type.
`blacklist_types` is a list of obj_type to exclude from the strategy.
If `split_content` is True, generates Content and SkippedContent under different
obj_type, resp. "content" and "skipped_content".
"""
strategies: List[
Tuple[ModelObjectType, Callable[[], SearchStrategy[BaseModel]]]
] = [
(ModelObjectType.ORIGIN, origins),
(ModelObjectType.ORIGIN_VISIT, origin_visits),
(ModelObjectType.ORIGIN_VISIT_STATUS, origin_visit_statuses),
(ModelObjectType.SNAPSHOT, snapshots),
(ModelObjectType.RELEASE, releases),
(ModelObjectType.REVISION, revisions),
(ModelObjectType.DIRECTORY, directories),
(ModelObjectType.RAW_EXTRINSIC_METADATA, raw_extrinsic_metadata),
]
if split_content:
strategies.append((ModelObjectType.CONTENT, present_contents))
strategies.append((ModelObjectType.SKIPPED_CONTENT, skipped_contents))
else:
strategies.append((ModelObjectType.CONTENT, contents))
candidates = [
obj_gen().map(functools.partial(_tuplify, obj_type))
for (obj_type, obj_gen) in strategies
if obj_type not in blacklist_types
]
return one_of(*candidates)
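# A sketch of the intended use in a property-based test (the roundtrip
# assertion is an assumption about the model invariants, not part of this
# module):
#
#     from hypothesis import given
#
#     @given(objects(split_content=True))
#     def test_to_dict_roundtrip(type_and_object):
#         (obj_type, obj) = type_and_object
#         assert obj == type(obj).from_dict(obj.to_dict())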
def object_dicts(
blacklist_types=(ModelObjectType.ORIGIN_VISIT_STATUS,), split_content=False
):
"""generates a random couple (type, dict)
which dict is suitable for <ModelForType>.from_dict() factory methods.
`blacklist_types` is a list of obj_type to exclude from the strategy.
If `split_content` is True, generates Content and SkippedContent under different
obj_type, resp. "content" and "skipped_content".
"""
strategies = [
(ModelObjectType.ORIGIN, origins_d),
(ModelObjectType.ORIGIN_VISIT, origin_visits_d),
(ModelObjectType.ORIGIN_VISIT_STATUS, origin_visit_statuses_d),
(ModelObjectType.SNAPSHOT, snapshots_d),
(ModelObjectType.RELEASE, releases_d),
(ModelObjectType.REVISION, revisions_d),
(ModelObjectType.DIRECTORY, directories_d),
(ModelObjectType.RAW_EXTRINSIC_METADATA, raw_extrinsic_metadata_d),
]
if split_content:
strategies.append((ModelObjectType.CONTENT, present_contents_d))
strategies.append((ModelObjectType.SKIPPED_CONTENT, skipped_contents_d))
else:
strategies.append((ModelObjectType.CONTENT, contents_d))
args = [
obj_gen().map(lambda x, obj_type=obj_type: (obj_type, x))
for (obj_type, obj_gen) in strategies
if obj_type not in blacklist_types
]
return one_of(*args)
# Copyright (C) 2017 The Software Heritage developers
# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Merkle tree data structure"""
import abc
import collections
from typing import List, Optional
def deep_update(left, right):
"""Recursively update the left mapping with deeply nested values from the right
mapping.
This function is useful to merge the results of several calls to
:func:`MerkleNode.collect`.
Arguments:
left: a mapping (modified by the update operation)
right: a mapping
Returns:
the left mapping, updated with nested values from the right mapping
Example:
>>> a = {
... 'key1': {
... 'key2': {
... 'key3': 'value1/2/3',
... },
... },
... }
>>> deep_update(a, {
... 'key1': {
... 'key2': {
... 'key4': 'value1/2/4',
... },
... },
... }) == {
... 'key1': {
... 'key2': {
... 'key3': 'value1/2/3',
... 'key4': 'value1/2/4',
... },
... },
... }
True
>>> deep_update(a, {
... 'key1': {
... 'key2': {
... 'key3': 'newvalue1/2/3',
... },
... },
... }) == {
... 'key1': {
... 'key2': {
... 'key3': 'newvalue1/2/3',
... 'key4': 'value1/2/4',
... },
... },
... }
True
    """
for key, rvalue in right.items():
if isinstance(rvalue, collections.Mapping):
new_lvalue = deep_update(left.get(key, {}), rvalue)
left[key] = new_lvalue
else:
left[key] = rvalue
return left
from __future__ import annotations

import abc
from typing import Any, Dict, Iterator, List, Set
class MerkleNode(dict, metaclass=abc.ABCMeta):
@@ -102,16 +39,18 @@ class MerkleNode(dict, metaclass=abc.ABCMeta):
The collection of updated data from the tree is implemented through the
:func:`collect` function and associated helpers.
Attributes:
data (dict): data associated to the current node
parents (list): known parents of the current node
collected (bool): whether the current node has been collected
"""
__slots__ = ['parents', 'data', '__hash', 'collected']
type = None # type: Optional[str] # TODO: make this an enum
"""Type of the current node (used as a classifier for :func:`collect`)"""
__slots__ = ["parents", "data", "__hash", "collected"]
data: Dict
"""data associated to the current node"""
parents: List
"""known parents of the current node"""
collected: bool
"""whether the current node has been collected"""
def __init__(self, data=None):
super().__init__()
@@ -120,6 +59,16 @@ class MerkleNode(dict, metaclass=abc.ABCMeta):
self.__hash = None
self.collected = False
def __eq__(self, other):
return (
isinstance(other, MerkleNode)
and super().__eq__(other)
and self.data == other.data
)
def __ne__(self, other):
return not self.__eq__(other)
def invalidate_hash(self):
"""Invalidate the cached hash of the current node."""
if not self.__hash:
@@ -130,7 +79,7 @@ class MerkleNode(dict, metaclass=abc.ABCMeta):
for parent in self.parents:
parent.invalidate_hash()
def update_hash(self, *, force=False):
def update_hash(self, *, force=False) -> Any:
"""Recursively compute the hash of the current node.
Args:
@@ -150,20 +99,23 @@ class MerkleNode(dict, metaclass=abc.ABCMeta):
return self.__hash
@property
def hash(self):
def hash(self) -> Any:
"""The hash of the current node, as calculated by
:func:`compute_hash`.
"""
return self.update_hash()
def __hash__(self):
return hash(self.hash)
@abc.abstractmethod
def compute_hash(self):
def compute_hash(self) -> Any:
"""Compute the hash of the current node.
The hash should depend on the data of the node, as well as on hashes
of the children nodes.
"""
raise NotImplementedError('Must implement compute_hash method')
raise NotImplementedError("Must implement compute_hash method")
def __setitem__(self, name, new_child):
"""Add a child, invalidating the current hash"""
@@ -212,47 +164,24 @@ class MerkleNode(dict, metaclass=abc.ABCMeta):
"""
return self.data
def collect_node(self, **kwargs):
"""Collect the data for the current node, for use by :func:`collect`.
Arguments:
kwargs: passed as-is to :func:`get_data`.
Returns:
A :class:`dict` compatible with :func:`collect`.
"""
def collect_node(self) -> Set[MerkleNode]:
"""Collect the current node if it has not been yet, for use by :func:`collect`."""
if not self.collected:
self.collected = True
return {self.type: {self.hash: self.get_data(**kwargs)}}
return {self}
else:
            return {}
            return set()

    def collect(self, **kwargs):
        """Collect the data for all nodes in the subtree rooted at `self`.
The data is deduplicated by type and by hash.
Arguments:
kwargs: passed as-is to :func:`get_data`.
def collect(self) -> Set[MerkleNode]:
"""Collect the added and modified nodes in the subtree rooted at `self`
since the last collect operation.
Returns:
A :class:`dict` with the following structure::
{
'typeA': {
node1.hash: node1.get_data(),
node2.hash: node2.get_data(),
},
'typeB': {
node3.hash: node3.get_data(),
...
},
...
}
A :class:`set` of collected nodes
"""
ret = self.collect_node(**kwargs)
ret = self.collect_node()
for child in self.values():
deep_update(ret, child.collect(**kwargs))
ret.update(child.collect())
return ret
@@ -266,23 +195,39 @@ class MerkleNode(dict, metaclass=abc.ABCMeta):
for child in self.values():
child.reset_collect()
def iter_tree(self, dedup=True) -> Iterator[MerkleNode]:
"""Yields all children nodes, recursively. Common nodes are deduplicated
by default (deduplication can be turned off setting the given argument
'dedup' to False).
"""
yield from self._iter_tree(seen=set(), dedup=dedup)
def _iter_tree(self, seen: Set[bytes], dedup) -> Iterator[MerkleNode]:
if self.hash not in seen:
if dedup:
seen.add(self.hash)
yield self
for child in self.values():
yield from child._iter_tree(seen=seen, dedup=dedup)
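# A minimal concrete subclass, as a sketch (the class name and hashing scheme
# are illustrative, not part of the API; hashlib is from the standard
# library). The hash covers the node's own data and its children's hashes, so
# any change propagates up to the root:
#
#     class _DictNode(MerkleNode):
#         object_type = "dict_node"
#
#         def compute_hash(self):
#             h = hashlib.sha1(repr(sorted(self.data.items())).encode())
#             for name in sorted(self):
#                 h.update(self[name].hash)
#             return h.digest()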
class MerkleLeaf(MerkleNode):
"""A leaf to a Merkle tree.
A Merkle leaf is simply a Merkle node with children disabled.
"""
__slots__ = [] # type: List[str]
__slots__: List[str] = []
def __setitem__(self, name, child):
raise ValueError('%s is a leaf' % self.__class__.__name__)
raise ValueError("%s is a leaf" % self.__class__.__name__)
def __getitem__(self, name):
raise ValueError('%s is a leaf' % self.__class__.__name__)
raise ValueError("%s is a leaf" % self.__class__.__name__)
def __delitem__(self, name):
raise ValueError('%s is a leaf' % self.__class__.__name__)
raise ValueError("%s is a leaf" % self.__class__.__name__)
def update(self, new_children):
"""Children update operation. Disabled for leaves."""
raise ValueError('%s is a leaf' % self.__class__.__name__)
raise ValueError("%s is a leaf" % self.__class__.__name__)
# Copyright (C) 2018-2019 The Software Heritage developers
# Copyright (C) 2018-2024 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import datetime
"""
Implementation of Software Heritage's data model
See :ref:`data-model` for an overview of the data model.
The classes defined in this module are immutable
`attrs objects <https://attrs.org/>`__ and enums.
All classes define a ``from_dict`` class method and a ``to_dict``
method to convert between them and msgpack-serializable objects.
"""
from abc import ABCMeta, abstractmethod
from __future__ import annotations
from abc import ABC, abstractmethod
import collections
import datetime
from enum import Enum
from typing import List, Optional, Dict
import hashlib
from typing import (
Any,
Callable,
Dict,
Iterable,
List,
Optional,
Tuple,
Type,
TypeVar,
Union,
)
import warnings
import attr
from attr._make import _AndValidator
from attr.validators import and_
from attrs_strict import AttributeTypeError
import dateutil.parser
import iso8601
from typing_extensions import Final
from . import git_objects
from .collections import ImmutableDict
from .hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytehex, hash_to_hex
from .swhids import CoreSWHID
from .swhids import ExtendedObjectType as SwhidExtendedObjectType
from .swhids import ExtendedSWHID
from .swhids import ObjectType as SwhidObjectType
class MissingData(Exception):
"""Raised by `Content.with_data` when it has no way of fetching the
data (but not when fetching the data fails)."""
pass
KeyType = Union[Dict[str, str], Dict[str, bytes], bytes]
"""The type returned by BaseModel.unique_key()."""
from .identifiers import (
normalize_timestamp, directory_identifier, revision_identifier,
release_identifier, snapshot_identifier
)
from .hashutil import DEFAULT_ALGORITHMS, hash_to_bytes
SHA1_SIZE = 20
_OFFSET_CHARS = frozenset(b"+-0123456789")
# TODO: Limit this to 20 bytes
Sha1Git = bytes
class BaseModel:
Sha1 = bytes
def hash_repr(h: bytes) -> str:
if h is None:
return "None"
else:
return f"hash_to_bytes('{hash_to_hex(h)}')"
def parents_repr(parents: Tuple[Sha1Git, ...]):
return repr(tuple(hash_repr(p) for p in parents)).replace('"', "")
def freeze_optional_dict(
d: Union[None, Dict, ImmutableDict],
) -> Optional[ImmutableDict]:
if isinstance(d, dict):
return ImmutableDict(d)
else:
return d
def dictify(value):
"Helper function used by BaseModel.to_dict()"
if isinstance(value, BaseModel):
return value.to_dict()
elif isinstance(value, (CoreSWHID, ExtendedSWHID)):
return str(value)
elif isinstance(value, Enum):
return value.value
elif isinstance(value, (dict, ImmutableDict)):
return {k: dictify(v) for k, v in value.items()}
elif isinstance(value, tuple):
return tuple(dictify(v) for v in value)
else:
return value
def generic_type_validator(instance, attribute, value):
"""validates the type of an attribute value whatever the attribute type"""
raise NotImplementedError("generic type check should have been optimized")
def _true_validator(instance, attribute, value, expected_type=None, origin_value=None):
pass
def _none_validator(instance, attribute, value, expected_type=None, origin_value=None):
if value is not None:
if origin_value is None:
origin_value = value
raise AttributeTypeError(origin_value, attribute)
def _origin_type_validator(
instance, attribute, value, expected_type=None, origin_value=None
):
# This is functionally equivalent to using just this:
# return isinstance(value, type)
# but using type equality before isinstance allows very quick checks
# when the exact class is used (which is the overwhelming majority of cases)
# while still allowing subclasses to be used.
if expected_type is None:
expected_type = attribute.type
if not (type(value) is expected_type or isinstance(value, expected_type)):
if origin_value is None:
origin_value = value
raise AttributeTypeError(origin_value, attribute)
def _tuple_infinite_validator(
instance,
attribute,
value,
expected_type=None,
origin_value=None,
):
type_ = type(value)
if origin_value is None:
origin_value = value
if type_ != tuple and not isinstance(value, tuple):
raise AttributeTypeError(origin_value, attribute)
if expected_type is None:
expected_type = attribute.type
args = expected_type.__args__
# assert len(args) == 2 and args[1] is Ellipsis
expected_value_type = args[0]
validator = optimized_validator(expected_value_type)
for i in value:
validator(
instance,
attribute,
i,
expected_type=expected_value_type,
origin_value=origin_value,
)
def _tuple_bytes_bytes_validator(
instance,
attribute,
value,
expected_type=None,
origin_value=None,
):
type_ = type(value)
if type_ != tuple and not isinstance(value, tuple):
if origin_value is None:
origin_value = value
raise AttributeTypeError(origin_value, attribute)
if len(value) != 2:
if origin_value is None:
origin_value = value
raise AttributeTypeError(origin_value, attribute)
if type(value[0]) is not bytes or type(value[1]) is not bytes:
if origin_value is None:
origin_value = value
raise AttributeTypeError(origin_value, attribute)
def _tuple_finite_validator(
instance,
attribute,
value,
expected_type=None,
origin_value=None,
):
    # It might be useful to optimise the sub-validator tuple; in practice, we only
    # have [bytes, bytes]
type_ = type(value)
if origin_value is None:
origin_value = value
if type_ != tuple and not isinstance(value, tuple):
raise AttributeTypeError(origin_value, attribute)
if expected_type is None:
expected_type = attribute.type
args = expected_type.__args__
# assert len(args) != 2 or args[1] is Ellipsis
if len(args) != len(value):
raise AttributeTypeError(origin_value, attribute)
for item_type, item in zip(args, value):
validator = optimized_validator(item_type)
validator(
instance,
attribute,
item,
expected_type=item_type,
origin_value=origin_value,
)
def _immutable_dict_validator(
instance,
attribute,
value,
expected_type=None,
origin_value=None,
):
value_type = type(value)
if origin_value is None:
origin_value = value
if value_type != ImmutableDict and not isinstance(value, ImmutableDict):
raise AttributeTypeError(origin_value, attribute)
if expected_type is None:
expected_type = attribute.type
(expected_key_type, expected_value_type) = expected_type.__args__
key_validator = optimized_validator(expected_key_type)
value_validator = optimized_validator(expected_value_type)
for item_key, item_value in value.items():
key_validator(
instance,
attribute,
item_key,
expected_type=expected_key_type,
origin_value=origin_value,
)
value_validator(
instance,
attribute,
item_value,
expected_type=expected_value_type,
origin_value=origin_value,
)
def optimized_validator(type_):
if type_ is object or type_ is Any:
return _true_validator
if type_ is None:
return _none_validator
origin = getattr(type_, "__origin__", None)
# Non-generic type, check it directly
if origin is None:
return _origin_type_validator
# Then, if it's a container, check its items.
if origin is tuple:
args = type_.__args__
if len(args) == 2 and args[1] is Ellipsis:
# Infinite tuple
return _tuple_infinite_validator
elif args == (bytes, bytes):
return _tuple_bytes_bytes_validator
else:
return _tuple_finite_validator
elif origin is Union:
args = type_.__args__
all_validators = tuple((optimized_validator(t), t) for t in args)
def union_validator(
instance,
attribute,
value,
expected_type=None,
origin_value=None,
):
if origin_value is None:
origin_value = value
for validator, type_ in all_validators:
try:
validator(
instance,
attribute,
value,
expected_type=type_,
origin_value=origin_value,
)
except AttributeTypeError:
pass
else:
break
else:
raise AttributeTypeError(origin_value, attribute)
return union_validator
elif origin is ImmutableDict:
return _immutable_dict_validator
# No need to check dict or list. because they are converted to ImmutableDict
# and tuple respectively.
raise NotImplementedError(f"Type-checking {type_}")
def optimize_all_validators(cls, old_fields):
"""process validators to turn them into a faster version … eventually"""
new_fields = []
for f in old_fields:
validator = f.validator
if validator is generic_type_validator:
validator = optimized_validator(f.type)
elif isinstance(validator, _AndValidator):
new_and = []
for v in validator._validators:
if v is generic_type_validator:
v = optimized_validator(f.type)
new_and.append(v)
validator = and_(*new_and)
else:
validator = None
if validator is not None:
f = f.evolve(validator=validator)
new_fields.append(f)
if attr.__version__ < "21.3.0":
# https://github.com/python-attrs/attrs/issues/821
from attr._make import _make_attr_tuple_class
attr_names = [f.name for f in new_fields]
AttrsClass = _make_attr_tuple_class(cls.__name__, attr_names)
return AttrsClass(new_fields)
else:
return new_fields
ModelType = TypeVar("ModelType", bound="BaseModel")
HashableModelType = TypeVar("HashableModelType", bound="BaseHashableModel")
class _StringCompatibleEnum(Enum):
def __eq__(self, other):
# stay compatible with legacy string comparison (for now)
if isinstance(other, str):
warnings.warn(
"Use the enum value instead of string",
category=DeprecationWarning,
stacklevel=2,
)
return self.value == other
return super().__eq__(other)
def __str__(self):
# preserve interpolation property (for now)
return self.value
def __hash__(self):
# make sure we don't confuse dictionary key matching (for now)
return hash(str(self.value))
class ModelObjectType(_StringCompatibleEnum):
"""Possible object types of Model object"""
CONTENT = "content"
DIRECTORY = "directory"
DIRECTORY_ENTRY = "directory_entry"
EXTID = "extid"
METADATA_AUTHORITY = "metadata_authority"
METADATA_FETCHER = "metadata_fetcher"
ORIGIN = "origin"
ORIGIN_VISIT = "origin_visit"
ORIGIN_VISIT_STATUS = "origin_visit_status"
PERSON = "person"
RAW_EXTRINSIC_METADATA = "raw_extrinsic_metadata"
RELEASE = "release"
REVISION = "revision"
SKIPPED_CONTENT = "skipped_content"
SNAPSHOT = "snapshot"
SNAPSHOT_BRANCH = "snapshot_branch"
TIMESTAMP = "timestamp"
TIMESTAMP_WITH_TIMEZONE = "timestamp_with_timezone"
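# A doctest-style sketch of the transitional behaviour (the string comparison
# also emits a DeprecationWarning):
#
#     >>> ModelObjectType.CONTENT == "content"
#     True
#     >>> str(ModelObjectType.SNAPSHOT)
#     'snapshot'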
class BaseModel(ABC):
"""Base class for SWH model classes.
Provides serialization/deserialization to/from Python dictionaries,
that are suitable for JSON/msgpack-like formats."""
__slots__ = ()
@property
@abstractmethod
def object_type(self) -> ModelObjectType:
# Some juggling to please mypy
#
# Note: starting from Python 3.11 we can combine @property with
        # @classmethod which is the real intent here.
raise NotImplementedError
def to_dict(self):
"""Wrapper of `attr.asdict` that can be overridden by subclasses
that have special handling of some of the fields."""
def dictify(value):
if isinstance(value, BaseModel):
return value.to_dict()
elif isinstance(value, Enum):
return value.value
elif isinstance(value, dict):
return {k: dictify(v) for k, v in value.items()}
elif isinstance(value, list):
return [dictify(v) for v in value]
else:
return value
ret = attr.asdict(self, recurse=False)
return dictify(ret)
return dictify(attr.asdict(self, recurse=False))
@classmethod
def from_dict(cls, d):
@@ -55,324 +427,1088 @@ class BaseModel:
recursively builds the corresponding objects."""
return cls(**d)
def evolve(self: ModelType, **kwargs) -> ModelType:
"""Alias to call :func:`attr.evolve` on this object, returning a new object."""
return attr.evolve(self, **kwargs) # type: ignore[misc]
def anonymize(self: ModelType) -> Optional[ModelType]:
"""Returns an anonymized version of the object, if needed.
If the object model does not need/support anonymization, returns None.
"""
return None
def unique_key(self) -> KeyType:
"""Returns a unique key for this object, that can be used for
deduplication."""
raise NotImplementedError(f"unique_key for {self}")
def check(self) -> None:
"""Performs internal consistency checks, and raises an error if one fails."""
# without the type-ignore comment below, attr >= 22.1.0 causes mypy to report:
# Argument 1 has incompatible type "BaseModel"; expected "AttrsInstance"
attr.validate(self) # type: ignore[arg-type]
class HashableObject(metaclass=ABCMeta):
def _compute_hash_from_manifest(manifest: bytes) -> Sha1Git:
return hashlib.new("sha1", manifest).digest()
class BaseHashableModel(BaseModel, ABC):
"""Mixin to automatically compute object identifier hash when
the associated model is instantiated."""
@staticmethod
@abstractmethod
def compute_hash(object_dict):
__slots__ = ()
id: Sha1Git
def compute_hash(self) -> bytes:
"""Derived model classes must implement this to compute
the object hash from its dict representation."""
pass
the object hash.
This method is called by the object initialization if the `id`
attribute is set to an empty value.
"""
return self._compute_hash_from_attributes()
@abstractmethod
def _compute_hash_from_attributes(self) -> Sha1Git:
raise NotImplementedError(f"_compute_hash_from_attributes for {self}")
def __attrs_post_init__(self):
if not self.id:
obj_id = hash_to_bytes(self.compute_hash(self.to_dict()))
object.__setattr__(self, 'id', obj_id)
obj_id = self.compute_hash()
object.__setattr__(self, "id", obj_id)
def evolve(self: HashableModelType, **kwargs) -> HashableModelType:
"""Alias to call :func:`attr.evolve` on this object, returning a new object
with its ``id`` recomputed based on the content."""
if "id" in kwargs:
raise TypeError(
f"{self.__class__.__name__}.evolve recomputes the id itself; "
f"use attr.evolve to change the id manually."
)
obj = attr.evolve(self, **kwargs) # type: ignore[misc]
new_hash = obj.compute_hash()
return attr.evolve(obj, id=new_hash) # type: ignore[misc]
def unique_key(self) -> KeyType:
return self.id
def check(self) -> None:
super().check()
if self.id != self.compute_hash():
raise ValueError("'id' does not match recomputed hash.")
HashableObject = BaseHashableModel # deprecated alias
class HashableObjectWithManifest(BaseHashableModel):
"""Derived class of BaseHashableModel, for objects that may need to store
verbatim git objects as ``raw_manifest`` to preserve original hashes."""
@attr.s(frozen=True)
__slots__ = ()
raw_manifest: Optional[bytes] = None
"""Stores the original content of git objects when they cannot be faithfully
represented using only the other attributes.
This should only be used as a last resort, and only set in the Git loader,
for objects too corrupt to fit the data model."""
def to_dict(self):
d = super().to_dict()
if d["raw_manifest"] is None:
del d["raw_manifest"]
return d
def compute_hash(self) -> bytes:
"""Derived model classes must implement this to compute
the object hash.
This method is called by the object initialization if the `id`
attribute is set to an empty value.
"""
if self.raw_manifest is None:
return super().compute_hash() # calls self._compute_hash_from_attributes()
else:
return _compute_hash_from_manifest(self.raw_manifest)
def check(self) -> None:
super().check()
if (
self.raw_manifest is not None
and self.id == self._compute_hash_from_attributes()
):
raise ValueError(
f"{self} has a non-none raw_manifest attribute, but does not need it."
)
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class Person(BaseModel):
"""Represents the author/committer of a revision or release."""
name = attr.ib(type=bytes)
email = attr.ib(type=bytes)
fullname = attr.ib(type=bytes)
object_type: Final = ModelObjectType.PERSON
fullname = attr.ib(type=bytes, validator=generic_type_validator)
name = attr.ib(type=Optional[bytes], validator=generic_type_validator, eq=False)
email = attr.ib(type=Optional[bytes], validator=generic_type_validator, eq=False)
@classmethod
def from_fullname(cls, fullname: bytes):
"""Returns a Person object, by guessing the name and email from the
fullname, in the `name <email>` format.
The fullname is left unchanged."""
if fullname is None:
raise TypeError("fullname is None.")
name: Optional[bytes]
email: Optional[bytes]
try:
open_bracket = fullname.index(b"<")
except ValueError:
name = fullname
email = None
else:
raw_name = fullname[:open_bracket]
raw_email = fullname[open_bracket + 1 :]
if not raw_name:
name = None
else:
name = raw_name.strip()
try:
close_bracket = raw_email.rindex(b">")
except ValueError:
email = raw_email
else:
email = raw_email[:close_bracket]
return Person(
name=name or None,
email=email or None,
fullname=fullname,
)
def anonymize(self) -> Person:
"""Returns an anonymized version of the Person object.
Anonymization is simply a Person which fullname is the hashed, with unset name
or email.
"""
return Person(
fullname=hashlib.sha256(self.fullname).digest(),
name=None,
email=None,
)
@classmethod
def from_dict(cls, d):
"""
If the fullname is missing, construct a fullname
using the following heuristics: if the name value is None, we return the
email in angle brackets, else, we return the name, a space, and the email
in angle brackets.
"""
if "fullname" not in d:
parts = []
if d["name"] is not None:
parts.append(d["name"])
if d["email"] is not None:
parts.append(b"".join([b"<", d["email"], b">"]))
fullname = b" ".join(parts)
d = {**d, "fullname": fullname}
d = {"name": None, "email": None, **d}
return super().from_dict(d)
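# A doctest-style sketch of the fullname heuristics (illustrative values):
#
#     >>> p = Person.from_fullname(b"Jane Doe <jane@example.org>")
#     >>> (p.name, p.email)
#     (b'Jane Doe', b'jane@example.org')
#     >>> Person.from_dict({"name": b"Jane Doe", "email": b"jane@example.org"}).fullname
#     b'Jane Doe <jane@example.org>'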
class TimestampOverflowException(ValueError):
"""Raised when trying to build :class:`Timestamp` from a timestamp too far in
the past or future"""
@attr.s(frozen=True)
pass
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class Timestamp(BaseModel):
"""Represents a naive timestamp from a VCS."""
object_type: Final = ModelObjectType.TIMESTAMP
seconds = attr.ib(type=int)
microseconds = attr.ib(type=int)
# maximum and minimum values allowed by datetime.datetime.fromtimestamp()
MIN_SECONDS = -62135510961 # 0001-01-02T00:00:00
MAX_SECONDS = 253402297199 # 9999-12-31T23:59:59
MIN_MICROSECONDS = 0
MAX_MICROSECONDS = 10**6 - 1
@seconds.validator
def check_seconds(self, attribute, value):
"""Check that seconds fit in a 64-bits signed integer."""
if not (-2**63 <= value < 2**63):
raise ValueError('Seconds must be a signed 64-bits integer.')
"""Check that ``seconds`` can be stored in all supported mediums
(PostgreSQL/Cassandra/ORC; PostgreSQL being the limiting factor)."""
if value.__class__ is not int:
raise AttributeTypeError(value, attribute)
# common good sense; less strict than the checks below
# if not (-(2**63) <= value < 2**63):
# raise TimestampOverflowException("Seconds must be a signed 64-bits integer.")
# values outside this range do not fit in Python's datetime, so we cannot
# write them to postgresql with psycopg2
if not (self.MIN_SECONDS <= value <= self.MAX_SECONDS):
raise TimestampOverflowException(
f"Seconds must be in [{self.MIN_SECONDS}, {self.MAX_SECONDS}]"
)
@microseconds.validator
def check_microseconds(self, attribute, value):
"""Checks that microseconds are positive and < 1000000."""
if not (0 <= value < 10**6):
raise ValueError('Microseconds must be in [0, 1000000[.')
if value.__class__ is not int:
raise AttributeTypeError(value, attribute)
if not (self.MIN_MICROSECONDS <= value <= self.MAX_MICROSECONDS):
            raise ValueError(
                f"Microseconds must be in [{self.MIN_MICROSECONDS}, {self.MAX_MICROSECONDS}]."
            )
@attr.s(frozen=True)
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class TimestampWithTimezone(BaseModel):
"""Represents a TZ-aware timestamp from a VCS."""
timestamp = attr.ib(type=Timestamp)
offset = attr.ib(type=int)
negative_utc = attr.ib(type=bool)
@offset.validator
def check_offset(self, attribute, value):
"""Checks the offset is a 16-bits signed integer (in theory, it
should always be between -14 and +14 hours)."""
if not (-2**15 <= value < 2**15):
# max 14 hours offset in theory, but you never know what
# you'll find in the wild...
raise ValueError('offset too large: %d minutes' % value)
object_type: Final = ModelObjectType.TIMESTAMP_WITH_TIMEZONE
timestamp = attr.ib(type=Timestamp, validator=generic_type_validator)
offset_bytes = attr.ib(type=bytes, validator=generic_type_validator)
"""Raw git representation of the timezone, as an offset from UTC.
It should follow this format: ``+HHMM`` or ``-HHMM`` (including ``+0000`` and
``-0000``).
However, when created from git objects, it must be the exact bytes used in the
original objects, so it may differ from this format when they do.
"""
@classmethod
def from_dict(cls, d):
def from_numeric_offset(
cls, timestamp: Timestamp, offset: int, negative_utc: bool
) -> TimestampWithTimezone:
"""Returns a :class:`TimestampWithTimezone` instance from the old dictionary
format (with ``offset`` and ``negative_utc`` instead of ``offset_bytes``).
"""
negative = offset < 0 or negative_utc
(hours, minutes) = divmod(abs(offset), 60)
offset_bytes = f"{'-' if negative else '+'}{hours:02}{minutes:02}".encode()
tstz = TimestampWithTimezone(timestamp=timestamp, offset_bytes=offset_bytes)
assert tstz.offset_minutes() == offset, (tstz.offset_minutes(), offset)
return tstz
@classmethod
def from_dict(
cls, time_representation: Union[Dict, datetime.datetime, int]
) -> TimestampWithTimezone:
"""Builds a TimestampWithTimezone from any of the formats
accepted by :func:`swh.model.normalize_timestamp`."""
d = normalize_timestamp(d)
return cls(
timestamp=Timestamp.from_dict(d['timestamp']),
offset=d['offset'],
negative_utc=d['negative_utc'])
        # TODO: this accepts way more types than just dicts; find a better
        # name
if isinstance(time_representation, dict):
ts = time_representation["timestamp"]
if isinstance(ts, dict):
seconds = ts.get("seconds", 0)
microseconds = ts.get("microseconds", 0)
elif isinstance(ts, int):
seconds = ts
microseconds = 0
else:
raise ValueError(
f"TimestampWithTimezone.from_dict received non-integer timestamp "
f"member {ts!r}"
)
timestamp = Timestamp(seconds=seconds, microseconds=microseconds)
if "offset_bytes" in time_representation:
return cls(
timestamp=timestamp,
offset_bytes=time_representation["offset_bytes"],
)
else:
# old format
offset = time_representation["offset"]
negative_utc = time_representation.get("negative_utc") or False
return cls.from_numeric_offset(timestamp, offset, negative_utc)
elif isinstance(time_representation, datetime.datetime):
# TODO: warn when using from_dict() on a datetime
utcoffset = time_representation.utcoffset()
time_representation = time_representation.astimezone(datetime.timezone.utc)
microseconds = time_representation.microsecond
if microseconds:
time_representation = time_representation.replace(microsecond=0)
seconds = int(time_representation.timestamp())
if utcoffset is None:
raise ValueError(
f"TimestampWithTimezone.from_dict received datetime without "
f"timezone: {time_representation}"
)
# utcoffset is an integer number of minutes
seconds_offset = utcoffset.total_seconds()
offset = int(seconds_offset) // 60
# TODO: warn if remainder is not zero
return cls.from_numeric_offset(
Timestamp(seconds=seconds, microseconds=microseconds), offset, False
)
elif isinstance(time_representation, int):
# TODO: warn when using from_dict() on an int
seconds = time_representation
timestamp = Timestamp(seconds=time_representation, microseconds=0)
return cls(timestamp=timestamp, offset_bytes=b"+0000")
else:
raise ValueError(
f"TimestampWithTimezone.from_dict received non-integer timestamp: "
f"{time_representation!r}"
)
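    # A doctest-style sketch of the accepted input formats (illustrative
    # values):
    #
    #     >>> TimestampWithTimezone.from_dict(1640995200).offset_bytes
    #     b'+0000'
    #     >>> TimestampWithTimezone.from_dict(
    #     ...     {"timestamp": {"seconds": 1640995200}, "offset": 120,
    #     ...      "negative_utc": False}
    #     ... ).offset_bytes
    #     b'+0200'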
@classmethod
def from_datetime(cls, dt: datetime.datetime) -> TimestampWithTimezone:
return cls.from_dict(dt)
def to_datetime(self) -> datetime.datetime:
"""Convert to a datetime (with a timezone set to the recorded fixed UTC offset)
Beware that this conversion can be lossy: ``-0000`` and 'weird' offsets
cannot be represented. Also note that it may fail due to type overflow.
"""
td = datetime.timedelta(minutes=self.offset_minutes())
try:
tz = datetime.timezone(td)
except ValueError:
# Larger or smaller than 24h, so it's bogus. self.timestamp.seconds is
# a number of seconds since Epoch, so it's safe to ignore the timezone
# and replace it with any other one. We arbitrarily pick UTC.
tz = datetime.timezone.utc
timestamp = datetime.datetime.fromtimestamp(self.timestamp.seconds, tz)
timestamp = timestamp.replace(microsecond=self.timestamp.microseconds)
return timestamp
@classmethod
def from_iso8601(cls, s):
"""Builds a TimestampWithTimezone from an ISO8601-formatted string."""
dt = iso8601.parse_date(s)
tstz = cls.from_datetime(dt)
if dt.tzname() == "-00:00":
assert tstz.offset_bytes == b"+0000"
tstz = attr.evolve(tstz, offset_bytes=b"-0000")
return tstz
@staticmethod
def _parse_offset_bytes(offset_bytes: bytes) -> int:
"""Parses an ``offset_bytes`` value (in Git's ``[+-]HHMM`` format),
and returns the corresponding numeric values (in number of minutes).
Tries to account for some mistakes in the format, to support incorrect
Git implementations.
>>> TimestampWithTimezone._parse_offset_bytes(b"+0000")
0
>>> TimestampWithTimezone._parse_offset_bytes(b"-0000")
0
>>> TimestampWithTimezone._parse_offset_bytes(b"+0200")
120
>>> TimestampWithTimezone._parse_offset_bytes(b"-0200")
-120
>>> TimestampWithTimezone._parse_offset_bytes(b"+200")
120
>>> TimestampWithTimezone._parse_offset_bytes(b"-200")
-120
>>> TimestampWithTimezone._parse_offset_bytes(b"+02")
120
>>> TimestampWithTimezone._parse_offset_bytes(b"-02")
-120
>>> TimestampWithTimezone._parse_offset_bytes(b"+0010")
10
>>> TimestampWithTimezone._parse_offset_bytes(b"-0010")
-10
>>> TimestampWithTimezone._parse_offset_bytes(b"+200000000000000000")
0
>>> TimestampWithTimezone._parse_offset_bytes(b"+0160") # 60 minutes...
0
"""
offset_str = offset_bytes.decode()
assert offset_str[0] in "+-"
sign = int(offset_str[0] + "1")
if len(offset_str) <= 3:
hours = int(offset_str[1:])
minutes = 0
else:
hours = int(offset_str[1:-2])
minutes = int(offset_str[-2:])
        offset = sign * (hours * 60 + minutes)
        if (0 <= minutes <= 59) and (-(2**15) <= offset < 2**15):
            return offset
        else:
            # can't parse it to a reasonable value; give up and pretend it's UTC.
            return 0

    def offset_minutes(self):
        """Returns the offset, as a number of minutes since UTC.
        >>> TimestampWithTimezone(
        ...     Timestamp(seconds=1642765364, microseconds=0), offset_bytes=b"+0000"
        ... ).offset_minutes()
        0
        >>> TimestampWithTimezone(
        ...     Timestamp(seconds=1642765364, microseconds=0), offset_bytes=b"+0200"
        ... ).offset_minutes()
        120
        >>> TimestampWithTimezone(
        ...     Timestamp(seconds=1642765364, microseconds=0), offset_bytes=b"-0200"
        ... ).offset_minutes()
        -120
        >>> TimestampWithTimezone(
        ...     Timestamp(seconds=1642765364, microseconds=0), offset_bytes=b"+0530"
        ... ).offset_minutes()
        330
        """
        return self._parse_offset_bytes(self.offset_bytes)

@attr.s(frozen=True)
class Origin(BaseModel):
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class Origin(BaseHashableModel):
"""Represents a software source: a VCS and an URL."""
url = attr.ib(type=str)
type = attr.ib(type=Optional[str], default=None)
def to_dict(self):
r = super().to_dict()
r.pop('type', None)
return r
object_type: Final = ModelObjectType.ORIGIN
url = attr.ib(type=str, validator=generic_type_validator)
id = attr.ib(type=Sha1Git, validator=generic_type_validator, default=b"")
def unique_key(self) -> KeyType:
return {"url": self.url}
@attr.s(frozen=True)
def _compute_hash_from_attributes(self) -> bytes:
return _compute_hash_from_manifest(self.url.encode("utf-8"))
def swhid(self) -> ExtendedSWHID:
"""Returns a SWHID representing this origin."""
return ExtendedSWHID(
object_type=SwhidExtendedObjectType.ORIGIN,
object_id=self.id,
)
@url.validator
def check_url(self, attribute, value):
if len(value.encode()) >= 2048:
# Rationale for this value:
# 1. Needs to be stored in a postgresql btree, which is limited to
# somewhere around 2700 bytes
# 2. URLs longer than 2048 characters won't work very well in browsers,
# and repository URLs are often meant to at least display something
# when opened in a browser. https://stackoverflow.com/a/417184/539465
# 3. Even though this field is actually an IRI, it is usually in ASCII
# so this should be a good-enough approximation
raise ValueError("Origin URL is too long")
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class OriginVisit(BaseModel):
"""Represents a visit of an origin at a given point in time, by a
"""Represents an origin visit with a given type at a given point in time, by a
SWH loader."""
origin = attr.ib(type=str)
object_type: Final = ModelObjectType.ORIGIN_VISIT
origin = attr.ib(type=str, validator=generic_type_validator)
date = attr.ib(type=datetime.datetime)
status = attr.ib(
type=str,
validator=attr.validators.in_(['ongoing', 'full', 'partial']))
type = attr.ib(type=str)
snapshot = attr.ib(type=Optional[Sha1Git])
metadata = attr.ib(type=Optional[Dict[str, object]],
default=None)
visit = attr.ib(type=Optional[int],
default=None)
type = attr.ib(type=str, validator=generic_type_validator)
"""Should not be set before calling 'origin_visit_add()'."""
visit = attr.ib(type=Optional[int], validator=generic_type_validator, default=None)
@date.validator
def check_date(self, attribute, value):
"""Checks the date has a timezone."""
if value.__class__ is not datetime.datetime:
raise AttributeTypeError(value, attribute)
if value is not None and value.tzinfo is None:
raise ValueError("date must be a timezone-aware datetime.")
def to_dict(self):
"""Serializes the date as a string and omits the visit id if it is
`None`."""
ov = super().to_dict()
if ov['visit'] is None:
del ov['visit']
if ov["visit"] is None:
del ov["visit"]
return ov
@classmethod
def from_dict(cls, d):
"""Parses the date from a string, and accepts missing visit ids."""
d = d.copy()
date = d.pop('date')
return cls(
date=(date
if isinstance(date, datetime.datetime)
else dateutil.parser.parse(date)),
**d)
def unique_key(self) -> KeyType:
return {"origin": self.origin, "date": str(self.date)}
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class OriginVisitStatus(BaseModel):
"""Represents a visit update of an origin at a given point in time."""
object_type: Final = ModelObjectType.ORIGIN_VISIT_STATUS
origin = attr.ib(type=str, validator=generic_type_validator)
visit = attr.ib(type=int, validator=generic_type_validator)
date = attr.ib(type=datetime.datetime)
status = attr.ib(
type=str,
validator=attr.validators.in_(
["created", "ongoing", "full", "partial", "not_found", "failed"]
),
)
snapshot = attr.ib(
type=Optional[Sha1Git], validator=generic_type_validator, repr=hash_repr
)
    # Type is optional to be able to use it before adding it to the database model
type = attr.ib(type=Optional[str], validator=generic_type_validator, default=None)
metadata = attr.ib(
type=Optional[ImmutableDict[str, object]],
validator=generic_type_validator,
converter=freeze_optional_dict,
default=None,
)
@date.validator
def check_date(self, attribute, value):
"""Checks the date has a timezone."""
if value.__class__ is not datetime.datetime:
raise AttributeTypeError(value, attribute)
if value is not None and value.tzinfo is None:
raise ValueError("date must be a timezone-aware datetime.")
def unique_key(self) -> KeyType:
return {"origin": self.origin, "visit": str(self.visit), "date": str(self.date)}
def origin_swhid(self) -> ExtendedSWHID:
return Origin(url=self.origin).swhid()
def snapshot_swhid(self) -> Optional[CoreSWHID]:
if self.snapshot is None:
return None
return CoreSWHID(object_type=SwhidObjectType.SNAPSHOT, object_id=self.snapshot)
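# A doctest-style sketch of the derived SWHIDs (the zero hash is only a
# placeholder):
#
#     >>> from datetime import datetime, timezone
#     >>> ovs = OriginVisitStatus(origin="https://example.org/repo", visit=1,
#     ...     date=datetime(2020, 1, 1, tzinfo=timezone.utc), status="full",
#     ...     snapshot=bytes(20))
#     >>> str(ovs.snapshot_swhid())
#     'swh:1:snp:0000000000000000000000000000000000000000'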
class TargetType(Enum):
class SnapshotTargetType(Enum):
"""The type of content pointed to by a snapshot branch. Usually a
revision or an alias."""
CONTENT = 'content'
DIRECTORY = 'directory'
REVISION = 'revision'
RELEASE = 'release'
SNAPSHOT = 'snapshot'
ALIAS = 'alias'
CONTENT = "content"
DIRECTORY = "directory"
REVISION = "revision"
RELEASE = "release"
SNAPSHOT = "snapshot"
ALIAS = "alias"
def __repr__(self):
return f"SnapshotTargetType.{self.name}"
# Remove this compatibility trick once all users have been migrated.
#
# We cannot use @deprecated as this would modify SnapshotTargetType directly
TargetType = SnapshotTargetType
class ObjectType(Enum):
class ReleaseTargetType(Enum):
"""The type of content pointed to by a release. Usually a revision"""
CONTENT = 'content'
DIRECTORY = 'directory'
REVISION = 'revision'
RELEASE = 'release'
SNAPSHOT = 'snapshot'
CONTENT = "content"
DIRECTORY = "directory"
REVISION = "revision"
RELEASE = "release"
SNAPSHOT = "snapshot"
@attr.s(frozen=True)
def __repr__(self):
return f"ReleaseTargetType.{self.name}"
# Remove this compatibility trick once all users have been migrated.
#
# We cannot use @deprecated as this would modify ReleaseTargetType directly
ObjectType = ReleaseTargetType
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class SnapshotBranch(BaseModel):
"""Represents one of the branches of a snapshot."""
target = attr.ib(type=bytes)
target_type = attr.ib(type=TargetType)
object_type: Final = ModelObjectType.SNAPSHOT_BRANCH
target = attr.ib(type=bytes, repr=hash_repr)
target_type = attr.ib(type=SnapshotTargetType, validator=generic_type_validator)
@target.validator
def check_target(self, attribute, value):
"""Checks the target type is not an alias, checks the target is a
valid sha1_git."""
if self.target_type != TargetType.ALIAS and self.target is not None:
if value.__class__ is not bytes:
raise AttributeTypeError(value, attribute)
if self.target_type != SnapshotTargetType.ALIAS and self.target is not None:
if len(value) != 20:
raise ValueError('Wrong length for bytes identifier: %d' %
len(value))
raise ValueError("Wrong length for bytes identifier: %d" % len(value))
@classmethod
def from_dict(cls, d):
return cls(
target=d['target'],
target_type=TargetType(d['target_type']))
return cls(target=d["target"], target_type=SnapshotTargetType(d["target_type"]))
def swhid(self) -> Optional[CoreSWHID]:
"""Returns a SWHID for the current branch or None if the branch has no
target or is an alias."""
if self.target is None or self.target_type == SnapshotTargetType.ALIAS:
return None
return CoreSWHID(
object_id=self.target, object_type=SwhidObjectType[self.target_type.name]
)
@attr.s(frozen=True)
class Snapshot(BaseModel, HashableObject):
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class Snapshot(BaseHashableModel):
"""Represents the full state of an origin at a given point in time."""
branches = attr.ib(type=Dict[bytes, Optional[SnapshotBranch]])
id = attr.ib(type=Sha1Git, default=b'')
@staticmethod
def compute_hash(object_dict):
return snapshot_identifier(object_dict)
object_type: Final = ModelObjectType.SNAPSHOT
branches = attr.ib(
type=ImmutableDict[bytes, Optional[SnapshotBranch]],
validator=generic_type_validator,
converter=freeze_optional_dict,
)
id = attr.ib(
type=Sha1Git, validator=generic_type_validator, default=b"", repr=hash_repr
)
def _compute_hash_from_attributes(self) -> bytes:
return _compute_hash_from_manifest(
git_objects.snapshot_git_object(self, ignore_unresolved=True)
)
@classmethod
def from_dict(cls, d):
d = d.copy()
return cls(
branches={
name: SnapshotBranch.from_dict(branch) if branch else None
for (name, branch) in d.pop('branches').items()
},
**d)
@attr.s(frozen=True)
class Release(BaseModel, HashableObject):
name = attr.ib(type=bytes)
message = attr.ib(type=bytes)
target = attr.ib(type=Optional[Sha1Git])
target_type = attr.ib(type=ObjectType)
synthetic = attr.ib(type=bool)
author = attr.ib(type=Optional[Person],
default=None)
date = attr.ib(type=Optional[TimestampWithTimezone],
default=None)
metadata = attr.ib(type=Optional[Dict[str, object]],
default=None)
id = attr.ib(type=Sha1Git, default=b'')
@staticmethod
def compute_hash(object_dict):
return release_identifier(object_dict)
branches=ImmutableDict(
(name, SnapshotBranch.from_dict(branch) if branch else None)
for (name, branch) in d.pop("branches").items()
),
**d,
)
def swhid(self) -> CoreSWHID:
"""Returns a SWHID representing this object."""
return CoreSWHID(object_type=SwhidObjectType.SNAPSHOT, object_id=self.id)
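# A doctest-style sketch: even an empty snapshot gets a 20-byte intrinsic id,
# computed from its git manifest:
#
#     >>> snap = Snapshot(branches={})
#     >>> len(snap.id)
#     20
#     >>> snap.swhid().object_type.name
#     'SNAPSHOT'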
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class Release(HashableObjectWithManifest, BaseModel):
object_type: Final = ModelObjectType.RELEASE
name = attr.ib(type=bytes, validator=generic_type_validator)
message = attr.ib(type=Optional[bytes], validator=generic_type_validator)
target = attr.ib(
type=Optional[Sha1Git], validator=generic_type_validator, repr=hash_repr
)
target_type = attr.ib(type=ReleaseTargetType, validator=generic_type_validator)
synthetic = attr.ib(type=bool, validator=generic_type_validator)
author = attr.ib(
type=Optional[Person], validator=generic_type_validator, default=None
)
date = attr.ib(
type=Optional[TimestampWithTimezone],
validator=generic_type_validator,
default=None,
)
metadata = attr.ib(
type=Optional[ImmutableDict[str, object]],
validator=generic_type_validator,
converter=freeze_optional_dict,
default=None,
)
id = attr.ib(
type=Sha1Git, validator=generic_type_validator, default=b"", repr=hash_repr
)
raw_manifest = attr.ib(type=Optional[bytes], default=None)
def _compute_hash_from_attributes(self) -> bytes:
return _compute_hash_from_manifest(git_objects.release_git_object(self))
@author.validator
def check_author(self, attribute, value):
"""If the author is `None`, checks the date is `None` too."""
if self.author is None and self.date is not None:
raise ValueError('release date must be None if author is None.')
raise ValueError("release date must be None if author is None.")
def to_dict(self):
rel = super().to_dict()
if rel['metadata'] is None:
del rel['metadata']
if rel["metadata"] is None:
del rel["metadata"]
return rel
@classmethod
def from_dict(cls, d):
d = d.copy()
if d.get('author'):
d['author'] = Person.from_dict(d['author'])
if d.get('date'):
d['date'] = TimestampWithTimezone.from_dict(d['date'])
return cls(
target_type=ObjectType(d.pop('target_type')),
**d)
if d.get("author"):
d["author"] = Person.from_dict(d["author"])
if d.get("date"):
d["date"] = TimestampWithTimezone.from_dict(d["date"])
return cls(target_type=ReleaseTargetType(d.pop("target_type")), **d)
def swhid(self) -> CoreSWHID:
"""Returns a SWHID representing this object."""
return CoreSWHID(object_type=SwhidObjectType.RELEASE, object_id=self.id)
def target_swhid(self) -> Optional[CoreSWHID]:
"""Returns the SWHID for the target of this release or None if unset."""
if self.target is None:
return None
return CoreSWHID(
object_id=self.target, object_type=SwhidObjectType[self.target_type.name]
)
def anonymize(self) -> Release:
"""Returns an anonymized version of the Release object.
Anonymization consists in replacing the author with an anonymized Person object.
"""
author = self.author and self.author.anonymize()
return attr.evolve(self, author=author)
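# A doctest-style sketch of the anonymization invariant (illustrative values):
#
#     >>> rel = Release(name=b"v1.0", message=b"msg", target=bytes(20),
#     ...               target_type=ReleaseTargetType.REVISION, synthetic=False,
#     ...               author=Person.from_fullname(b"Jane <j@example.org>"))
#     >>> anon = rel.anonymize()
#     >>> anon.author.fullname == hashlib.sha256(b"Jane <j@example.org>").digest()
#     True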
class RevisionType(Enum):
GIT = 'git'
TAR = 'tar'
DSC = 'dsc'
SUBVERSION = 'svn'
MERCURIAL = 'hg'
@attr.s(frozen=True)
class Revision(BaseModel, HashableObject):
message = attr.ib(type=bytes)
author = attr.ib(type=Person)
committer = attr.ib(type=Person)
date = attr.ib(type=Optional[TimestampWithTimezone])
committer_date = attr.ib(type=Optional[TimestampWithTimezone])
type = attr.ib(type=RevisionType)
directory = attr.ib(type=Sha1Git)
synthetic = attr.ib(type=bool)
metadata = attr.ib(type=Optional[Dict[str, object]],
default=None)
parents = attr.ib(type=List[Sha1Git],
default=attr.Factory(list))
id = attr.ib(type=Sha1Git, default=b'')
GIT = "git"
TAR = "tar"
DSC = "dsc"
SUBVERSION = "svn"
MERCURIAL = "hg"
CVS = "cvs"
BAZAAR = "bzr"
def __repr__(self):
return f"RevisionType.{self.name}"
def tuplify_extra_headers(value: Iterable):
return tuple((k, v) for k, v in value)
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class Revision(HashableObjectWithManifest, BaseModel):
object_type: Final = ModelObjectType.REVISION
message = attr.ib(type=Optional[bytes], validator=generic_type_validator)
author = attr.ib(type=Optional[Person], validator=generic_type_validator)
committer = attr.ib(type=Optional[Person], validator=generic_type_validator)
date = attr.ib(
type=Optional[TimestampWithTimezone], validator=generic_type_validator
)
committer_date = attr.ib(
type=Optional[TimestampWithTimezone], validator=generic_type_validator
)
type = attr.ib(type=RevisionType, validator=generic_type_validator)
directory = attr.ib(type=Sha1Git, validator=generic_type_validator, repr=hash_repr)
synthetic = attr.ib(type=bool, validator=generic_type_validator)
metadata = attr.ib(
type=Optional[ImmutableDict[str, object]],
validator=generic_type_validator,
converter=freeze_optional_dict,
default=None,
)
parents = attr.ib(
type=Tuple[Sha1Git, ...],
validator=generic_type_validator,
default=(),
repr=parents_repr,
)
id = attr.ib(
type=Sha1Git, validator=generic_type_validator, default=b"", repr=hash_repr
)
extra_headers = attr.ib(
type=Tuple[Tuple[bytes, bytes], ...],
validator=generic_type_validator,
converter=tuplify_extra_headers,
default=(),
)
raw_manifest = attr.ib(type=Optional[bytes], default=None)
@staticmethod
def compute_hash(object_dict):
return revision_identifier(object_dict)
def __attrs_post_init__(self):
super().__attrs_post_init__()
# ensure metadata is a deep copy of whatever was given, and if needed
# extract extra_headers from there
if self.metadata:
metadata = self.metadata
if not self.extra_headers and "extra_headers" in metadata:
(extra_headers, metadata) = metadata.copy_pop("extra_headers")
object.__setattr__(
self,
"extra_headers",
tuplify_extra_headers(extra_headers),
)
attr.validate(self)
object.__setattr__(self, "metadata", metadata)
def _compute_hash_from_attributes(self) -> bytes:
return _compute_hash_from_manifest(git_objects.revision_git_object(self))
@author.validator
def check_author(self, attribute, value):
"""If the author is `None`, checks the date is `None` too."""
if self.author is None and self.date is not None:
raise ValueError("revision date must be None if author is None.")
@committer.validator
def check_committer(self, attribute, value):
"""If the committer is `None`, checks the committer_date is `None` too."""
if self.committer is None and self.committer_date is not None:
raise ValueError(
"revision committer_date must be None if committer is None."
)
@classmethod
def from_dict(cls, d):
d = d.copy()
date = d.pop('date')
date = d.pop("date")
if date:
date = TimestampWithTimezone.from_dict(date)
committer_date = d.pop('committer_date')
committer_date = d.pop("committer_date")
if committer_date:
committer_date = TimestampWithTimezone.from_dict(
committer_date)
committer_date = TimestampWithTimezone.from_dict(committer_date)
author = d.pop("author")
if author:
author = Person.from_dict(author)
committer = d.pop("committer")
if committer:
committer = Person.from_dict(committer)
return cls(
author=Person.from_dict(d.pop('author')),
committer=Person.from_dict(d.pop('committer')),
author=author,
committer=committer,
date=date,
committer_date=committer_date,
type=RevisionType(d.pop('type')),
**d)
type=RevisionType(d.pop("type")),
parents=tuple(d.pop("parents")), # for BW compat
**d,
)
def swhid(self) -> CoreSWHID:
"""Returns a SWHID representing this object."""
return CoreSWHID(object_type=SwhidObjectType.REVISION, object_id=self.id)
@attr.s(frozen=True)
def directory_swhid(self) -> CoreSWHID:
"""Returns the SWHID for the directory referenced by the revision."""
return CoreSWHID(
object_type=SwhidObjectType.DIRECTORY, object_id=self.directory
)
def parent_swhids(self) -> List[CoreSWHID]:
"""Returns a list of SWHID for the parent revisions."""
return [
CoreSWHID(object_type=SwhidObjectType.REVISION, object_id=parent)
for parent in self.parents
]
def anonymize(self) -> Revision:
"""Returns an anonymized version of the Revision object.
Anonymization consists in replacing the author and committer with an anonymized
Person object.
"""
return attr.evolve(
self,
author=None if self.author is None else self.author.anonymize(),
committer=None if self.committer is None else self.committer.anonymize(),
)
_DIR_ENTRY_TYPES = ["file", "dir", "rev"]
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class DirectoryEntry(BaseModel):
object_type: Final = ModelObjectType.DIRECTORY_ENTRY
name = attr.ib(type=bytes)
type = attr.ib(type=str,
validator=attr.validators.in_(['file', 'dir', 'rev']))
target = attr.ib(type=Sha1Git)
perms = attr.ib(type=int)
type = attr.ib(type=str, validator=attr.validators.in_(_DIR_ENTRY_TYPES))
target = attr.ib(type=Sha1Git, validator=generic_type_validator, repr=hash_repr)
perms = attr.ib(type=int, validator=generic_type_validator, converter=int, repr=oct)
"""Usually one of the values of `swh.model.from_disk.DentryPerms`."""
@attr.s(frozen=True)
class Directory(BaseModel, HashableObject):
entries = attr.ib(type=List[DirectoryEntry])
id = attr.ib(type=Sha1Git, default=b'')
@staticmethod
def compute_hash(object_dict):
return directory_identifier(object_dict)
DIR_ENTRY_TYPE_TO_SWHID_OBJECT_TYPE = {
"file": SwhidObjectType.CONTENT,
"dir": SwhidObjectType.DIRECTORY,
"rev": SwhidObjectType.REVISION,
}
@name.validator
def check_name(self, attribute, value):
if value.__class__ is not bytes:
raise AttributeTypeError(value, attribute)
if b"/" in value:
raise ValueError(f"{value!r} is not a valid directory entry name.")
def swhid(self) -> CoreSWHID:
"""Returns a SWHID for this directory entry"""
return CoreSWHID(
object_type=DirectoryEntry.DIR_ENTRY_TYPE_TO_SWHID_OBJECT_TYPE[self.type],
object_id=self.target,
)
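# A doctest-style sketch of per-entry SWHIDs (the zero hash is only a
# placeholder):
#
#     >>> entry = DirectoryEntry(name=b"src", type="dir", target=bytes(20),
#     ...                        perms=0o040000)
#     >>> str(entry.swhid())
#     'swh:1:dir:0000000000000000000000000000000000000000'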
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class Directory(HashableObjectWithManifest, BaseModel):
object_type: Final = ModelObjectType.DIRECTORY
entries = attr.ib(type=Tuple[DirectoryEntry, ...], validator=generic_type_validator)
id = attr.ib(
type=Sha1Git, validator=generic_type_validator, default=b"", repr=hash_repr
)
raw_manifest = attr.ib(type=Optional[bytes], default=None)
def _compute_hash_from_attributes(self) -> bytes:
return _compute_hash_from_manifest(git_objects.directory_git_object(self))
@entries.validator
def check_entries(self, attribute, value):
seen = set()
for entry in value:
if entry.name in seen:
# Cannot use self.swhid() here, self.id may be None
raise ValueError(
f"swh:1:dir:{hash_to_hex(self.id)} has duplicated entry name: "
f"{entry.name!r}"
)
seen.add(entry.name)
@classmethod
def from_dict(cls, d):
d = d.copy()
return cls(
entries=[DirectoryEntry.from_dict(entry)
for entry in d.pop('entries')],
**d)
entries=tuple(
DirectoryEntry.from_dict(entry) for entry in d.pop("entries")
),
**d,
)
def swhid(self) -> CoreSWHID:
"""Returns a SWHID representing this object."""
return CoreSWHID(object_type=SwhidObjectType.DIRECTORY, object_id=self.id)
@attr.s(frozen=True)
class BaseContent(BaseModel):
def to_dict(self):
content = super().to_dict()
if content['ctime'] is None:
del content['ctime']
return content
@classmethod
def from_possibly_duplicated_entries(
cls,
*,
entries: Tuple[DirectoryEntry, ...],
id: Sha1Git = b"",
raw_manifest: Optional[bytes] = None,
) -> Tuple[bool, "Directory"]:
"""Constructs a ``Directory`` object from a list of entries that may contain
duplicated names.
This is required to represent legacy objects, that were ingested in the
storage database before this check was added.
As it is impossible for a ``Directory`` instances to have more than one entry
with a given names, this function computes a ``raw_manifest`` and renames one of
the entries before constructing the ``Directory``.
Returns:
``(is_corrupt, directory)`` where ``is_corrupt`` is True iff some
entry names were indeed duplicated
"""
# First, try building a Directory object normally without any extra computation,
# which works the overwhelming majority of the time:
try:
return (False, Directory(entries=entries, id=id, raw_manifest=raw_manifest))
except ValueError:
pass
# If it fails:
# 1. compute a raw_manifest if there isn't already one:
if raw_manifest is None:
# invalid_directory behaves like a Directory object, but skips the
# duplicated-entry check; this allows computing its raw_manifest
invalid_directory = type("", (), {})()
invalid_directory.entries = entries
raw_manifest = git_objects.directory_git_object(invalid_directory)
# 2. look for duplicated entries:
entries_by_name: Dict[bytes, Dict[str, List[DirectoryEntry]]] = (
collections.defaultdict(lambda: collections.defaultdict(list))
)
for entry in entries:
entries_by_name[entry.name][entry.type].append(entry)
# 3. strip duplicates
deduplicated_entries = []
for entry_lists in entries_by_name.values():
# We could pick one entry at random to keep the original name; but we try to
# "minimize" the impact, by preserving entries of type "rev" first
# (because renaming them would likely break git submodules entirely
# when this directory is written to disk),
# then entries of type "dir" (because renaming them affects the path
# of every file in the dir, instead of just one "cnt").
dir_entry_types = ("rev", "dir", "file")
assert set(dir_entry_types) == set(_DIR_ENTRY_TYPES)
picked_winner = False # when True, all future entries must be renamed
for type_ in dir_entry_types:
for entry in entry_lists[type_]:
if not picked_winner:
# this is the "most important" entry according to this
# heuristic; it gets to keep its name.
deduplicated_entries.append(entry)
picked_winner = True
else:
# the heuristic already found an entry more important than
# this one; so this one must be renamed to something.
# we pick the beginning of its hash, it should be good enough
# to avoid any conflict.
new_name = (
entry.name + b"_" + hash_to_bytehex(entry.target)[0:10]
)
renamed_entry = attr.evolve(entry, name=new_name)
deduplicated_entries.append(renamed_entry)
# Finally, return the "fixed" directory
dir_ = Directory(
entries=tuple(deduplicated_entries), id=id, raw_manifest=raw_manifest
)
return (True, dir_)
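# A hedged usage sketch of the heuristic above, with placeholder hashes: the
# "dir" entry wins and keeps its name, while the "file" entry is renamed with
# a prefix of its target hash.
#
#     >>> a = DirectoryEntry(name=b"x", type="dir", target=b"\x01" * 20, perms=0o040000)
#     >>> b = DirectoryEntry(name=b"x", type="file", target=b"\x02" * 20, perms=0o100644)
#     >>> is_corrupt, fixed = Directory.from_possibly_duplicated_entries(entries=(a, b))
#     >>> is_corrupt
#     True
#     >>> sorted(e.name for e in fixed.entries)
#     [b'x', b'x_0202020202']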
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class BaseContent(BaseModel, ABC):
status = attr.ib(
type=str, validator=attr.validators.in_(["visible", "hidden", "absent"])
)
@staticmethod
def _hash_data(data: bytes):
"""Hash some data, returning most of the fields of a content object"""
d = MultiHash.from_data(data).digest()
d["data"] = data
d["length"] = len(data)
return d
@classmethod
def from_dict(cls, d, use_subclass=True):
if use_subclass:
# Chooses a subclass to instantiate instead.
if d["status"] == "absent":
return SkippedContent.from_dict(d)
else:
return Content.from_dict(d)
def get_hash(self, hash_name):
if hash_name not in DEFAULT_ALGORITHMS:
raise ValueError("{} is not a valid hash name.".format(hash_name))
return getattr(self, hash_name)
def hashes(self) -> Dict[str, bytes]:
"""Returns a dictionary {hash_name: hash_value}"""
return {algo: getattr(self, algo) for algo in DEFAULT_ALGORITHMS}
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class Content(BaseContent):
object_type: Final = ModelObjectType.CONTENT
sha1 = attr.ib(type=bytes, validator=generic_type_validator, repr=hash_repr)
sha1_git = attr.ib(type=Sha1Git, validator=generic_type_validator, repr=hash_repr)
sha256 = attr.ib(type=bytes, validator=generic_type_validator, repr=hash_repr)
blake2s256 = attr.ib(type=bytes, validator=generic_type_validator, repr=hash_repr)
length = attr.ib(type=int)
status = attr.ib(
type=str,
validator=attr.validators.in_(["visible", "hidden"]),
default="visible",
)
data = attr.ib(type=Optional[bytes], validator=generic_type_validator, default=None)
get_data = attr.ib(
type=Optional[Callable[[], bytes]],
default=None,
cmp=False,
)
ctime = attr.ib(
type=Optional[datetime.datetime],
default=None,
eq=False,
)
@length.validator
def check_length(self, attribute, value):
"""Checks the length is positive."""
if value.__class__ is not int:
raise AttributeTypeError(value, attribute)
if value < 0:
raise ValueError("Length must be positive.")
@ctime.validator
def check_ctime(self, attribute, value):
"""Checks the ctime has a timezone."""
if value is not None:
if value.__class__ is not datetime.datetime:
raise AttributeTypeError(value, attribute)
if value.tzinfo is None:
raise ValueError("ctime must be a timezone-aware datetime.")
def to_dict(self):
content = super(Content, self.with_data(raise_if_missing=False)).to_dict()
for k in ("get_data", "data", "ctime"):
if content[k] is None:
del content[k]
return content
@classmethod
def from_data(cls, data, status="visible", ctime=None) -> Content:
"""Generate a Content from a given `data` byte string.
This populates the Content with the hashes and length for the data
passed as argument, as well as the data itself.
"""
d = cls._hash_data(data)
d["status"] = status
d["ctime"] = ctime
return cls(**d)
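# Illustrative doctest for the constructor above: all DEFAULT_ALGORITHMS
# digests and the length are derived from the payload.
#
#     >>> c = Content.from_data(b"hello")
#     >>> c.length
#     5
#     >>> c.sha1.hex()
#     'aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d'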
@classmethod
def from_dict(cls, d):
if isinstance(d.get("ctime"), str):
d = d.copy()
d["ctime"] = dateutil.parser.parse(d["ctime"])
return super().from_dict(d, use_subclass=False)
def with_data(self, raise_if_missing: bool = True) -> Content:
"""Loads the ``data`` attribute if ``get_data`` is not :const:`None`.
This call is almost a no-op, but subclasses may overload this method
to lazy-load data (e.g. from disk or objstorage).
Args:
raise_if_missing: if :const:`True` (default), raise :class:`MissingData`
exception if no data is attached to content object
"""
if self.data is not None:
return self
new_data = None
if self.get_data is not None:
new_data = self.get_data()
if new_data is None and raise_if_missing:
raise MissingData("Content data and get_data are both None.")
return attr.evolve(self, data=new_data, get_data=None)
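# A minimal sketch of the lazy-loading contract above: a Content whose payload
# was dropped but which kept a ``get_data`` callback materializes its data on
# demand.
#
#     >>> lazy = attr.evolve(Content.from_data(b"hi"), data=None, get_data=lambda: b"hi")
#     >>> lazy.data is None
#     True
#     >>> lazy.with_data().data
#     b'hi'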
def unique_key(self) -> KeyType:
return self.sha1 # TODO: use a dict of hashes
def swhid(self) -> CoreSWHID:
"""Returns a SWHID representing this object."""
return CoreSWHID(object_type=SwhidObjectType.CONTENT, object_id=self.sha1_git)
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class SkippedContent(BaseContent):
object_type: Final = ModelObjectType.SKIPPED_CONTENT
sha1 = attr.ib(
type=Optional[bytes], validator=generic_type_validator, repr=hash_repr
)
sha1_git = attr.ib(
type=Optional[Sha1Git], validator=generic_type_validator, repr=hash_repr
)
sha256 = attr.ib(
type=Optional[bytes], validator=generic_type_validator, repr=hash_repr
)
blake2s256 = attr.ib(
type=Optional[bytes], validator=generic_type_validator, repr=hash_repr
)
length = attr.ib(type=Optional[int])
status = attr.ib(type=str, validator=attr.validators.in_(["absent"]))
reason = attr.ib(type=Optional[str], default=None)
origin = attr.ib(type=Optional[str], validator=generic_type_validator, default=None)
ctime = attr.ib(
type=Optional[datetime.datetime],
validator=generic_type_validator,
default=None,
eq=False,
)
@reason.validator
def check_reason(self, attribute, value):
"""Checks the reason is full if status != absent."""
assert self.reason == value
if value is None:
raise ValueError("Must provide a reason if content is absent.")
elif value.__class__ is not str:
raise AttributeTypeError(value, attribute)
@length.validator
def check_length(self, attribute, value):
"""Checks the length is positive or -1."""
if value.__class__ is not int:
raise AttributeTypeError(value, attribute)
elif value < -1:
raise ValueError("Length must be positive or -1.")
@ctime.validator
def check_ctime(self, attribute, value):
"""Checks the ctime has a timezone."""
if value is not None:
if value.__class__ is not datetime.datetime:
raise AttributeTypeError(value, attribute)
elif value.tzinfo is None:
raise ValueError("ctime must be a timezone-aware datetime.")
def to_dict(self):
content = super().to_dict()
if content["origin"] is None:
del content["origin"]
if content["ctime"] is None:
del content["ctime"]
return content
@classmethod
def from_data(
cls, data: bytes, reason: str, ctime: Optional[datetime.datetime] = None
) -> SkippedContent:
"""Generate a SkippedContent from a given `data` byte string.
This populates the SkippedContent with the hashes and length for the
data passed as argument.
You can use `attr.evolve` on such a generated content to nullify some
of its attributes, e.g. for tests.
"""
d = cls._hash_data(data)
del d["data"]
d["status"] = "absent"
d["reason"] = reason
d["ctime"] = ctime
return cls(**d)
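# Quick illustration of the helper above; the payload and reason are
# placeholders:
#
#     >>> skipped = SkippedContent.from_data(b"huge blob", reason="too big")
#     >>> (skipped.status, skipped.length, skipped.reason)
#     ('absent', 9, 'too big')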
@classmethod
def from_dict(cls, d):
d2 = d.copy()
if d2.pop("data", None) is not None:
raise ValueError('SkippedContent has no "data" attribute %r' % d)
return super().from_dict(d2, use_subclass=False)
def unique_key(self) -> KeyType:
return self.hashes()
def swhid(self) -> Optional[CoreSWHID]:
"""Returns a SWHID representing this object or None if unset."""
if self.sha1_git is None:
return None
return CoreSWHID(object_type=SwhidObjectType.CONTENT, object_id=self.sha1_git)
class MetadataAuthorityType(Enum):
DEPOSIT_CLIENT = "deposit_client"
FORGE = "forge"
REGISTRY = "registry"
def __repr__(self):
return f"MetadataAuthorityType.{self.name}"
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class MetadataAuthority(BaseModel):
"""Represents an entity that provides metadata about an origin or
software artifact."""
object_type: Final = ModelObjectType.METADATA_AUTHORITY
type = attr.ib(type=MetadataAuthorityType, validator=generic_type_validator)
url = attr.ib(type=str, validator=generic_type_validator)
metadata = attr.ib(
type=Optional[ImmutableDict[str, Any]],
default=None,
validator=generic_type_validator,
converter=freeze_optional_dict,
)
def to_dict(self):
d = super().to_dict()
if d["metadata"] is None:
del d["metadata"]
return d
@classmethod
def from_dict(cls, d):
d = {
**d,
"type": MetadataAuthorityType(d["type"]),
}
return super().from_dict(d)
def unique_key(self) -> KeyType:
return {"type": self.type.value, "url": self.url}
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class MetadataFetcher(BaseModel):
"""Represents a software component used to fetch metadata from a metadata
authority, and ingest them into the Software Heritage archive."""
object_type: Final = ModelObjectType.METADATA_FETCHER
name = attr.ib(type=str, validator=generic_type_validator)
version = attr.ib(type=str, validator=generic_type_validator)
metadata = attr.ib(
type=Optional[ImmutableDict[str, Any]],
default=None,
validator=generic_type_validator,
converter=freeze_optional_dict,
)
def to_dict(self):
d = super().to_dict()
if d["metadata"] is None:
del d["metadata"]
return d
def unique_key(self) -> KeyType:
return {"name": self.name, "version": self.version}
def normalize_discovery_date(value: Any) -> datetime.datetime:
if not isinstance(value, datetime.datetime):
raise TypeError("discovery_date must be a timezone-aware datetime.")
if value.tzinfo is None:
raise ValueError("discovery_date must be a timezone-aware datetime.")
# Normalize timezone to utc, and truncate microseconds to 0
return value.astimezone(datetime.timezone.utc).replace(microsecond=0)
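# Example of the normalization above: the timezone is converted to UTC and
# microseconds are truncated.
#
#     >>> tz = datetime.timezone(datetime.timedelta(hours=2))
#     >>> normalize_discovery_date(
#     ...     datetime.datetime(2021, 5, 3, 14, 0, 0, 999999, tzinfo=tz)
#     ... )
#     datetime.datetime(2021, 5, 3, 12, 0, tzinfo=datetime.timezone.utc)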
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class RawExtrinsicMetadata(BaseHashableModel):
object_type: Final = ModelObjectType.RAW_EXTRINSIC_METADATA
# target object
target = attr.ib(type=ExtendedSWHID, validator=generic_type_validator)
# source
discovery_date = attr.ib(type=datetime.datetime, converter=normalize_discovery_date)
authority = attr.ib(type=MetadataAuthority, validator=generic_type_validator)
fetcher = attr.ib(type=MetadataFetcher, validator=generic_type_validator)
# the metadata itself
format = attr.ib(type=str, validator=generic_type_validator)
metadata = attr.ib(type=bytes, validator=generic_type_validator)
# context
origin = attr.ib(type=Optional[str], default=None, validator=generic_type_validator)
visit = attr.ib(type=Optional[int], default=None)
snapshot = attr.ib(type=Optional[CoreSWHID], default=None)
release = attr.ib(type=Optional[CoreSWHID], default=None)
revision = attr.ib(type=Optional[CoreSWHID], default=None)
path = attr.ib(type=Optional[bytes], default=None)
directory = attr.ib(type=Optional[CoreSWHID], default=None)
id = attr.ib(
type=Sha1Git, validator=generic_type_validator, default=b"", repr=hash_repr
)
def _compute_hash_from_attributes(self) -> bytes:
return _compute_hash_from_manifest(
git_objects.raw_extrinsic_metadata_git_object(self)
)
@origin.validator
def check_origin(self, attribute, value):
if value is None:
return
if value.__class__ is not str:
raise AttributeTypeError(value, attribute)
obj_type = self.target.object_type
if not (
obj_type is SwhidExtendedObjectType.SNAPSHOT
or obj_type is SwhidExtendedObjectType.RELEASE
or obj_type is SwhidExtendedObjectType.REVISION
or obj_type is SwhidExtendedObjectType.DIRECTORY
or obj_type is SwhidExtendedObjectType.CONTENT
):
raise ValueError(
f"Unexpected 'origin' context for "
f"{self.target.object_type.name.lower()} object: {value}"
)
if value.startswith("swh:"):
# Technically this is valid; but:
# 1. SWHIDs are URIs, not URLs
# 2. if a SWHID gets here, it's very likely to be a mistake
# (and we can remove this check if it turns out there is a
# legitimate use for it).
raise ValueError(f"SWHID used as context origin URL: {value}")
@visit.validator
def check_visit(self, attribute, value):
if value is None:
return
if value.__class__ is not int:
raise AttributeTypeError(value, attribute)
obj_type = self.target.object_type
if not (
obj_type is SwhidExtendedObjectType.SNAPSHOT
or obj_type is SwhidExtendedObjectType.RELEASE
or obj_type is SwhidExtendedObjectType.REVISION
or obj_type is SwhidExtendedObjectType.DIRECTORY
or obj_type is SwhidExtendedObjectType.CONTENT
):
raise ValueError(
f"Unexpected 'visit' context for "
f"{self.target.object_type.name.lower()} object: {value}"
)
if self.origin is None:
raise ValueError("'origin' context must be set if 'visit' is.")
if value <= 0:
raise ValueError("Nonpositive visit id")
@snapshot.validator
def check_snapshot(self, attribute, value):
if value is None:
return
if value.__class__ is not CoreSWHID:
raise AttributeTypeError(value, attribute)
obj_type = self.target.object_type
if not (
obj_type is SwhidExtendedObjectType.RELEASE
or obj_type is SwhidExtendedObjectType.REVISION
or obj_type is SwhidExtendedObjectType.DIRECTORY
or obj_type is SwhidExtendedObjectType.CONTENT
):
raise ValueError(
f"Unexpected 'snapshot' context for "
f"{self.target.object_type.name.lower()} object: {value}"
)
if value.object_type != SwhidObjectType.SNAPSHOT:
raise ValueError(
f"Expected SWHID type 'snapshot', "
f"got '{value.object_type.name.lower()}' in {value}"
)
@release.validator
def check_release(self, attribute, value):
if value is None:
return
if value.__class__ is not CoreSWHID:
raise AttributeTypeError(value, attribute)
obj_type = self.target.object_type
if not (
obj_type is SwhidExtendedObjectType.REVISION
or obj_type is SwhidExtendedObjectType.DIRECTORY
or obj_type is SwhidExtendedObjectType.CONTENT
):
raise ValueError(
f"Unexpected 'release' context for "
f"{self.target.object_type.name.lower()} object: {value}"
)
if value.object_type != SwhidObjectType.RELEASE:
raise ValueError(
f"Expected SWHID type 'release', "
f"got '{value.object_type.name.lower()}' in {value}"
)
@revision.validator
def check_revision(self, attribute, value):
if value is None:
return
if value.__class__ is not CoreSWHID:
raise AttributeTypeError(value, attribute)
obj_type = self.target.object_type
if not (
obj_type is SwhidExtendedObjectType.DIRECTORY
or obj_type is SwhidExtendedObjectType.CONTENT
):
raise ValueError(
f"Unexpected 'revision' context for "
f"{self.target.object_type.name.lower()} object: {value}"
)
if value.object_type != SwhidObjectType.REVISION:
raise ValueError(
f"Expected SWHID type 'revision', "
f"got '{value.object_type.name.lower()}' in {value}"
)
@path.validator
def check_path(self, attribute, value):
if value is None:
return
if value.__class__ is not bytes:
raise AttributeTypeError(value, attribute)
obj_type = self.target.object_type
if not (
obj_type is SwhidExtendedObjectType.DIRECTORY
or obj_type is SwhidExtendedObjectType.CONTENT
):
raise ValueError(
f"Unexpected 'path' context for "
f"{self.target.object_type.name.lower()} object: {value}"
)
@directory.validator
def check_directory(self, attribute, value):
if value is None:
return
if value.__class__ is not CoreSWHID:
raise AttributeTypeError(value, attribute)
if self.target.object_type is not SwhidExtendedObjectType.CONTENT:
raise ValueError(
f"Unexpected 'directory' context for "
f"{self.target.object_type.name.lower()} object: {value}"
)
if value.object_type != SwhidObjectType.DIRECTORY:
raise ValueError(
f"Expected SWHID type 'directory', "
f"got '{value.object_type.name.lower()}' in {value}"
)
def to_dict(self):
d = super().to_dict()
context_keys = (
"origin",
"visit",
"snapshot",
"release",
"revision",
"directory",
"path",
)
for context_key in context_keys:
if d[context_key] is None:
del d[context_key]
return d
@classmethod
def from_dict(cls, d):
if "type" in d:
# Convert from old schema
type_ = d.pop("type")
if type_ == "origin":
d["target"] = str(Origin(d["target"]).swhid())
d = {
**d,
"target": ExtendedSWHID.from_string(d["target"]),
"authority": MetadataAuthority.from_dict(d["authority"]),
"fetcher": MetadataFetcher.from_dict(d["fetcher"]),
}
swhid_keys = ("snapshot", "release", "revision", "directory")
for swhid_key in swhid_keys:
if d.get(swhid_key):
d[swhid_key] = CoreSWHID.from_string(d[swhid_key])
return super().from_dict(d)
def swhid(self) -> ExtendedSWHID:
"""Returns a SWHID representing this RawExtrinsicMetadata object."""
return ExtendedSWHID(
object_type=SwhidExtendedObjectType.RAW_EXTRINSIC_METADATA,
object_id=self.id,
)
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class ExtID(BaseHashableModel):
object_type: Final = ModelObjectType.EXTID
extid_type = attr.ib(type=str, validator=generic_type_validator)
extid = attr.ib(type=bytes, validator=generic_type_validator)
target = attr.ib(type=CoreSWHID, validator=generic_type_validator)
extid_version = attr.ib(type=int, validator=generic_type_validator, default=0)
payload_type = attr.ib(
type=Optional[str], validator=generic_type_validator, default=None
)
payload = attr.ib(
type=Optional[Sha1Git],
validator=generic_type_validator,
default=None,
repr=hash_repr,
)
id = attr.ib(
type=Sha1Git, validator=generic_type_validator, default=b"", repr=hash_repr
)
@payload_type.validator
def check_payload_type(self, attribute, value):
if value is not None and self.payload is None:
raise ValueError("'payload' must be set if 'payload_type' is.")
@payload.validator
def check_payload(self, attribute, value):
if value is not None and self.payload_type is None:
raise ValueError("'payload_type' must be set if 'payload' is.")
@classmethod
def from_dict(cls, d):
return cls(
extid=d["extid"],
extid_type=d["extid_type"],
target=CoreSWHID.from_string(d["target"]),
extid_version=d.get("extid_version", 0),
payload_type=d.get("payload_type"),
payload=d.get("payload"),
)
def _compute_hash_from_attributes(self) -> bytes:
return _compute_hash_from_manifest(git_objects.extid_git_object(self))
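# Sketch of the pairing constraint enforced by the validators above; the
# extid_type and payload_type values are illustrative:
#
#     >>> ExtID(
#     ...     extid_type="checksum-sha512",
#     ...     extid=b"\x00" * 20,
#     ...     target=CoreSWHID.from_string("swh:1:dir:" + "00" * 20),
#     ...     payload_type="disarchive",
#     ... )
#     Traceback (most recent call last):
#       ...
#     ValueError: 'payload' must be set if 'payload_type' is.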
# Note: we need the type ignore stanza here because mypy cannot figure out that
# all subclasses of BaseModel do have an object_type attribute, even if
# BaseModel itself does not (because these are Final)
SWH_MODEL_OBJECT_TYPES: Dict[str, Type[BaseModel]] = {
cls.object_type: cls # type: ignore
for cls in (
Person,
Timestamp,
TimestampWithTimezone,
Origin,
OriginVisit,
OriginVisitStatus,
Snapshot,
SnapshotBranch,
Release,
Revision,
Directory,
DirectoryEntry,
Content,
SkippedContent,
MetadataAuthority,
MetadataFetcher,
RawExtrinsicMetadata,
ExtID,
)
}
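# Illustrative lookup against the mapping above:
#
#     >>> SWH_MODEL_OBJECT_TYPES[Origin.object_type] is Origin
#     True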
# Copyright (C) 2015-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""
Classes to represent :ref:`SWH persistent IDentifiers <persistent-identifiers>`.
:class:`CoreSWHID` represents a SWHID with no qualifier, and :class:`QualifiedSWHID`
represents a SWHID that may have qualifiers.
:class:`ExtendedSWHID` extends the definition of SWHID to other object types,
and is used internally in Software Heritage; it does not support qualifiers.
"""
from __future__ import annotations
import enum
import re
from typing import Any, Dict, Generic, Optional, Tuple, Type, TypeVar, Union
import urllib.parse
import attr
from attrs_strict import type_validator
from .exceptions import ValidationError
from .hashutil import hash_to_bytes, hash_to_hex
class ObjectType(enum.Enum):
"""Possible object types of a QualifiedSWHID or CoreSWHID.
The value of each variant is what is used in the SWHID's string representation."""
SNAPSHOT = "snp"
REVISION = "rev"
RELEASE = "rel"
DIRECTORY = "dir"
CONTENT = "cnt"
class ExtendedObjectType(enum.Enum):
"""Possible object types of an ExtendedSWHID.
The variants are a superset of :class:`ObjectType`'s"""
SNAPSHOT = "snp"
REVISION = "rev"
RELEASE = "rel"
DIRECTORY = "dir"
CONTENT = "cnt"
ORIGIN = "ori"
RAW_EXTRINSIC_METADATA = "emd"
SWHID_NAMESPACE = "swh"
SWHID_VERSION = 1
SWHID_TYPES = ["snp", "rel", "rev", "dir", "cnt"]
EXTENDED_SWHID_TYPES = SWHID_TYPES + ["ori", "emd"]
SWHID_SEP = ":"
SWHID_CTXT_SEP = ";"
SWHID_QUALIFIERS = {"origin", "anchor", "visit", "path", "lines"}
SWHID_RE_RAW = (
f"(?P<namespace>{SWHID_NAMESPACE})"
f"{SWHID_SEP}(?P<scheme_version>{SWHID_VERSION})"
f"{SWHID_SEP}(?P<object_type>{'|'.join(EXTENDED_SWHID_TYPES)})"
f"{SWHID_SEP}(?P<object_id>[0-9a-f]{{40}})"
f"({SWHID_CTXT_SEP}(?P<qualifiers>\\S+))?"
)
SWHID_RE = re.compile(SWHID_RE_RAW)
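# Quick sanity examples for the pattern above: only scheme version 1 and the
# known object types are accepted.
#
#     >>> bool(SWHID_RE.fullmatch("swh:1:cnt:" + "0" * 40))
#     True
#     >>> bool(SWHID_RE.fullmatch("swh:2:cnt:" + "0" * 40))
#     False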
# type of the "object_type" attribute of the SWHID class; either
# ObjectType or ExtendedObjectType
_TObjectType = TypeVar("_TObjectType", ObjectType, ExtendedObjectType)
# the SWHID class itself (this is used so that X.from_string() can return X
# for all X subclass of _BaseSWHID)
_TSWHID = TypeVar("_TSWHID", bound="_BaseSWHID")
@attr.s(frozen=True, kw_only=True, repr=False)
class _BaseSWHID(Generic[_TObjectType]):
"""Common base class for CoreSWHID, QualifiedSWHID, and ExtendedSWHID.
This is an "abstract" class and should not be instantiated directly;
it only exists to deduplicate code between these three SWHID classes."""
namespace = attr.ib(type=str, default=SWHID_NAMESPACE)
"""the namespace of the identifier, defaults to ``swh``"""
scheme_version = attr.ib(type=int, default=SWHID_VERSION)
"""the scheme version of the identifier, defaults to 1"""
# overridden by subclasses
object_type: _TObjectType
"""the type of object the identifier points to"""
object_id = attr.ib(type=bytes, validator=type_validator())
"""object's identifier"""
@namespace.validator
def check_namespace(self, attribute, value):
if value != SWHID_NAMESPACE:
raise ValidationError(
"Invalid SWHID: invalid namespace: %(namespace)s",
params={"namespace": value},
)
@scheme_version.validator
def check_scheme_version(self, attribute, value):
if value != SWHID_VERSION:
raise ValidationError(
"Invalid SWHID: invalid version: %(version)s", params={"version": value}
)
@object_id.validator
def check_object_id(self, attribute, value):
if len(value) != 20:
raise ValidationError(
"Invalid SWHID: invalid checksum: %(object_id)s",
params={"object_id": hash_to_hex(value)},
)
def __str__(self) -> str:
return self._format_core_swhid()
def _format_core_swhid(self) -> str:
return SWHID_SEP.join(
[
self.namespace,
str(self.scheme_version),
self.object_type.value,
hash_to_hex(self.object_id),
]
)
def __repr__(self) -> str:
return f"{self.__class__.__name__}.from_string('{self}')"
@classmethod
def from_string(cls: Type[_TSWHID], s: str) -> _TSWHID:
parts = _parse_swhid(s)
if parts.pop("qualifiers"):
raise ValidationError(f"{cls.__name__} does not support qualifiers.")
try:
return cls(**parts)
except ValueError as e:
raise ValidationError(
"ValueError: %(args)s", params={"args": e.args}
) from None
@attr.s(frozen=True, kw_only=True, repr=False)
class CoreSWHID(_BaseSWHID[ObjectType]):
"""
Dataclass holding the relevant info associated with a SoftWare Heritage
persistent IDentifier (SWHID).
Unlike `QualifiedSWHID`, it is restricted to core SWHIDs, i.e. SWHIDs
with no qualifiers.
Raises:
swh.model.exceptions.ValidationError: In case of invalid object type or id
To get the raw SWHID string from an instance of this class,
use the :func:`str` function:
>>> swhid = CoreSWHID(
... object_type=ObjectType.CONTENT,
... object_id=bytes.fromhex('8ff44f081d43176474b267de5451f2c2e88089d0'),
... )
>>> str(swhid)
'swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0'
And vice-versa with :meth:`CoreSWHID.from_string`:
>>> swhid == CoreSWHID.from_string(
... "swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0"
... )
True
"""
object_type = attr.ib(
type=ObjectType, validator=type_validator(), converter=ObjectType
)
"""the type of object the identifier points to"""
def to_extended(self) -> ExtendedSWHID:
"""Converts this CoreSWHID into an ExtendedSWHID.
As ExtendedSWHID is a superset of CoreSWHID, this is lossless."""
return ExtendedSWHID(
namespace=self.namespace,
scheme_version=self.scheme_version,
object_type=ExtendedObjectType(self.object_type.value),
object_id=self.object_id,
)
def to_qualified(self) -> QualifiedSWHID:
"""Converts this CoreSWHID into a QualifiedSWHID.
As QualifiedSWHID is a superset of CoreSWHID, this is lossless."""
return QualifiedSWHID(
namespace=self.namespace,
scheme_version=self.scheme_version,
object_type=self.object_type,
object_id=self.object_id,
)
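# Round-trip sketch for the two conversions above (placeholder hash):
#
#     >>> CoreSWHID.from_string("swh:1:dir:" + "11" * 20).to_extended()
#     ExtendedSWHID.from_string('swh:1:dir:1111111111111111111111111111111111111111')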
def _parse_core_swhid(swhid: Union[str, CoreSWHID, None]) -> Optional[CoreSWHID]:
if swhid is None or isinstance(swhid, CoreSWHID):
return swhid
else:
return CoreSWHID.from_string(swhid)
def _parse_lines_qualifier(
lines: Union[str, Tuple[int, Optional[int]], None],
) -> Optional[Tuple[int, Optional[int]]]:
try:
if lines is None or isinstance(lines, tuple):
return lines
elif "-" in lines:
(from_, to) = lines.split("-", 2)
return (int(from_), int(to))
else:
return (int(lines), None)
except ValueError:
raise ValidationError(
"Invalid format for the lines qualifier: %(lines)s", params={"lines": lines}
)
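# Behavior sketch for the converter above:
#
#     >>> _parse_lines_qualifier("5-10")
#     (5, 10)
#     >>> _parse_lines_qualifier("7")
#     (7, None)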
def _parse_path_qualifier(path: Union[str, bytes, None]) -> Optional[bytes]:
if path is None or isinstance(path, bytes):
return path
else:
return urllib.parse.unquote_to_bytes(path)
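# The path qualifier is percent-decoded to bytes:
#
#     >>> _parse_path_qualifier("foo/bar%20baz")
#     b'foo/bar baz'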
@attr.s(frozen=True, kw_only=True, repr=False)
class QualifiedSWHID(_BaseSWHID[ObjectType]):
"""
Dataclass holding the relevant info associated with a SoftWare Heritage
persistent IDentifier (SWHID).
Raises:
swh.model.exceptions.ValidationError: In case of invalid object type or id
To get the raw SWHID string from an instance of this class,
use the :func:`str` function:
>>> swhid = QualifiedSWHID(
... object_type=ObjectType.CONTENT,
... object_id=bytes.fromhex('8ff44f081d43176474b267de5451f2c2e88089d0'),
... lines=(5, 10),
... )
>>> str(swhid)
'swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0;lines=5-10'
And vice-versa with :meth:`QualifiedSWHID.from_string`:
>>> swhid == QualifiedSWHID.from_string(
... "swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0;lines=5-10"
... )
True
"""
object_type = attr.ib(
type=ObjectType, validator=type_validator(), converter=ObjectType
)
"""the type of object the identifier points to"""
# qualifiers:
origin = attr.ib(type=Optional[str], default=None, validator=type_validator())
"""the software origin where an object has been found or observed in the wild,
as an URI"""
visit = attr.ib(type=Optional[CoreSWHID], default=None, converter=_parse_core_swhid)
"""the core identifier of a snapshot corresponding to a specific visit
of a repository containing the designated object"""
anchor = attr.ib(
type=Optional[CoreSWHID],
default=None,
validator=type_validator(),
converter=_parse_core_swhid,
)
"""a designated node in the Merkle DAG relative to which a path to the object
is specified, as the core identifier of a directory, a revision, a release,
or a snapshot"""
path = attr.ib(
type=Optional[bytes],
default=None,
validator=type_validator(),
converter=_parse_path_qualifier,
)
"""the absolute file path, from the root directory associated to the anchor node,
to the object; when the anchor denotes a directory or a revision, and almost always
when it’s a release, the root directory is uniquely determined;
when the anchor denotes a snapshot, the root directory is the one pointed to by HEAD
(possibly indirectly), and undefined if such a reference is missing"""
Lines = Tuple[int, Optional[int]]
lines = attr.ib(
type=Optional[Lines],
default=None,
validator=type_validator(),
converter=_parse_lines_qualifier,
)
"""lines: line number(s) of interest, usually within a content object"""
@visit.validator
def check_visit(self, attribute, value):
if value and value.object_type != ObjectType.SNAPSHOT:
raise ValidationError(
"The 'visit' qualifier must be a 'snp' SWHID, not '%(type)s'",
params={"type": value.object_type.value},
)
@anchor.validator
def check_anchor(self, attribute, value):
if value and value.object_type not in (
ObjectType.DIRECTORY,
ObjectType.REVISION,
ObjectType.RELEASE,
ObjectType.SNAPSHOT,
):
raise ValidationError(
"The 'visit' qualifier must be a 'dir', 'rev', 'rel', or 'snp' SWHID, "
"not '%s(type)s'",
params={"type": value.object_type.value},
)
def to_dict(self) -> Dict[str, Optional[str | bytes | CoreSWHID | Lines]]:
"""Returns a dictionary version of this QSWHID for json serialization"""
return {
"swhid": self._format_core_swhid(),
"origin": self.origin,
"visit": self.visit,
"anchor": self.anchor,
"path": self.path,
"lines": self.lines,
}
def qualifiers(self) -> Dict[str, str]:
"""Returns URL-escaped qualifiers of this SWHID, for use in serialization"""
origin = self.origin
if origin:
unescaped_origin = origin
origin = origin.replace("%", "%25")
origin = origin.replace(";", "%3B")
assert (
urllib.parse.unquote(origin) == unescaped_origin
), "Escaping ';' in the origin qualifier corrupted the origin URL."
d: Dict[str, Optional[str]] = {
"origin": origin,
"visit": str(self.visit) if self.visit else None,
"anchor": str(self.anchor) if self.anchor else None,
"path": (
urllib.parse.quote_from_bytes(self.path)
if self.path is not None
else None
),
"lines": (
"-".join(str(line) for line in self.lines if line is not None)
if self.lines
else None
),
}
return {k: v for (k, v) in d.items() if v is not None}
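# Escaping sketch for the method above (placeholder object id): ';' in the
# origin URL is percent-encoded so it cannot be confused with the qualifier
# separator.
#
#     >>> QualifiedSWHID(
#     ...     object_type=ObjectType.CONTENT,
#     ...     object_id=b"\x00" * 20,
#     ...     origin="https://example.org/a;b",
#     ... ).qualifiers()
#     {'origin': 'https://example.org/a%3Bb'}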
def __str__(self) -> str:
swhid = self._format_core_swhid()
qualifiers = self.qualifiers()
if qualifiers:
for k, v in qualifiers.items():
swhid += "%s%s=%s" % (SWHID_CTXT_SEP, k, v)
return swhid
def __repr__(self) -> str:
return super().__repr__()
@classmethod
def from_string(cls, s: str) -> QualifiedSWHID:
parts = _parse_swhid(s)
qualifiers = parts.pop("qualifiers")
invalid_qualifiers = set(qualifiers) - SWHID_QUALIFIERS
if invalid_qualifiers:
raise ValidationError(
"Invalid qualifier(s): %(qualifiers)s",
params={"qualifiers": ", ".join(invalid_qualifiers)},
)
if "origin" in qualifiers:
qualifiers["origin"] = urllib.parse.unquote(qualifiers["origin"])
try:
return QualifiedSWHID(**parts, **qualifiers)
except ValueError as e:
raise ValidationError(
"ValueError: %(args)s", params={"args": e.args}
) from None
@attr.s(frozen=True, kw_only=True, repr=False)
class ExtendedSWHID(_BaseSWHID[ExtendedObjectType]):
"""
Dataclass holding the relevant info associated with a SoftWare Heritage
persistent IDentifier (SWHID).
It extends `CoreSWHID` by allowing non-standard object types, and should
only be used internally by Software Heritage.
Raises:
swh.model.exceptions.ValidationError: In case of invalid object type or id
To get the raw SWHID string from an instance of this class,
use the :func:`str` function:
>>> swhid = ExtendedSWHID(
... object_type=ExtendedObjectType.CONTENT,
... object_id=bytes.fromhex('8ff44f081d43176474b267de5451f2c2e88089d0'),
... )
>>> str(swhid)
'swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0'
And vice-versa with :meth:`ExtendedSWHID.from_string`:
>>> swhid == ExtendedSWHID.from_string(
... "swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0"
... )
True
"""
object_type = attr.ib(
type=ExtendedObjectType,
validator=type_validator(),
converter=ExtendedObjectType,
)
"""the type of object the identifier points to"""
def _parse_swhid(swhid: str) -> Dict[str, Any]:
"""Parse a Software Heritage identifier (SWHID) from string (see:
:ref:`persistent-identifiers`.)
This is for internal use; use :meth:`CoreSWHID.from_string`,
:meth:`QualifiedSWHID.from_string`, or :meth:`ExtendedSWHID.from_string` instead,
as they perform validation and build a dataclass.
Args:
swhid (str): A persistent identifier
Raises:
swh.model.exceptions.ValidationError: if passed string is not a valid SWHID
"""
m = SWHID_RE.fullmatch(swhid)
if not m:
raise ValidationError(
"Invalid SWHID: invalid syntax: %(swhid)s", params={"swhid": swhid}
)
parts: Dict[str, Any] = m.groupdict()
qualifiers_raw = parts["qualifiers"]
parts["qualifiers"] = {}
if qualifiers_raw:
for qualifier in qualifiers_raw.split(SWHID_CTXT_SEP):
try:
k, v = qualifier.split("=", maxsplit=1)
parts["qualifiers"][k] = v
except ValueError:
raise ValidationError(
"Invalid SWHID: invalid qualifier: %(qualifier)s",
params={"qualifier": qualifier},
)
parts["scheme_version"] = int(parts["scheme_version"])
parts["object_id"] = hash_to_bytes(parts["object_id"])
return parts
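# End-to-end sketch going through the public entry point, as recommended
# above (placeholder hash and origin):
#
#     >>> swhid = QualifiedSWHID.from_string(
#     ...     "swh:1:cnt:" + "94" * 20 + ";origin=https://example.org;lines=9"
#     ... )
#     >>> swhid.origin
#     'https://example.org'
#     >>> swhid.lines
#     (9, None)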