Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Showing 2076 additions and 1124 deletions.
[flake8]
# E203: whitespaces before ':' <https://github.com/psf/black/issues/315>
# E231: missing whitespace after ','
# W503: line break before binary operator <https://github.com/psf/black/issues/52>
ignore = E203,E231,W503
max-line-length = 88
#!/usr/bin/env python3
# Copyright (C) 2015-2020  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from io import open
from os import path

from setuptools import find_packages, setup

here = path.abspath(path.dirname(__file__))

# Get the long description from the README file
with open(path.join(here, "README.md"), encoding="utf-8") as f:
    long_description = f.read()


def parse_requirements(name=None):
    if name:
        reqf = "requirements-%s.txt" % name
    else:
        reqf = "requirements.txt"

    requirements = []
    if not path.exists(reqf):
        return requirements

    with open(reqf) as f:
        for line in f.readlines():
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            requirements.append(line)
    return requirements


setup(
    name="swh.model",
    description="Software Heritage data model",
    long_description=long_description,
    long_description_content_type="text/markdown",
    python_requires=">=3.7",
    author="Software Heritage developers",
    author_email="swh-devel@inria.fr",
    url="https://forge.softwareheritage.org/diffusion/DMOD/",
    packages=find_packages(),
    setup_requires=["setuptools-scm"],
    use_scm_version=True,
    install_requires=parse_requirements() + parse_requirements("swh"),
    extras_require={
        "cli": parse_requirements("cli"),
        "testing-minimal": parse_requirements("test"),
        "testing": parse_requirements("test") + parse_requirements("cli"),
    },
    include_package_data=True,
    entry_points="""
        [console_scripts]
        swh-identify=swh.model.cli:identify
        [swh.cli.subcommands]
        identify=swh.model.cli
    """,
    classifiers=[
        "Programming Language :: Python :: 3",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
        "Operating System :: OS Independent",
        "Development Status :: 5 - Production/Stable",
    ],
    project_urls={
        "Bug Reports": "https://forge.softwareheritage.org/maniphest",
        "Funding": "https://www.softwareheritage.org/donate",
        "Source": "https://forge.softwareheritage.org/source/swh-model",
        "Documentation": "https://docs.softwareheritage.org/devel/swh-model/",
    },
)
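As a quick illustration of the requirements-file convention that parse_requirements() expects (one requirement per line, comments and blank lines skipped), here is a doctest-style sketch; it assumes it runs in the same scope as the setup.py above, and the file name and pins are made up:

from pathlib import Path

# Hypothetical requirements file, only to exercise the parser
Path("requirements-demo.txt").write_text("# a comment\n\nclick\nattrs >= 21.0\n")

assert parse_requirements("demo") == ["click", "attrs >= 21.0"]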
from pkgutil import extend_path

__path__ = extend_path(__path__, __name__)
@@ -5,7 +5,7 @@
 import os
 import sys

-from typing import Dict, Iterable, Optional
+from typing import Callable, Dict, Iterable, Optional

 # WARNING: do not import unnecessary things here to keep cli startup time under
 # control
@@ -20,10 +20,12 @@ except ImportError:
     exit(1)

 try:
-    from swh.core.cli import swh as swh_cli_group
+    import swh.core.cli
+
+    cli_command = swh.core.cli.swh.command
 except ImportError:
     # stub so that swh-identify can be used when swh-core isn't installed
-    swh_cli_group = click  # type: ignore
+    cli_command = click.command

 from swh.model.from_disk import Directory
 from swh.model.swhids import CoreSWHID
@@ -42,7 +44,7 @@ _DULWICH_TYPES = {
 class CoreSWHIDParamType(click.ParamType):
     """Click argument that accepts a core SWHID and returns them as
-    :class:`swh.model.swhids.CoreSWHID` instances """
+    :class:`swh.model.swhids.CoreSWHID` instances"""

     name = "SWHID"
@@ -69,19 +71,27 @@ def swhid_of_file_content(data) -> CoreSWHID:
     return object.swhid()


-def model_of_dir(path: bytes, exclude_patterns: Iterable[bytes] = None) -> Directory:
-    from swh.model.from_disk import accept_all_directories, ignore_directories_patterns
+def model_of_dir(
+    path: bytes,
+    exclude_patterns: Optional[Iterable[bytes]] = None,
+    update_info: Optional[Callable[[int], None]] = None,
+) -> Directory:
+    from swh.model.from_disk import accept_all_paths, ignore_directories_patterns

-    dir_filter = (
+    path_filter = (
         ignore_directories_patterns(path, exclude_patterns)
         if exclude_patterns
-        else accept_all_directories
+        else accept_all_paths
     )

-    return Directory.from_disk(path=path, dir_filter=dir_filter)
+    return Directory.from_disk(
+        path=path, path_filter=path_filter, progress_callback=update_info
+    )


-def swhid_of_dir(path: bytes, exclude_patterns: Iterable[bytes] = None) -> CoreSWHID:
+def swhid_of_dir(
+    path: bytes, exclude_patterns: Optional[Iterable[bytes]] = None
+) -> CoreSWHID:
     obj = model_of_dir(path, exclude_patterns)
     return obj.swhid()
@@ -170,7 +180,7 @@ def identify_object(
     return swhid


-@swh_cli_group.command(context_settings=CONTEXT_SETTINGS)
+@cli_command(context_settings=CONTEXT_SETTINGS)
 @click.option(
     "--dereference/--no-dereference",
     "follow_symlinks",
@@ -209,7 +219,10 @@ def identify_object(
     help="reference identifier to be compared with computed one",
 )
 @click.option(
-    "-r", "--recursive", is_flag=True, help="compute SWHID recursively",
+    "-r",
+    "--recursive",
+    is_flag=True,
+    help="compute SWHID recursively",
 )
 @click.argument("objects", nargs=-1, required=True)
 def identify(
@@ -226,30 +239,26 @@ def identify(
     For more details about SWHIDs see:

     \b
     https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html

     Tip: you can pass "-" to identify the content of standard input.

     \b
     Examples::

     \b
       $ swh identify fork.c kmod.c sched/deadline.c
       swh:1:cnt:2e391c754ae730bd2d8520c2ab497c403220c6e3    fork.c
       swh:1:cnt:0277d1216f80ae1adeed84a686ed34c9b2931fc2    kmod.c
       swh:1:cnt:57b939c81bce5d06fa587df8915f05affbe22b82    sched/deadline.c

     \b
       $ swh identify --no-filename /usr/src/linux/kernel/
       swh:1:dir:f9f858a48d663b3809c9e2f336412717496202ab

     \b
       $ git clone --mirror https://forge.softwareheritage.org/source/helloworld.git
       $ swh identify --type snapshot helloworld.git/
       swh:1:snp:510aa88bdc517345d258c1fc2babcd0e1f905e93    helloworld.git

-    """  # NoQA  # overlong lines in shell examples are fine
+    """
     from functools import partial
     import logging
@@ -300,7 +309,7 @@ def identify(
             click.echo("SWHID mismatch: %s != %s" % (verify, swhid))
             sys.exit(1)
     else:
-        for (obj, swhid) in results:
+        for obj, swhid in results:
             msg = swhid
             if show_filename:
                 msg = "%s\t%s" % (swhid, os.fsdecode(obj))
......
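A minimal sketch of how the reworked CLI helpers above compose, assuming a swh.model build that has the new path_filter/progress_callback API; the directory path, glob pattern, and the exact meaning of the callback's count are illustrative assumptions, not part of the diff:

from swh.model.cli import model_of_dir, swhid_of_dir

def progress(count: int) -> None:
    # assumed semantics: number of entries processed since the last call
    print(f"+{count} entries")

tree = model_of_dir(b"/srv/src", exclude_patterns=[b"*.git"], update_info=progress)
print(tree.swhid())               # swh:1:dir:... with the exclusions applied
print(swhid_of_dir(b"/srv/src"))  # directory SWHID without exclusions or progress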
-# Copyright (C) 2020  The Software Heritage developers
+# Copyright (C) 2020-2023  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

+from __future__ import annotations
+
 """Utility data structures."""

 from collections.abc import Mapping
+import copy
 from typing import Dict, Generic, Iterable, Optional, Tuple, TypeVar, Union

 KT = TypeVar("KT")
@@ -18,36 +21,35 @@ class ImmutableDict(Mapping, Generic[KT, VT]):
     This class behaves like a dictionary, but internally stores objects in a tuple,
     so it is both immutable and hashable."""

-    data: Tuple[Tuple[KT, VT], ...]
+    _data: Dict[KT, VT]

     def __init__(
         self,
-        data: Union[
-            Iterable[Tuple[KT, VT]], "ImmutableDict[KT, VT]", Dict[KT, VT]
-        ] = {},
+        data: Union[Iterable[Tuple[KT, VT]], ImmutableDict[KT, VT], Dict[KT, VT]] = {},
     ):
         if isinstance(data, dict):
-            self.data = tuple(item for item in data.items())
+            self._data = data
         elif isinstance(data, ImmutableDict):
-            self.data = data.data
+            self._data = data._data
         else:
-            self.data = tuple(data)
+            self._data = {k: v for k, v in data}

+    @property
+    def data(self):
+        return tuple(self._data.items())

     def __repr__(self):
         return f"ImmutableDict({dict(self.data)!r})"

     def __getitem__(self, key):
-        for (k, v) in self.data:
-            if k == key:
-                return v
-        raise KeyError(key)
+        return self._data[key]

     def __iter__(self):
-        for (k, v) in self.data:
+        for k, v in self.data:
             yield k

     def __len__(self):
-        return len(self.data)
+        return len(self._data)

     def items(self):
         yield from self.data
@@ -55,15 +57,9 @@ class ImmutableDict(Mapping, Generic[KT, VT]):
     def __hash__(self):
         return hash(tuple(sorted(self.data)))

-    def copy_pop(self, popped_key) -> Tuple[Optional[VT], "ImmutableDict[KT, VT]"]:
+    def copy_pop(self, popped_key) -> Tuple[Optional[VT], ImmutableDict[KT, VT]]:
         """Returns a copy of this ImmutableDict without the given key,
         as well as the value associated to the key."""
-        popped_value = None
-        new_items = []
-        for (key, value) in self.data:
-            if key == popped_key:
-                popped_value = value
-            else:
-                new_items.append((key, value))
+        new_items = copy.deepcopy(self._data)
+        popped_value: Optional[VT] = new_items.pop(popped_key, None)
         return (popped_value, ImmutableDict(new_items))
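The net effect of the ImmutableDict rework above is that lookups go through a real dict while .data keeps the historical tuple-of-pairs shape; an illustrative check:

from swh.model.collections import ImmutableDict

d = ImmutableDict({"a": 1, "b": 2})
assert d["a"] == 1                        # now an O(1) dict lookup, not a scan
assert d.data == (("a", 1), ("b", 2))     # .data is a computed property
assert hash(d) == hash(ImmutableDict(d))  # still hashable

popped, rest = d.copy_pop("a")
assert popped == 1 and dict(rest) == {"b": 2}  # d itself is unchanged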
# Copyright (C) 2022  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

"""Primitives for finding unknown content efficiently."""

from __future__ import annotations

from collections import namedtuple
import itertools
import logging
import random
from typing import (
    Any,
    Callable,
    Iterable,
    List,
    Mapping,
    NamedTuple,
    Optional,
    Set,
    Union,
)

from typing_extensions import Protocol, runtime_checkable

from .from_disk import model
from .model import Sha1Git

logger = logging.getLogger(__name__)

# Maximum amount when sampling from the undecided set of directory entries
SAMPLE_SIZE = 1000

# Sets of sha1 of contents, skipped contents and directories respectively
Sample: NamedTuple = namedtuple(
    "Sample", ["contents", "skipped_contents", "directories"]
)


@runtime_checkable
class ArchiveDiscoveryInterface(Protocol):
    """Interface used in discovery code to abstract over ways of connecting to
    the SWH archive (direct storage, web API, etc.) for all methods needed by
    discovery algorithms."""

    contents: List[model.Content]
    skipped_contents: List[model.SkippedContent]
    directories: List[model.Directory]

    def __init__(
        self,
        contents: List[model.Content],
        skipped_contents: List[model.SkippedContent],
        directories: List[model.Directory],
    ) -> None:
        self.contents = contents
        self.skipped_contents = skipped_contents
        self.directories = directories

    def content_missing(self, contents: List[Sha1Git]) -> Iterable[Sha1Git]:
        """List content missing from the archive by sha1"""

    def skipped_content_missing(
        self, skipped_contents: List[Sha1Git]
    ) -> Iterable[Sha1Git]:
        """List skipped content missing from the archive by sha1"""

    def directory_missing(self, directories: List[Sha1Git]) -> Iterable[Sha1Git]:
        """List directories missing from the archive by sha1"""


class BaseDiscoveryGraph:
    """Creates the base structures and methods needed for discovery algorithms.
    Subclasses should override ``get_sample`` to affect how the discovery is made.

    The `update_info_callback` is an optional argument that will get called for
    each new piece of information we get. The callback arguments are `(content,
    known)`.

    - content: the relevant model.Content object,
    - known: a boolean, True if the file is known to the archive False otherwise.
    """

    def __init__(
        self,
        contents,
        skipped_contents,
        directories,
        update_info_callback: Optional[Callable[[Any, bool], None]] = None,
    ):
        self._all_contents: Mapping[
            Sha1Git, Union[model.Content, model.SkippedContent]
        ] = {}
        self._undecided_directories: Set[Sha1Git] = set()
        self._children: Mapping[Sha1Git, Set[Sha1Git]] = {}
        self._parents: Mapping[model.DirectoryEntry, Set[Any]] = {}
        self.undecided: Set[Sha1Git] = set()

        for content in itertools.chain(contents, skipped_contents):
            self.undecided.add(content.sha1_git)
            self._all_contents[content.sha1_git] = content

        for directory in directories:
            self.undecided.add(directory.id)
            self._undecided_directories.add(directory.id)
            self._children[directory.id] = {c.target for c in directory.entries}
            for child in directory.entries:
                self._parents.setdefault(child.target, set()).add(directory.id)

        self.undecided |= self._undecided_directories
        self.known: Set[Sha1Git] = set()
        self.unknown: Set[Sha1Git] = set()

        self._update_info_callback = update_info_callback
        self._sha1_to_obj = {}
        for content in itertools.chain(contents, skipped_contents):
            self._sha1_to_obj[content.sha1_git] = content
        for directory in directories:
            self._sha1_to_obj[directory.id] = directory

    def mark_known(self, entries: Iterable[Sha1Git]):
        """Mark ``entries`` and those they imply as known in the SWH archive"""
        self._mark_entries(entries, self._children, self.known)

    def mark_unknown(self, entries: Iterable[Sha1Git]):
        """Mark ``entries`` and those they imply as unknown in the SWH archive"""
        self._mark_entries(entries, self._parents, self.unknown)

    def _mark_entries(
        self,
        entries: Iterable[Sha1Git],
        transitive_mapping: Mapping[Any, Any],
        target_set: Set[Any],
    ):
        """Use Merkle graph properties to mark a directory entry as known or unknown.

        If an entry is known, then all of its descendants are known. If it's
        unknown, then all of its ancestors are unknown.

        - ``entries``: directory entries to mark along with their ancestors/descendants
          where applicable.
        - ``transitive_mapping``: mapping from an entry to the next entries to mark
          in the hierarchy, if any.
        - ``target_set``: set where marked entries will be added.
        """
        callback = self._update_info_callback
        to_process = set(entries)
        while to_process:
            current = to_process.pop()
            target_set.add(current)
            new = current in self.undecided
            self.undecided.discard(current)
            self._undecided_directories.discard(current)
            next_entries = transitive_mapping.get(current, set()) & self.undecided
            to_process.update(next_entries)
            if new and callback is not None:
                obj = self._sha1_to_obj[current]
                callback(obj, current in self.known)

    def get_sample(
        self,
    ) -> Sample:
        """Return a three-tuple of samples from the undecided sets of contents,
        skipped contents and directories respectively.

        These samples will be queried against the storage which will tell us
        which are known."""
        raise NotImplementedError()

    def do_query(self, archive: ArchiveDiscoveryInterface, sample: Sample) -> None:
        """Given a three-tuple of samples, ask the archive which are known or
        unknown and mark them as such."""
        methods = (
            archive.content_missing,
            archive.skipped_content_missing,
            archive.directory_missing,
        )
        for sample_per_type, method in zip(sample, methods):
            if not sample_per_type:
                continue
            known = set(sample_per_type)
            unknown = set(method(list(sample_per_type)))
            known -= unknown
            self.mark_known(known)
            self.mark_unknown(unknown)


class RandomDirSamplingDiscoveryGraph(BaseDiscoveryGraph):
    """Use a random sampling using only directories.

    This allows us to find a statistically good spread of entries in the graph
    with a smaller population than using all types of entries. When there are
    no more directories, only contents or skipped contents are undecided if any
    are left: we send them directly to the storage since they should be few and
    their structure flat."""

    def get_sample(self) -> Sample:
        if self._undecided_directories:
            if len(self._undecided_directories) <= SAMPLE_SIZE:
                return Sample(
                    contents=set(),
                    skipped_contents=set(),
                    directories=set(self._undecided_directories),
                )
            sample = random.sample(tuple(self._undecided_directories), SAMPLE_SIZE)
            directories = {o for o in sample}
            return Sample(
                contents=set(), skipped_contents=set(), directories=directories
            )

        contents = set()
        skipped_contents = set()

        for sha1 in self.undecided:
            obj = self._all_contents[sha1]
            obj_type = obj.object_type
            if obj_type == model.Content.object_type:
                contents.add(sha1)
            elif obj_type == model.SkippedContent.object_type:
                skipped_contents.add(sha1)
            else:
                raise TypeError(f"Unexpected object type {obj_type}")

        return Sample(
            contents=contents, skipped_contents=skipped_contents, directories=set()
        )


def filter_known_objects(
    archive: ArchiveDiscoveryInterface,
    update_info_callback: Optional[Callable[[Any, bool], None]] = None,
):
    """Filter ``archive``'s ``contents``, ``skipped_contents`` and ``directories``
    to only return those that are unknown to the SWH archive using a discovery
    algorithm.

    The `update_info_callback` is an optional argument that will get called for
    each new piece of information we get. The callback arguments are `(content,
    known)`.

    - content: the relevant model.Content object,
    - known: a boolean, True if the file is known to the archive False otherwise.
    """
    contents = archive.contents
    skipped_contents = archive.skipped_contents
    directories = archive.directories

    contents_count = len(contents)
    skipped_contents_count = len(skipped_contents)
    directories_count = len(directories)

    graph = RandomDirSamplingDiscoveryGraph(
        contents,
        skipped_contents,
        directories,
        update_info_callback=update_info_callback,
    )

    while graph.undecided:
        sample = graph.get_sample()
        graph.do_query(archive, sample)

    contents = [c for c in contents if c.sha1_git in graph.unknown]
    skipped_contents = [c for c in skipped_contents if c.sha1_git in graph.unknown]
    directories = [c for c in directories if c.id in graph.unknown]

    logger.debug(
        "Filtered out %d contents, %d skipped contents and %d directories",
        contents_count - len(contents),
        skipped_contents_count - len(skipped_contents),
        directories_count - len(directories),
    )
    return (contents, skipped_contents, directories)
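To make the flow of filter_known_objects concrete, a sketch that drives it with an in-memory stand-in for the archive; StubArchive and the sample content are hypothetical, only the entry point comes from the module above:

from swh.model import model
from swh.model.discovery import filter_known_objects

class StubArchive:
    """Structurally satisfies ArchiveDiscoveryInterface, with a fixed set of
    sha1_gits the "archive" already knows."""

    def __init__(self, contents, skipped_contents, directories, known):
        self.contents = contents
        self.skipped_contents = skipped_contents
        self.directories = directories
        self._known = known

    def content_missing(self, ids):
        return [i for i in ids if i not in self._known]

    def skipped_content_missing(self, ids):
        return [i for i in ids if i not in self._known]

    def directory_missing(self, ids):
        return [i for i in ids if i not in self._known]

content = model.Content.from_data(b"hello\n")
archive = StubArchive([content], [], [], known=set())
unknown, _, _ = filter_known_objects(archive)
assert unknown == [content]  # the stub knows nothing, so the content is unknown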
@@ -27,7 +27,10 @@ def validate_against_schema(model, schema, value):
     if not isinstance(value, dict):
         raise ValidationError(
             "Unexpected type %(type)s for %(model)s, expected dict",
-            params={"model": model, "type": value.__class__.__name__,},
+            params={
+                "model": model,
+                "type": value.__class__.__name__,
+            },
             code="model-unexpected-type",
         )
......
@@ -96,7 +96,9 @@ def validate_hash(value, hash_type):
         raise ValidationError(
             "Unexpected type %(type)s for hash, expected str or bytes",
-            params={"type": value.__class__.__name__,},
+            params={
+                "type": value.__class__.__name__,
+            },
             code="unexpected-hash-value-type",
         )
......
@@ -18,7 +18,10 @@ def validate_type(value, type):
         typestr = type.__name__
     raise ValidationError(
         "Unexpected type %(type)s, expected %(expected_type)s",
-        params={"type": value.__class__.__name__, "expected_type": typestr,},
+        params={
+            "type": value.__class__.__name__,
+            "expected_type": typestr,
+        },
         code="unexpected-type",
     )
......
This diff is collapsed.
@@ -39,15 +39,32 @@ from .collections import ImmutableDict
 from .hashutil import git_object_header, hash_to_bytehex


+def content_git_object(content: model.Content) -> bytes:
+    """Formats a content as a git blob.
+
+    A content's identifier is the blob sha1 à la git of the tagged content.
+    """
+    content = cast(model.Content, content)
+    if content.data is None:
+        raise model.MissingData("Content data is None, cannot format.")
+
+    return git_object_header("blob", len(content.data)) + content.data
+
+
 def directory_entry_sort_key(entry: model.DirectoryEntry):
     """The sorting key for tree entries"""
     if isinstance(entry, dict):
         # For backward compatibility
-        entry = model.DirectoryEntry.from_dict(entry)
-    if entry.type == "dir":
-        return entry.name + b"/"
+        type_ = entry["type"]
+        name = entry["name"]
     else:
-        return entry.name
+        type_ = entry.type
+        name = entry.name
+
+    if type_ == "dir":
+        return name + b"/"
+    else:
+        return name


 @lru_cache()
@@ -177,7 +194,13 @@ def directory_git_object(directory: Union[Dict, model.Directory]) -> bytes:
     for entry in sorted(directory.entries, key=directory_entry_sort_key):
         components.extend(
-            [_perms_to_bytes(entry.perms), b"\x20", entry.name, b"\x00", entry.target,]
+            [
+                _perms_to_bytes(entry.perms),
+                b"\x20",
+                entry.name,
+                b"\x00",
+                entry.target,
+            ]
         )

     return format_git_object_from_parts("tree", components)
@@ -221,10 +244,7 @@ def format_git_object_from_headers(
     if message is not None:
         entries.extend((b"\n", message))

-    concatenated_entries = b"".join(entries)
-
-    header = git_object_header(git_type, len(concatenated_entries))
-    return header + concatenated_entries
+    return format_git_object_from_parts(git_type, entries)


 def format_git_object_from_parts(git_type: str, parts: Iterable[bytes]) -> bytes:
@@ -340,10 +360,15 @@ def revision_git_object(revision: Union[Dict, model.Revision]) -> bytes:
         if parent:
             headers.append((b"parent", hash_to_bytehex(parent)))

-    headers.append((b"author", format_author_data(revision.author, revision.date)))
-    headers.append(
-        (b"committer", format_author_data(revision.committer, revision.committer_date),)
-    )
+    if revision.author is not None:
+        headers.append((b"author", format_author_data(revision.author, revision.date)))
+    if revision.committer is not None:
+        headers.append(
+            (
+                b"committer",
+                format_author_data(revision.committer, revision.committer_date),
+            )
+        )

     # Handle extra headers
     metadata = revision.metadata or ImmutableDict()
@@ -356,14 +381,14 @@ def revision_git_object(revision: Union[Dict, model.Revision]) -> bytes:
     return format_git_object_from_headers("commit", headers, revision.message)


-def target_type_to_git(target_type: model.ObjectType) -> bytes:
+def target_type_to_git(target_type: model.ReleaseTargetType) -> bytes:
     """Convert a software heritage target type to a git object type"""
     return {
-        model.ObjectType.CONTENT: b"blob",
-        model.ObjectType.DIRECTORY: b"tree",
-        model.ObjectType.REVISION: b"commit",
-        model.ObjectType.RELEASE: b"tag",
-        model.ObjectType.SNAPSHOT: b"refs",
+        model.ReleaseTargetType.CONTENT: b"blob",
+        model.ReleaseTargetType.DIRECTORY: b"tree",
+        model.ReleaseTargetType.REVISION: b"commit",
+        model.ReleaseTargetType.RELEASE: b"tag",
+        model.ReleaseTargetType.SNAPSHOT: b"refs",
     }[target_type]
@@ -391,7 +416,9 @@ def release_git_object(release: Union[Dict, model.Release]) -> bytes:
     return format_git_object_from_headers("tag", headers, release.message)


-def snapshot_git_object(snapshot: Union[Dict, model.Snapshot]) -> bytes:
+def snapshot_git_object(
+    snapshot: Union[Dict, model.Snapshot], *, ignore_unresolved: bool = False
+) -> bytes:
     """Formats a snapshot as a git-like object.

     Snapshots are a set of named branches, which are pointers to objects at any
@@ -435,6 +462,10 @@ def snapshot_git_object(snapshot: Union[Dict, model.Snapshot]) -> bytes:
     Note that, akin to directory manifests, there is no separator between
     entries. Because of symbolic branches, identifiers are of arbitrary
     length but are length-encoded to avoid ambiguity.

+    Args:
+        ignore_unresolved: if False (the default), raises an exception when
+            alias branches point to non-existing branches
+
     """
     if isinstance(snapshot, dict):
         # For backward compatibility
@@ -454,7 +485,7 @@ def snapshot_git_object(snapshot: Union[Dict, model.Snapshot]) -> bytes:
         if not target:
             target_type = b"dangling"
             target_id = b""
-        elif target.target_type == model.TargetType.ALIAS:
+        elif target.target_type == model.SnapshotTargetType.ALIAS:
             target_type = b"alias"
             target_id = target.target
             if target_id not in snapshot.branches or target_id == name:
@@ -474,7 +505,7 @@ def snapshot_git_object(snapshot: Union[Dict, model.Snapshot]) -> bytes:
             ]
         )

-    if unresolved:
+    if unresolved and not ignore_unresolved:
         raise ValueError(
             "Branch aliases unresolved: %s"
             % ", ".join("%r -> %r" % x for x in unresolved),
@@ -485,7 +516,7 @@ def snapshot_git_object(snapshot: Union[Dict, model.Snapshot]) -> bytes:
 def raw_extrinsic_metadata_git_object(
-    metadata: Union[Dict, model.RawExtrinsicMetadata]
+    metadata: Union[Dict, model.RawExtrinsicMetadata],
 ) -> bytes:
     """Formats RawExtrinsicMetadata as a git-like object.
@@ -559,7 +590,10 @@ def raw_extrinsic_metadata_git_object(
             b"authority",
             f"{metadata.authority.type.value} {metadata.authority.url}".encode(),
         ),
-        (b"fetcher", f"{metadata.fetcher.name} {metadata.fetcher.version}".encode(),),
+        (
+            b"fetcher",
+            f"{metadata.fetcher.name} {metadata.fetcher.version}".encode(),
+        ),
         (b"format", metadata.format.encode()),
     ]
@@ -597,6 +631,8 @@ def extid_git_object(extid: model.ExtID) -> bytes:
         [extid_version $Str]
         extid $Bytes
         target $CoreSwhid
+        [payload_type $StrWithoutSpaces]
+        [payload $ContentIdentifier]

     ```

     $StrWithoutSpaces is an ASCII string, and may not contain spaces.
@@ -605,6 +641,10 @@ def extid_git_object(extid: model.ExtID) -> bytes:
     space after them.

     The extid_version line is only generated if the version is non-zero.

+    The payload_type and payload lines are only generated if they are not
+    :const:`None`. $ContentIdentifier is the object ID of a content object.
+
     """

     headers = [
@@ -615,7 +655,18 @@ def extid_git_object(extid: model.ExtID) -> bytes:
         headers.append((b"extid_version", str(extid_version).encode("ascii")))

     headers.extend(
-        [(b"extid", extid.extid), (b"target", str(extid.target).encode("ascii")),]
+        [
+            (b"extid", extid.extid),
+            (b"target", str(extid.target).encode("ascii")),
+        ]
     )

+    payload_type = extid.payload_type
+    if payload_type is not None:
+        headers.append((b"payload_type", payload_type.encode("ascii")))
+
+    payload = extid.payload
+    if payload is not None:
+        headers.append((b"payload", payload))
+
     return format_git_object_from_headers("extid", headers)
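As a sanity check on the tree manifest format handled above, a small sketch that formats a one-entry directory and prints its id; the entry's name, target, and permissions are made-up values:

from swh.model import model
from swh.model.git_objects import directory_git_object
from swh.model.hashutil import hash_to_hex

d = model.Directory(
    entries=(
        model.DirectoryEntry(
            name=b"README", type="file", target=bytes(20), perms=0o100644
        ),
    )
)
manifest = directory_git_object(d)
assert manifest.startswith(b"tree ")  # git-style header: b"tree <length>\x00"
print(hash_to_hex(d.id))              # sha1 of the manifest, the SWHID's hash part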
@@ -56,9 +56,11 @@ import functools
 import hashlib
 from io import BytesIO
 import os
-from typing import Callable, Dict, Optional
+from typing import Callable, Dict, Optional, Union

-ALGORITHMS = set(["sha1", "sha256", "sha1_git", "blake2s256", "blake2b512", "md5"])
+ALGORITHMS = set(
+    ["sha1", "sha256", "sha1_git", "blake2s256", "blake2b512", "md5", "sha512"]
+)
 """Hashing algorithms supported by this module"""

 DEFAULT_ALGORITHMS = set(["sha1", "sha256", "sha1_git", "blake2s256"])
@@ -70,7 +72,7 @@ Subset of :const:`ALGORITHMS`.
 HASH_BLOCK_SIZE = 32768
 """Block size for streaming hash computations made in this module"""

-_blake2_hash_cache = {}  # type: Dict[str, Callable]
+_blake2_hash_cache: Dict[str, Callable] = {}


 class MultiHash:
@@ -160,9 +162,7 @@ class MultiHash:
 def _new_blake2_hash(algo):
-    """Return a function that initializes a blake2 hash.
-
-    """
+    """Return a function that initializes a blake2 hash."""
     if algo in _blake2_hash_cache:
         return _blake2_hash_cache[algo]()
@@ -295,7 +295,7 @@ def hash_git_data(data, git_type, base_algo="sha1"):
 @functools.lru_cache()
-def hash_to_hex(hash):
+def hash_to_hex(hash: Union[str, bytes]) -> str:
     """Converts a hash (in hex or bytes form) to its hexadecimal ascii form

     Args:
@@ -311,7 +311,7 @@ def hash_to_hex(hash):
 @functools.lru_cache()
-def hash_to_bytehex(hash):
+def hash_to_bytehex(hash: bytes) -> bytes:
     """Converts a hash to its hexadecimal bytes representation

     Args:
@@ -324,7 +324,7 @@ def hash_to_bytehex(hash):
 @functools.lru_cache()
-def hash_to_bytes(hash):
+def hash_to_bytes(hash: Union[str, bytes]) -> bytes:
     """Converts a hash (in hex or bytes form) to its raw bytes form

     Args:
@@ -340,7 +340,7 @@ def hash_to_bytes(hash):
 @functools.lru_cache()
-def bytehex_to_hash(hex):
+def bytehex_to_hash(hex: bytes) -> bytes:
     """Converts a hexadecimal bytes representation of a hash to that hash

     Args:
......
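A round-trip sketch for the hashutil helpers annotated in this hunk; the input bytes are arbitrary:

from swh.model.hashutil import MultiHash, hash_to_bytes, hash_to_hex

digests = MultiHash.from_data(b"foo\n").digest()
assert {"sha1", "sha1_git", "sha256", "blake2s256"} <= set(digests)

hex_sha1 = hash_to_hex(digests["sha1"])            # bytes -> hex str
assert hash_to_bytes(hex_sha1) == digests["sha1"]  # hex str -> raw bytes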
This diff is collapsed.
# Copyright (C) 2015-2021  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from typing import Any, Dict
import warnings

from . import model

# Reexport for backward compatibility
from .git_objects import *  # noqa
from .hashutil import MultiHash, hash_to_hex

# Reexport for backward compatibility
from .swhids import *  # noqa

warnings.warn(
    "The swh.model.identifiers module is deprecated. "
    "SWHID-related classes were moved to swh.model.swhids, and identifier "
    "computation is now done directly with swh.model.model classes.",
    DeprecationWarning,
    stacklevel=2,
)

# The following are deprecated aliases of the variants defined in ObjectType
# while transitioning from SWHID to QualifiedSWHID
ORIGIN = "origin"
SNAPSHOT = "snapshot"
REVISION = "revision"
RELEASE = "release"
DIRECTORY = "directory"
CONTENT = "content"
RAW_EXTRINSIC_METADATA = "raw_extrinsic_metadata"


def content_identifier(content: Dict[str, Any]) -> Dict[str, bytes]:
    """Deprecated, use :class:`swh.model.Content` instead:
    ``content_identifier(d)`` is equivalent to:
    ``{k: hash_to_hex(v) for (k, v) in Content.from_data(d["data"]).hashes().items()}``
    """
    return MultiHash.from_data(content["data"]).digest()


def directory_identifier(directory: Dict[str, Any]) -> str:
    """Deprecated, use :class:`swh.model.Directory` instead:
    ``directory_identifier(d)`` is equivalent to:
    ``hash_to_hex(Directory.from_dict(d).id)``.

    See :func:`swh.model.git_objects.directory_git_object` for details of the
    format used to generate this identifier."""
    return hash_to_hex(model.Directory.from_dict(directory).id)


def revision_identifier(revision: Dict[str, Any]) -> str:
    """Deprecated, use :class:`swh.model.Revision` instead:
    ``revision_identifier(d)`` is equivalent to:
    ``hash_to_hex(Revision.from_dict(d).id)``.

    See :func:`swh.model.git_objects.revision_git_object` for details of the
    format used to generate this identifier."""
    return hash_to_hex(model.Revision.from_dict(revision).id)


def release_identifier(release: Dict[str, Any]) -> str:
    """Deprecated, use :class:`swh.model.Release` instead:
    ``release_identifier(d)`` is equivalent to:
    ``hash_to_hex(Release.from_dict(d).id)``.

    See :func:`swh.model.git_objects.release_git_object` for details of the
    format used to generate this identifier."""
    return hash_to_hex(model.Release.from_dict(release).id)


def snapshot_identifier(snapshot: Dict[str, Any]) -> str:
    """Deprecated, use :class:`swh.model.Snapshot` instead:
    ``snapshot_identifier(d)`` is equivalent to:
    ``hash_to_hex(Snapshot.from_dict(d).id)``.

    See :func:`swh.model.git_objects.snapshot_git_object` for details of the
    format used to generate this identifier."""
    return hash_to_hex(model.Snapshot.from_dict(snapshot).id)


def origin_identifier(origin):
    """Deprecated, use :class:`swh.model.Origin` instead:
    ``origin_identifier(url)`` is equivalent to:
    ``hash_to_hex(Origin(url=url).id)``.
    """
    return hash_to_hex(model.Origin.from_dict(origin).id)
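The shim above maps each deprecated helper onto the new model classes; the equivalences are stated in the docstrings. A quick check for the directory case (importing the module emits the DeprecationWarning):

from swh.model import model
from swh.model.hashutil import hash_to_hex
from swh.model.identifiers import directory_identifier

empty_dir = {"entries": []}
assert directory_identifier(empty_dir) == hash_to_hex(
    model.Directory.from_dict(empty_dir).id
)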
-# Copyright (C) 2017-2020  The Software Heritage developers
+# Copyright (C) 2017-2022  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 """Merkle tree data structure"""

+from __future__ import annotations
+
 import abc
-from collections.abc import Mapping
-from typing import Dict, Iterator, List, Set
-
-
-def deep_update(left, right):
-    """Recursively update the left mapping with deeply nested values from the right
-    mapping.
-
-    This function is useful to merge the results of several calls to
-    :func:`MerkleNode.collect`.
-
-    Arguments:
-      left: a mapping (modified by the update operation)
-      right: a mapping
-
-    Returns:
-      the left mapping, updated with nested values from the right mapping
-
-    Example:
-        >>> a = {
-        ...     'key1': {
-        ...         'key2': {
-        ...             'key3': 'value1/2/3',
-        ...         },
-        ...     },
-        ... }
-        >>> deep_update(a, {
-        ...     'key1': {
-        ...         'key2': {
-        ...             'key4': 'value1/2/4',
-        ...         },
-        ...     },
-        ... }) == {
-        ...     'key1': {
-        ...         'key2': {
-        ...             'key3': 'value1/2/3',
-        ...             'key4': 'value1/2/4',
-        ...         },
-        ...     },
-        ... }
-        True
-        >>> deep_update(a, {
-        ...     'key1': {
-        ...         'key2': {
-        ...             'key3': 'newvalue1/2/3',
-        ...         },
-        ...     },
-        ... }) == {
-        ...     'key1': {
-        ...         'key2': {
-        ...             'key3': 'newvalue1/2/3',
-        ...             'key4': 'value1/2/4',
-        ...         },
-        ...     },
-        ... }
-        True
-    """
-    for key, rvalue in right.items():
-        if isinstance(rvalue, Mapping):
-            new_lvalue = deep_update(left.get(key, {}), rvalue)
-            left[key] = new_lvalue
-        else:
-            left[key] = rvalue
-    return left
+from typing import Any, Dict, Iterator, List, Set


 class MerkleNode(dict, metaclass=abc.ABCMeta):
@@ -141,7 +79,7 @@ class MerkleNode(dict, metaclass=abc.ABCMeta):
         for parent in self.parents:
             parent.invalidate_hash()

-    def update_hash(self, *, force=False):
+    def update_hash(self, *, force=False) -> Any:
         """Recursively compute the hash of the current node.

         Args:
@@ -161,14 +99,17 @@ class MerkleNode(dict, metaclass=abc.ABCMeta):
         return self.__hash

     @property
-    def hash(self):
+    def hash(self) -> Any:
         """The hash of the current node, as calculated by
         :func:`compute_hash`.
         """
         return self.update_hash()

+    def __hash__(self):
+        return hash(self.hash)
+
     @abc.abstractmethod
-    def compute_hash(self):
+    def compute_hash(self) -> Any:
         """Compute the hash of the current node.

         The hash should depend on the data of the node, as well as on hashes
@@ -223,47 +164,24 @@ class MerkleNode(dict, metaclass=abc.ABCMeta):
         """
         return self.data

-    def collect_node(self, **kwargs):
-        """Collect the data for the current node, for use by :func:`collect`.
-
-        Arguments:
-          kwargs: passed as-is to :func:`get_data`.
-
-        Returns:
-          A :class:`dict` compatible with :func:`collect`.
-        """
+    def collect_node(self) -> Set[MerkleNode]:
+        """Collect the current node if it has not been yet, for use by :func:`collect`."""
         if not self.collected:
             self.collected = True
-            return {self.object_type: {self.hash: self.get_data(**kwargs)}}
+            return {self}
         else:
-            return {}
+            return set()

-    def collect(self, **kwargs):
-        """Collect the data for all nodes in the subtree rooted at `self`.
-
-        The data is deduplicated by type and by hash.
-
-        Arguments:
-          kwargs: passed as-is to :func:`get_data`.
+    def collect(self) -> Set[MerkleNode]:
+        """Collect the added and modified nodes in the subtree rooted at `self`
+        since the last collect operation.

         Returns:
-          A :class:`dict` with the following structure::
-
-            {
-              'typeA': {
-                node1.hash: node1.get_data(),
-                node2.hash: node2.get_data(),
-              },
-              'typeB': {
-                node3.hash: node3.get_data(),
-                ...
-              },
-              ...
-            }
+          A :class:`set` of collected nodes
         """
-        ret = self.collect_node(**kwargs)
+        ret = self.collect_node()
         for child in self.values():
-            deep_update(ret, child.collect(**kwargs))
+            ret.update(child.collect())

         return ret
@@ -277,14 +195,14 @@ class MerkleNode(dict, metaclass=abc.ABCMeta):
         for child in self.values():
             child.reset_collect()

-    def iter_tree(self, dedup=True) -> Iterator["MerkleNode"]:
+    def iter_tree(self, dedup=True) -> Iterator[MerkleNode]:
         """Yields all children nodes, recursively. Common nodes are deduplicated
         by default (deduplication can be turned off setting the given argument
         'dedup' to False).
         """
-        yield from self._iter_tree(set(), dedup)
+        yield from self._iter_tree(seen=set(), dedup=dedup)

-    def _iter_tree(self, seen: Set[bytes], dedup) -> Iterator["MerkleNode"]:
+    def _iter_tree(self, seen: Set[bytes], dedup) -> Iterator[MerkleNode]:
         if self.hash not in seen:
             if dedup:
                 seen.add(self.hash)
@@ -299,7 +217,7 @@ class MerkleLeaf(MerkleNode):
     A Merkle leaf is simply a Merkle node with children disabled.
     """

-    __slots__ = []  # type: List[str]
+    __slots__: List[str] = []

     def __setitem__(self, name, child):
         raise ValueError("%s is a leaf" % self.__class__.__name__)
......
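A minimal sketch of the new collect() contract (sets of nodes rather than nested dicts); the Leaf/Tree subclasses and their hashing scheme are invented for illustration:

import hashlib

from swh.model.merkle import MerkleLeaf, MerkleNode

class Leaf(MerkleLeaf):
    object_type = "leaf"

    def compute_hash(self):
        return hashlib.sha1(self.data["content"]).digest()

class Tree(MerkleNode):
    object_type = "tree"

    def compute_hash(self):
        # a node's hash covers its children's hashes, Merkle-style
        return hashlib.sha1(b"".join(c.hash for c in self.values())).digest()

root = Tree()
root[b"a"] = Leaf({"content": b"foo"})
assert root.collect() == {root, root[b"a"]}  # a set of nodes
assert root.collect() == set()               # nothing new since last collect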
This diff is collapsed.
@@ -123,6 +123,9 @@ class _BaseSWHID(Generic[_TObjectType]):
         )

     def __str__(self) -> str:
+        return self._format_core_swhid()
+
+    def _format_core_swhid(self) -> str:
         return SWHID_SEP.join(
             [
                 self.namespace,
@@ -194,6 +197,17 @@ class CoreSWHID(_BaseSWHID[ObjectType]):
             object_id=self.object_id,
         )

+    def to_qualified(self) -> QualifiedSWHID:
+        """Converts this CoreSWHID into a QualifiedSWHID.
+
+        As QualifiedSWHID is a superset of CoreSWHID, this is lossless."""
+        return QualifiedSWHID(
+            namespace=self.namespace,
+            scheme_version=self.scheme_version,
+            object_type=self.object_type,
+            object_id=self.object_id,
+        )
+

 def _parse_core_swhid(swhid: Union[str, CoreSWHID, None]) -> Optional[CoreSWHID]:
     if swhid is None or isinstance(swhid, CoreSWHID):
@@ -203,7 +217,7 @@ def _parse_core_swhid(swhid: Union[str, CoreSWHID, None]) -> Optional[CoreSWHID]
 def _parse_lines_qualifier(
-    lines: Union[str, Tuple[int, Optional[int]], None]
+    lines: Union[str, Tuple[int, Optional[int]], None],
 ) -> Optional[Tuple[int, Optional[int]]]:
     try:
         if lines is None or isinstance(lines, tuple):
@@ -291,8 +305,9 @@ class QualifiedSWHID(_BaseSWHID[ObjectType]):
     when the anchor denotes a snapshot, the root directory is the one pointed to by HEAD
     (possibly indirectly), and undefined if such a reference is missing"""

+    Lines = Tuple[int, Optional[int]]
     lines = attr.ib(
-        type=Optional[Tuple[int, Optional[int]]],
+        type=Optional[Lines],
         default=None,
         validator=type_validator(),
         converter=_parse_lines_qualifier,
@@ -321,15 +336,26 @@ class QualifiedSWHID(_BaseSWHID[ObjectType]):
                 params={"type": value.object_type.value},
             )

+    def to_dict(self) -> Dict[str, Optional[str | bytes | CoreSWHID | Lines]]:
+        """Returns a dictionary version of this QSWHID for json serialization"""
+        return {
+            "swhid": self._format_core_swhid(),
+            "origin": self.origin,
+            "visit": self.visit,
+            "anchor": self.anchor,
+            "path": self.path,
+            "lines": self.lines,
+        }
+
     def qualifiers(self) -> Dict[str, str]:
         """Returns URL-escaped qualifiers of this SWHID, for use in serialization"""
         origin = self.origin
         if origin:
             unescaped_origin = origin
             origin = origin.replace("%", "%25")
             origin = origin.replace(";", "%3B")
-            assert urllib.parse.unquote_to_bytes(
-                origin
-            ) == urllib.parse.unquote_to_bytes(
-                unescaped_origin
+            assert (
+                urllib.parse.unquote(origin) == unescaped_origin
             ), "Escaping ';' in the origin qualifier corrupted the origin URL."

         d: Dict[str, Optional[str]] = {
@@ -350,14 +376,7 @@ class QualifiedSWHID(_BaseSWHID[ObjectType]):
         return {k: v for (k, v) in d.items() if v is not None}

     def __str__(self) -> str:
-        swhid = SWHID_SEP.join(
-            [
-                self.namespace,
-                str(self.scheme_version),
-                self.object_type.value,
-                hash_to_hex(self.object_id),
-            ]
-        )
+        swhid = self._format_core_swhid()
         qualifiers = self.qualifiers()
         if qualifiers:
             for k, v in qualifiers.items():
@@ -377,6 +396,9 @@ class QualifiedSWHID(_BaseSWHID[ObjectType]):
                 "Invalid qualifier(s): %(qualifiers)s",
                 params={"qualifiers": ", ".join(invalid_qualifiers)},
             )

+        if "origin" in qualifiers:
+            qualifiers["origin"] = urllib.parse.unquote(qualifiers["origin"])
+
         try:
             return QualifiedSWHID(**parts, **qualifiers)
         except ValueError as e:
......
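A usage sketch for the two additions above, to_qualified() and origin unquoting at parse time; the SWHID value and origin URL are arbitrary:

from swh.model.swhids import CoreSWHID, QualifiedSWHID

core = CoreSWHID.from_string("swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2")
qualified = core.to_qualified()
assert str(qualified) == str(core)  # lossless: no qualifiers yet

parsed = QualifiedSWHID.from_string(f"{core};origin=https://example.com/repo%3Bname")
assert parsed.origin == "https://example.com/repo;name"         # unquoted on parse
assert "origin=https://example.com/repo%3Bname" in str(parsed)  # re-escaped on print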
@@ -157,7 +157,9 @@ class ValidateCompound(unittest.TestCase):
     def test_validate_whole_schema_shortcut_previous_error(self):
         with self.assertRaises(ValidationError) as cm:
             compound.validate_against_schema(
-                self.test_model, self.test_schema_shortcut, self.test_value_missing,
+                self.test_model,
+                self.test_schema_shortcut,
+                self.test_value_missing,
             )

         exc = cm.exception
@@ -167,7 +169,9 @@ class ValidateCompound(unittest.TestCase):
     def test_validate_whole_schema(self):
         with self.assertRaises(ValidationError) as cm:
             compound.validate_against_schema(
-                self.test_model, self.test_schema_shortcut, self.test_value,
+                self.test_model,
+                self.test_schema_shortcut,
+                self.test_value,
             )

         # The exception should be of the form:
......
This diff is collapsed.
@@ -19,7 +19,6 @@ from swh.model.tests.swh_model_data import SAMPLE_FOLDER_SWHIDS
 from swh.model.tests.test_from_disk import DataMixin


-@pytest.mark.fs
 class TestIdentify(DataMixin, unittest.TestCase):
     def setUp(self):
         super().setUp()
@@ -78,7 +77,9 @@ class TestIdentify(DataMixin, unittest.TestCase):
         with unittest.mock.patch.dict(sys.modules, {"dulwich": None}):
             with tempfile.TemporaryDirectory(prefix="swh.model.cli") as d:
                 result = self.runner.invoke(
-                    cli.identify, ["--type", "snapshot", d], catch_exceptions=False,
+                    cli.identify,
+                    ["--type", "snapshot", d],
+                    catch_exceptions=False,
                 )
         assert result.exit_code == 1
@@ -94,7 +95,8 @@ class TestIdentify(DataMixin, unittest.TestCase):
         """identify symlink --- both itself and target"""
         regular = os.path.join(self.tmpdir_name, b"foo.txt")
         link = os.path.join(self.tmpdir_name, b"bar.txt")
-        open(regular, "w").write("foo\n")
+        with open(regular, "w") as f:
+            f.write("foo\n")

         os.symlink(os.path.basename(regular), link)
         result = self.runner.invoke(cli.identify, [link])
......
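In the same spirit as the tests above, a self-contained sketch of invoking the identify command in-process; the expected value is git's well-known blob hash for b"foo\n":

from click.testing import CliRunner

from swh.model import cli

runner = CliRunner()
with runner.isolated_filesystem():
    with open("foo.txt", "w") as f:
        f.write("foo\n")
    result = runner.invoke(cli.identify, ["--no-filename", "foo.txt"])

assert result.exit_code == 0
assert result.output.strip() == "swh:1:cnt:257cc5642cb1a054f08cc83f2d943e56fd3ebe99"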