Compare revisions

c9583bae · c9583bae · c9583bae · 098f76a7 · 098f76a7 · 098f76a7
--- a/requirements-cli.txt
+++ b/requirements-cli.txt
-swh.core
+swh.core >= 0.3
 Click
 dulwich
--- a/requirements-test.txt
+++ b/requirements-test.txt
-Click
-dulwich
-pytest
+aiohttp
+click
+pytest >= 8.1
 pytz
+types-click
+types-python-dateutil
+types-pytz
+types-deprecated
--- a/requirements.txt
+++ b/requirements.txt
 # Add here external Python modules dependencies, one per line. Module names
 # should match https://pypi.python.org/pypi names. For the full spec or
 # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html
-vcversioner
-attrs
-attrs_strict
+attrs != 21.1.0  # https://github.com/python-attrs/attrs/issues/804
+attrs_strict >= 0.0.7
+deprecated
 hypothesis
-python-dateutil
 iso8601
+python-dateutil
+typing_extensions
+
--- a/setup.cfg
+++ b/setup.cfg
-[flake8]
-# E203: whitespaces before ':' <https://github.com/psf/black/issues/315>
-# E231: missing whitespace after ','
-# W503: line break before binary operator <https://github.com/psf/black/issues/52>
-ignore = E203,E231,W503
-max-line-length = 88
--- a/setup.py
+++ b/setup.py
-#!/usr/bin/env python3
-# Copyright (C) 2015-2018  The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-from setuptools import setup, find_packages
-
-from os import path
-from io import open
-
-here = path.abspath(path.dirname(__file__))
-
-# Get the long description from the README file
-with open(path.join(here, "README.md"), encoding="utf-8") as f:
-    long_description = f.read()
-
-
-def parse_requirements(name=None):
-    if name:
-        reqf = "requirements-%s.txt" % name
-    else:
-        reqf = "requirements.txt"
-
-    requirements = []
-    if not path.exists(reqf):
-        return requirements
-
-    with open(reqf) as f:
-        for line in f.readlines():
-            line = line.strip()
-            if not line or line.startswith("#"):
-                continue
-            requirements.append(line)
-    return requirements
-
-
-blake2_requirements = ['pyblake2;python_version<"3.6"']
-
-setup(
-    name="swh.model",
-    description="Software Heritage data model",
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    author="Software Heritage developers",
-    author_email="swh-devel@inria.fr",
-    url="https://forge.softwareheritage.org/diffusion/DMOD/",
-    packages=find_packages(),
-    setup_requires=["vcversioner"],
-    install_requires=(
-        parse_requirements() + parse_requirements("swh") + blake2_requirements
-    ),
-    extras_require={
-        "cli": parse_requirements("cli"),
-        "testing": parse_requirements("test"),
-    },
-    vcversioner={},
-    include_package_data=True,
-    entry_points="""
-        [console_scripts]
-        swh-identify=swh.model.cli:identify
-        [swh.cli.subcommands]
-        identify=swh.model.cli:identify
-    """,
-    classifiers=[
-        "Programming Language :: Python :: 3",
-        "Intended Audience :: Developers",
-        "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
-        "Operating System :: OS Independent",
-        "Development Status :: 5 - Production/Stable",
-    ],
-    project_urls={
-        "Bug Reports": "https://forge.softwareheritage.org/maniphest",
-        "Funding": "https://www.softwareheritage.org/donate",
-        "Source": "https://forge.softwareheritage.org/source/swh-model",
-    },
-)
--- a/swh/__init__.py
+++ b/swh/__init__.py
-from pkgutil import extend_path
-from typing import Iterable
-
-__path__ = extend_path(__path__, __name__)  # type: Iterable[str]
--- a/swh/model/cli.py
+++ b/swh/model/cli.py
-# Copyright (C) 2018-2019  The Software Heritage developers
+# Copyright (C) 2018-2020  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

-import click
-import dulwich.repo
 import os
 import sys
+from typing import Callable, Dict, Iterable, Optional
+
+# WARNING: do not import unnecessary things here to keep cli startup time under
+# control
+try:
+    import click
+except ImportError:
+    # Click is only pulled in by the "cli" extra; nothing in this module works
+    # without it, so report the problem on stderr and stop immediately.
+    print(
+        "Cannot run swh-identify; the Click package is not installed. "
+        "Please install 'swh.model[cli]' for full functionality.",
+        file=sys.stderr,
+    )
+    # sys.exit() instead of the bare exit(): the latter is injected by the
+    # ``site`` module and is not guaranteed to exist (e.g. ``python -S``).
+    sys.exit(1)

-from functools import partial
-from urllib.parse import urlparse
+try:
+    import swh.core.cli

-from swh.model import hashutil
-from swh.model import identifiers as pids
-from swh.model.exceptions import ValidationError
-from swh.model.from_disk import Content, Directory
+    cli_command = swh.core.cli.swh.command
+except ImportError:
+    # stub so that swh-identify can be used when swh-core isn't installed
+    cli_command = click.command

+from swh.model.from_disk import Directory
+from swh.model.swhids import CoreSWHID

 CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])

@@ -29,43 +42,81 @@ _DULWICH_TYPES = {
 }


-class PidParamType(click.ParamType):
-    name = "persistent identifier"
+class CoreSWHIDParamType(click.ParamType):
+    """Click argument that accepts a core SWHID and returns them as
+    :class:`swh.model.swhids.CoreSWHID` instances"""
+
+    name = "SWHID"
+
+    def convert(self, value, param, ctx) -> CoreSWHID:
+        """Parse ``value`` with :meth:`CoreSWHID.from_string`.
+
+        A ``ValidationError`` raised by the parser is reported through
+        ``self.fail`` with a message that quotes the rejected value.
+        """
+        # Imported here rather than at module level, in line with the module's
+        # warning about keeping CLI start-up imports to a minimum.
+        from swh.model.exceptions import ValidationError

-    def convert(self, value, param, ctx):
        try:
-            pids.parse_persistent_identifier(value)
-            return value  # return as string, as we need just that
+            return CoreSWHID.from_string(value)
        except ValidationError as e:
-            self.fail("%s is not a valid SWHID. %s." % (value, e), param, ctx)
+            self.fail(f'"{value}" is not a valid core SWHID: {e}', param, ctx)
+

+def swhid_of_file(path) -> CoreSWHID:
+    """Return the core SWHID of the content loaded from the file at ``path``."""
+    from swh.model.from_disk import Content

-def pid_of_file(path):
-    object = Content.from_file(path=path).get_data()
-    return pids.persistent_identifier(pids.CONTENT, object)
+    # Load the content model from disk and hand back its identifier directly.
+    return Content.from_file(path=path).swhid()


-def pid_of_file_content(data):
-    object = Content.from_bytes(mode=644, data=data).get_data()
-    return pids.persistent_identifier(pids.CONTENT, object)
+def swhid_of_file_content(data) -> CoreSWHID:
+    """Return the core SWHID of a content built from the raw bytes ``data``."""
+    from swh.model.from_disk import Content

+    # NOTE(review): 644 is a decimal literal; if a POSIX permission was meant
+    # it should probably read 0o644 -- confirm against Content.from_bytes.
+    return Content.from_bytes(mode=644, data=data).swhid()

-def pid_of_dir(path):
-    object = Directory.from_disk(path=path).get_data()
-    return pids.persistent_identifier(pids.DIRECTORY, object)

+def model_of_dir(
+    path: bytes,
+    exclude_patterns: Optional[Iterable[bytes]] = None,
+    update_info: Optional[Callable[[int], None]] = None,
+) -> Directory:
+    """Build the :class:`Directory` model of the tree rooted at ``path``.
+
+    Args:
+        path: directory to load, as bytes.
+        exclude_patterns: when non-empty, turned into a path filter with
+            ``ignore_directories_patterns``; otherwise ``accept_all_paths``
+            is used as the filter.
+        update_info: passed to ``Directory.from_disk`` as its
+            ``progress_callback``.
+
+    Returns:
+        The object returned by ``Directory.from_disk``.
+    """
+    from swh.model.from_disk import accept_all_paths, ignore_directories_patterns

-def pid_of_origin(url):
-    pid = pids.PersistentId(
-        object_type="origin", object_id=pids.origin_identifier({"url": url})
+    path_filter = (
+        ignore_directories_patterns(path, exclude_patterns)
+        if exclude_patterns
+        else accept_all_paths
    )
-    return str(pid)

+    return Directory.from_disk(
+        path=path, path_filter=path_filter, progress_callback=update_info
+    )
+
+
+def swhid_of_dir(
+    path: bytes, exclude_patterns: Optional[Iterable[bytes]] = None
+) -> CoreSWHID:
+    """Return the core SWHID of the directory at ``path``.
+
+    ``exclude_patterns`` is forwarded unchanged to :func:`model_of_dir`.
+    """
+    return model_of_dir(path, exclude_patterns).swhid()
+
+
+def swhid_of_origin(url):
+    """Return the SWHID of the origin identified by ``url``."""
+    from swh.model.model import Origin
+
+    origin = Origin(url)
+    return origin.swhid()
+
+
+def swhid_of_git_repo(path) -> CoreSWHID:
+    try:
+        import dulwich.repo
+    except ImportError:
+        raise click.ClickException(
+            "Cannot compute snapshot identifier; the Dulwich package is not installed. "
+            "Please install 'swh.model[cli]' for full functionality.",
+        )
+
+    from swh.model import hashutil
+    from swh.model.model import Snapshot

-def pid_of_git_repo(path):
    repo = dulwich.repo.Repo(path)

-    branches = {}
+    branches: Dict[bytes, Optional[Dict]] = {}
    for ref, target in repo.refs.as_dict().items():
        obj = repo[target]
        if obj:
@@ -84,13 +135,14 @@ def pid_of_git_repo(path):

    snapshot = {"branches": branches}

-    pid = pids.PersistentId(
-        object_type="snapshot", object_id=pids.snapshot_identifier(snapshot)
-    )
-    return str(pid)
+    return Snapshot.from_dict(snapshot).swhid()


-def identify_object(obj_type, follow_symlinks, obj):
+def identify_object(
+    obj_type: str, follow_symlinks: bool, exclude_patterns: Iterable[bytes], obj
+) -> str:
+    from urllib.parse import urlparse
+
    if obj_type == "auto":
        if obj == "-" or os.path.isfile(obj):
            obj_type = "content"
@@ -105,32 +157,30 @@ def identify_object(obj_type, follow_symlinks, obj):
            except ValueError:
                raise click.BadParameter("cannot detect object type for %s" % obj)

-    pid = None
-
    if obj == "-":
        content = sys.stdin.buffer.read()
-        pid = pid_of_file_content(content)
+        swhid = str(swhid_of_file_content(content))
    elif obj_type in ["content", "directory"]:
        path = obj.encode(sys.getfilesystemencoding())
        if follow_symlinks and os.path.islink(obj):
            path = os.path.realpath(obj)
        if obj_type == "content":
-            pid = pid_of_file(path)
+            swhid = str(swhid_of_file(path))
        elif obj_type == "directory":
-            pid = pid_of_dir(path)
+            swhid = str(swhid_of_dir(path, exclude_patterns))
    elif obj_type == "origin":
-        pid = pid_of_origin(obj)
+        swhid = str(swhid_of_origin(obj))
    elif obj_type == "snapshot":
-        pid = pid_of_git_repo(obj)
+        swhid = str(swhid_of_git_repo(obj))
    else:  # shouldn't happen, due to option validation
        raise click.BadParameter("invalid object type: " + obj_type)

    # note: we return original obj instead of path here, to preserve user-given
    # file name in output
-    return (obj, pid)
+    return swhid


-@click.command(context_settings=CONTEXT_SETTINGS)
+@cli_command(context_settings=CONTEXT_SETTINGS)
 @click.option(
    "--dereference/--no-dereference",
    "follow_symlinks",
@@ -152,64 +202,118 @@ def identify_object(obj_type, follow_symlinks, obj):
    type=click.Choice(["auto", "content", "directory", "origin", "snapshot"]),
    help="type of object to identify (default: auto)",
 )
+@click.option(
+    "--exclude",
+    "-x",
+    "exclude_patterns",
+    metavar="PATTERN",
+    multiple=True,
+    help="Exclude directories using glob patterns \
+    (e.g., ``*.git`` to exclude all .git directories)",
+)
 @click.option(
    "--verify",
    "-v",
    metavar="SWHID",
-    type=PidParamType(),
+    type=CoreSWHIDParamType(),
    help="reference identifier to be compared with computed one",
 )
-@click.argument("objects", nargs=-1)
-def identify(obj_type, verify, show_filename, follow_symlinks, objects):
+@click.option(
+    "-r",
+    "--recursive",
+    is_flag=True,
+    help="compute SWHID recursively",
+)
+@click.argument("objects", nargs=-1, required=True)
+def identify(
+    obj_type,
+    verify,
+    show_filename,
+    follow_symlinks,
+    objects,
+    exclude_patterns,
+    recursive,
+):
    """Compute the Software Heritage persistent identifier (SWHID) for the given
    source code object(s).

    For more details about SWHIDs see:

-    \b
    https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html

-    \b
-    Examples:
+    Tip: you can pass "-" to identify the content of standard input.
+
+    Examples::

-    \b
      $ swh identify fork.c kmod.c sched/deadline.c
      swh:1:cnt:2e391c754ae730bd2d8520c2ab497c403220c6e3    fork.c
      swh:1:cnt:0277d1216f80ae1adeed84a686ed34c9b2931fc2    kmod.c
      swh:1:cnt:57b939c81bce5d06fa587df8915f05affbe22b82    sched/deadline.c

-    \b
      $ swh identify --no-filename /usr/src/linux/kernel/
      swh:1:dir:f9f858a48d663b3809c9e2f336412717496202ab

-    \b
      $ git clone --mirror https://forge.softwareheritage.org/source/helloworld.git
+
+      $ swh identify --type snapshot helloworld.git/
-      swh:1:snp:510aa88bdc517345d258c1fc2babcd0e1f905e93	helloworld.git
+      swh:1:snp:510aa88bdc517345d258c1fc2babcd0e1f905e93    helloworld.git
+
+    """
+    from functools import partial
+    import logging

-    """  # NoQA  # overlong lines in shell examples are fine
-    if not objects:
-        objects = ["-"]
+    # The patterns are encoded because model_of_dir() is annotated as taking
+    # bytes patterns.
+    if exclude_patterns:
+        exclude_patterns = {pattern.encode() for pattern in exclude_patterns}

    if verify and len(objects) != 1:
        raise click.BadParameter("verification requires a single object")

-    results = map(partial(identify_object, obj_type, follow_symlinks), objects)
-
-    if verify:
-        pid = next(results)[1]
-        if verify == pid:
-            click.echo("SWHID match: %s" % pid)
-            sys.exit(0)
-        else:
-            click.echo("SWHID mismatch: %s != %s" % (verify, pid))
-            sys.exit(1)
-    else:
-        for (obj, pid) in results:
-            msg = pid
-            if show_filename:
-                msg = "%s\t%s" % (pid, os.fsdecode(obj))
+    # Recursive mode only considers the first object, and silently falls back
+    # to the plain mode when that object is not a directory.
+    if recursive and not os.path.isdir(objects[0]):
+        recursive = False
+        logging.warning(
+            "recursive option disabled, input is not a directory object"
+        )
+
+    if recursive:
+        if verify:
+            raise click.BadParameter(
+                "verification of recursive object identification is not supported"
+            )
+
+        # Membership test: the previous `== ("auto" or "directory")` only ever
+        # compared against "auto" and rejected an explicit "directory" type.
+        if obj_type not in ("auto", "directory"):
+            raise click.BadParameter(
+                "recursive identification is supported only for directories"
+            )
+
+        root = os.fsencode(objects[0])
+        dir_obj = model_of_dir(root, exclude_patterns)
+        for sub_obj in dir_obj.iter_tree():
+            # The name is read from "path" when that key exists, else "data".
+            path_name = "path" if "path" in sub_obj.data else "data"
+            path = os.fsdecode(sub_obj.data[path_name])
+            swhid = str(sub_obj.swhid())
+            msg = f"{swhid}\t{path}" if show_filename else f"{swhid}"
            click.echo(msg)
+    else:
+        # Pair every user-given object with its computed SWHID (lazily).
+        results = zip(
+            objects,
+            map(
+                partial(identify_object, obj_type, follow_symlinks, exclude_patterns),
+                objects,
+            ),
+        )
+
+        if verify:
+            # Exactly one object here (checked above); the exit status tells
+            # callers whether the computed SWHID equals the reference one.
+            swhid = next(results)[1]
+            if str(verify) == swhid:
+                click.echo("SWHID match: %s" % swhid)
+                sys.exit(0)
+            else:
+                click.echo("SWHID mismatch: %s != %s" % (verify, swhid))
+                sys.exit(1)
+        else:
+            for obj, swhid in results:
+                msg = swhid
+                if show_filename:
+                    msg = "%s\t%s" % (swhid, os.fsdecode(obj))
+                click.echo(msg)


 if __name__ == "__main__":

--- a/swh/model/collections.py
+++ b/swh/model/collections.py
+# Copyright (C) 2020-2023 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from __future__ import annotations
+
+"""Utility data structures."""
+
+from collections.abc import Mapping
+import copy
+from typing import Dict, Generic, Iterable, Optional, Tuple, TypeVar, Union
+
+KT = TypeVar("KT")
+VT = TypeVar("VT")
+
+
+class ImmutableDict(Mapping, Generic[KT, VT]):
+    """A frozen dictionary.
+
+    This class behaves like a read-only dictionary. Items are kept in a private
+    ``dict`` that is never modified after construction; the :attr:`data`
+    property exposes them as a tuple of pairs, which is what hashing uses."""
+
+    _data: Dict[KT, VT]
+
+    def __init__(
+        self,
+        data: Union[Iterable[Tuple[KT, VT]], ImmutableDict[KT, VT], Dict[KT, VT]] = (),
+    ):
+        """Build the mapping from a dict, another ImmutableDict, or an iterable
+        of ``(key, value)`` pairs. Defaults to an empty mapping."""
+        if isinstance(data, dict):
+            # Copy: keeping the caller's dict by reference would let later
+            # mutations of it change this "frozen" object and its hash.
+            self._data = dict(data)
+        elif isinstance(data, ImmutableDict):
+            # Sharing is safe, the other instance never mutates its storage.
+            self._data = data._data
+        else:
+            self._data = {k: v for k, v in data}
+
+    @property
+    def data(self):
+        """The items, as a tuple of ``(key, value)`` pairs."""
+        return tuple(self._data.items())
+
+    def __repr__(self):
+        return f"ImmutableDict({self._data!r})"
+
+    def __getitem__(self, key):
+        return self._data[key]
+
+    def __iter__(self):
+        yield from self._data
+
+    def __len__(self):
+        return len(self._data)
+
+    def items(self):
+        yield from self._data.items()
+
+    def __hash__(self):
+        # Sorting makes the hash independent of insertion order; it requires
+        # the items to be orderable.
+        return hash(tuple(sorted(self.data)))
+
+    def copy_pop(self, popped_key) -> Tuple[Optional[VT], ImmutableDict[KT, VT]]:
+        """Returns a copy of this ImmutableDict without the given key,
+        as well as the value associated to the key (``None`` if absent)."""
+        new_items = copy.deepcopy(self._data)
+        popped_value: Optional[VT] = new_items.pop(popped_key, None)
+        return (popped_value, ImmutableDict(new_items))
--- a/swh/model/discovery.py
+++ b/swh/model/discovery.py
+# Copyright (C) 2022 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+"""Primitives for finding unknown content efficiently."""
+
+from __future__ import annotations
+
+from collections import namedtuple
+import itertools
+import logging
+import random
+from typing import (
+    Any,
+    Callable,
+    Iterable,
+    List,
+    Mapping,
+    NamedTuple,
+    Optional,
+    Set,
+    Union,
+)
+
+from typing_extensions import Protocol, runtime_checkable
+
+from .from_disk import model
+from .model import Sha1Git
+
+logger = logging.getLogger(__name__)
+
+# Maximum amount when sampling from the undecided set of directory entries
+SAMPLE_SIZE = 1000
+
+class Sample(NamedTuple):
+    """Sets of sha1 of contents, skipped contents and directories respectively."""
+
+    # Declared with the class syntax so that ``Sample`` is a real type with
+    # typed fields, rather than a variable annotated as a ``NamedTuple``.
+    contents: Set[Sha1Git]
+    skipped_contents: Set[Sha1Git]
+    directories: Set[Sha1Git]
+
+
+@runtime_checkable
+class ArchiveDiscoveryInterface(Protocol):
+    """Interface used in discovery code to abstract over ways of connecting to
+    the SWH archive (direct storage, web API, etc.) for all methods needed by
+    discovery algorithms.
+
+    The three ``*_missing`` methods have no body here; implementations are
+    expected to provide them."""
+
+    # Objects whose presence in the archive is to be determined.
+    contents: List[model.Content]
+    skipped_contents: List[model.SkippedContent]
+    directories: List[model.Directory]
+
+    def __init__(
+        self,
+        contents: List[model.Content],
+        skipped_contents: List[model.SkippedContent],
+        directories: List[model.Directory],
+    ) -> None:
+        """Store the three object lists on the instance, unchanged."""
+        self.contents = contents
+        self.skipped_contents = skipped_contents
+        self.directories = directories
+
+    def content_missing(self, contents: List[Sha1Git]) -> Iterable[Sha1Git]:
+        """List content missing from the archive by sha1
+
+        Args:
+            contents: identifiers to look up.
+
+        Returns:
+            the identifiers, among ``contents``, that the archive lacks.
+        """
+
+    def skipped_content_missing(
+        self, skipped_contents: List[Sha1Git]
+    ) -> Iterable[Sha1Git]:
+        """List skipped content missing from the archive by sha1
+
+        Args:
+            skipped_contents: identifiers to look up.
+
+        Returns:
+            the identifiers, among ``skipped_contents``, that the archive lacks.
+        """
+
+    def directory_missing(self, directories: List[Sha1Git]) -> Iterable[Sha1Git]:
+        """List directories missing from the archive by sha1
+
+        Args:
+            directories: identifiers to look up.
+
+        Returns:
+            the identifiers, among ``directories``, that the archive lacks.
+        """
+
+
+class BaseDiscoveryGraph:
+    """Creates the base structures and methods needed for discovery algorithms.
+    Subclasses should override ``get_sample`` to affect how the discovery is made.
+
+    The `update_info_callback` is an optional argument that will get called for
+    each new piece of information we get. The callback arguments are `(obj,
+    known)`.
+    - obj: the relevant model object (content, skipped content or directory),
+    - known: a boolean, True if the object is known to the archive False otherwise.
+    """
+
+    def __init__(
+        self,
+        contents,
+        skipped_contents,
+        directories,
+        update_info_callback: Optional[Callable[[Any, bool], None]] = None,
+    ):
+        """Index the given objects; everything starts out as undecided.
+
+        Each input is traversed exactly once, so one-shot iterables are fine.
+        """
+        self._all_contents: Mapping[
+            Sha1Git, Union[model.Content, model.SkippedContent]
+        ] = {}
+        self._undecided_directories: Set[Sha1Git] = set()
+        # directory id -> ids of its direct entries
+        self._children: Mapping[Sha1Git, Set[Sha1Git]] = {}
+        # entry id -> ids of the directories that directly contain it
+        self._parents: Mapping[Sha1Git, Set[Sha1Git]] = {}
+        self.undecided: Set[Sha1Git] = set()
+        self.known: Set[Sha1Git] = set()
+        self.unknown: Set[Sha1Git] = set()
+        self._update_info_callback = update_info_callback
+        # id -> model object, used to hand the object itself to the callback
+        self._sha1_to_obj = {}
+
+        for content in itertools.chain(contents, skipped_contents):
+            sha1_git = content.sha1_git
+            self.undecided.add(sha1_git)
+            self._all_contents[sha1_git] = content
+            self._sha1_to_obj[sha1_git] = content
+
+        for directory in directories:
+            dir_id = directory.id
+            self.undecided.add(dir_id)
+            self._undecided_directories.add(dir_id)
+            self._sha1_to_obj[dir_id] = directory
+            children = {entry.target for entry in directory.entries}
+            self._children[dir_id] = children
+            for target in children:
+                self._parents.setdefault(target, set()).add(dir_id)
+
+    def mark_known(self, entries: Iterable[Sha1Git]):
+        """Mark ``entries`` and those they imply as known in the SWH archive"""
+        self._mark_entries(entries, self._children, self.known)
+
+    def mark_unknown(self, entries: Iterable[Sha1Git]):
+        """Mark ``entries`` and those they imply as unknown in the SWH archive"""
+        self._mark_entries(entries, self._parents, self.unknown)
+
+    def _mark_entries(
+        self,
+        entries: Iterable[Sha1Git],
+        transitive_mapping: Mapping[Any, Any],
+        target_set: Set[Any],
+    ):
+        """Use Merkle graph properties to mark a directory entry as known or unknown.
+
+        If an entry is known, then all of its descendants are known. If it's
+        unknown, then all of its ancestors are unknown.
+
+        - ``entries``: directory entries to mark along with their ancestors/descendants
+          where applicable.
+        - ``transitive_mapping``: mapping from an entry to the next entries to mark
+          in the hierarchy, if any.
+        - ``target_set``: set where marked entries will be added.
+
+        """
+        callback = self._update_info_callback
+        to_process = set(entries)
+        while to_process:
+            current = to_process.pop()
+            target_set.add(current)
+            # Only entries that were still undecided count as new information.
+            new = current in self.undecided
+            self.undecided.discard(current)
+            self._undecided_directories.discard(current)
+            # Propagate only to entries that have not been decided yet.
+            next_entries = transitive_mapping.get(current, set()) & self.undecided
+            to_process.update(next_entries)
+            if new and callback is not None:
+                obj = self._sha1_to_obj[current]
+                callback(obj, current in self.known)
+
+    def get_sample(
+        self,
+    ) -> Sample:
+        """Return a three-tuple of samples from the undecided sets of contents,
+        skipped contents and directories respectively.
+        These samples will be queried against the storage which will tell us
+        which are known."""
+        raise NotImplementedError()
+
+    def do_query(self, archive: ArchiveDiscoveryInterface, sample: Sample) -> None:
+        """Given a three-tuple of samples, ask the archive which are known or
+        unknown and mark them as such."""
+
+        # Same order as the fields of the sample.
+        methods = (
+            archive.content_missing,
+            archive.skipped_content_missing,
+            archive.directory_missing,
+        )
+
+        for sample_per_type, method in zip(sample, methods):
+            if not sample_per_type:
+                continue
+            unknown = set(method(list(sample_per_type)))
+            # Whatever the archive did not report as missing is known.
+            known = set(sample_per_type) - unknown
+
+            self.mark_known(known)
+            self.mark_unknown(unknown)
+
+
+class RandomDirSamplingDiscoveryGraph(BaseDiscoveryGraph):
+    """Discovery graph that samples randomly among directories only.
+
+    Sampling directories gives a statistically good spread of entries in the
+    graph with a smaller population than using all types of entries. Once no
+    directory is undecided, whatever contents or skipped contents remain are
+    sent to the storage directly: they should be few and their structure flat."""
+
+    def get_sample(self) -> Sample:
+        pending_dirs = self._undecided_directories
+        if pending_dirs:
+            if len(pending_dirs) > SAMPLE_SIZE:
+                # random.sample() needs a sequence, hence the tuple.
+                picked = set(random.sample(tuple(pending_dirs), SAMPLE_SIZE))
+            else:
+                picked = set(pending_dirs)
+            return Sample(contents=set(), skipped_contents=set(), directories=picked)
+
+        # No directory left: split the remaining ids by kind of content.
+        regular = set()
+        skipped = set()
+        for sha1 in self.undecided:
+            kind = self._all_contents[sha1].object_type
+            if kind == model.Content.object_type:
+                regular.add(sha1)
+            elif kind == model.SkippedContent.object_type:
+                skipped.add(sha1)
+            else:
+                raise TypeError(f"Unexpected object type {kind}")
+
+        return Sample(contents=regular, skipped_contents=skipped, directories=set())
+
+
+def filter_known_objects(
+    archive: ArchiveDiscoveryInterface,
+    update_info_callback: Optional[Callable[[Any, bool], None]] = None,
+):
+    """Run a discovery algorithm over ``archive``'s ``contents``,
+    ``skipped_contents`` and ``directories`` and return, as a three-tuple of
+    lists in that order, only the objects that are unknown to the SWH archive.
+
+    The `update_info_callback` is an optional argument that will get called for
+    each new piece of information we get. The callback arguments are `(content,
+    known)`.
+    - content: the relevant model.Content object,
+    - known: a boolean, True if the file is known to the archive False otherwise.
+    """
+    contents = archive.contents
+    skipped_contents = archive.skipped_contents
+    directories = archive.directories
+
+    # Sizes before filtering, for the debug message at the end.
+    initial_counts = (len(contents), len(skipped_contents), len(directories))
+
+    graph = RandomDirSamplingDiscoveryGraph(
+        contents,
+        skipped_contents,
+        directories,
+        update_info_callback=update_info_callback,
+    )
+
+    # Keep querying until every object has been decided.
+    while graph.undecided:
+        graph.do_query(archive, graph.get_sample())
+
+    unknown = graph.unknown
+    contents = [c for c in contents if c.sha1_git in unknown]
+    skipped_contents = [c for c in skipped_contents if c.sha1_git in unknown]
+    directories = [d for d in directories if d.id in unknown]
+
+    logger.debug(
+        "Filtered out %d contents, %d skipped contents and %d directories",
+        initial_counts[0] - len(contents),
+        initial_counts[1] - len(skipped_contents),
+        initial_counts[2] - len(directories),
+    )
+
+    return (contents, skipped_contents, directories)
--- a/swh/model/exceptions.py
+++ b/swh/model/exceptions.py
@@ -129,3 +129,7 @@ class ValidationError(Exception):

    def __repr__(self):
        return "ValidationError(%s)" % self
+
+
+class InvalidDirectoryPath(Exception):
+    # NOTE(review): presumably raised when a directory path is rejected; the
+    # code raising it is not visible here -- confirm before relying on this.
+    pass
--- a/swh/model/fields/__init__.py
+++ b/swh/model/fields/__init__.py
@@ -6,13 +6,13 @@
 # We do our imports here but we don't use them, so flake8 complains
 # flake8: noqa

+from .compound import validate_against_schema, validate_all_keys, validate_any_key
+from .hashes import validate_sha1, validate_sha1_git, validate_sha256
 from .simple import (
-    validate_type,
-    validate_int,
-    validate_str,
    validate_bytes,
    validate_datetime,
    validate_enum,
+    validate_int,
+    validate_str,
+    validate_type,
 )
-from .hashes import validate_sha1, validate_sha1_git, validate_sha256
-from .compound import validate_against_schema, validate_all_keys, validate_any_key
--- a/swh/model/fields/compound.py
+++ b/swh/model/fields/compound.py
@@ -6,7 +6,7 @@
 from collections import defaultdict
 import itertools

-from ..exceptions import ValidationError, NON_FIELD_ERRORS
+from ..exceptions import NON_FIELD_ERRORS, ValidationError


 def validate_against_schema(model, schema, value):
@@ -27,7 +27,10 @@ def validate_against_schema(model, schema, value):
    if not isinstance(value, dict):
        raise ValidationError(
            "Unexpected type %(type)s for %(model)s, expected dict",
-            params={"model": model, "type": value.__class__.__name__,},
+            params={
+                "model": model,
+                "type": value.__class__.__name__,
+            },
            code="model-unexpected-type",
        )


--- a/swh/model/fields/hashes.py
+++ b/swh/model/fields/hashes.py
@@ -4,6 +4,7 @@
 # See top-level LICENSE file for more information

 import string
+
 from ..exceptions import ValidationError


@@ -95,7 +96,9 @@ def validate_hash(value, hash_type):

    raise ValidationError(
        "Unexpected type %(type)s for hash, expected str or bytes",
-        params={"type": value.__class__.__name__,},
+        params={
+            "type": value.__class__.__name__,
+        },
        code="unexpected-hash-value-type",
    )


--- a/swh/model/fields/simple.py
+++ b/swh/model/fields/simple.py
@@ -18,7 +18,10 @@ def validate_type(value, type):
            typestr = type.__name__
        raise ValidationError(
            "Unexpected type %(type)s, expected %(expected_type)s",
-            params={"type": value.__class__.__name__, "expected_type": typestr,},
+            params={
+                "type": value.__class__.__name__,
+                "expected_type": typestr,
+            },
            code="unexpected-type",
        )


--- a/swh/model/from_disk.py
+++ b/swh/model/from_disk.py
--- a/swh/model/identifiers.py
+++ b/swh/model/identifiers.py
--- a/swh/model/hashutil.py
+++ b/swh/model/hashutil.py
@@ -54,12 +54,13 @@ Basic usage examples:
 import binascii
 import functools
 import hashlib
-import os
-
 from io import BytesIO
-from typing import Callable, Dict
+import os
+from typing import Callable, Dict, Optional, Union

-ALGORITHMS = set(["sha1", "sha256", "sha1_git", "blake2s256", "blake2b512"])
+ALGORITHMS = set(
+    ["sha1", "sha256", "sha1_git", "blake2s256", "blake2b512", "md5", "sha512"]
+)
 """Hashing algorithms supported by this module"""

 DEFAULT_ALGORITHMS = set(["sha1", "sha256", "sha1_git", "blake2s256"])
@@ -71,7 +72,7 @@ Subset of :const:`ALGORITHMS`.
 HASH_BLOCK_SIZE = 32768
 """Block size for streaming hash computations made in this module"""

-_blake2_hash_cache = {}  # type: Dict[str, Callable]
+_blake2_hash_cache: Dict[str, Callable] = {}


 class MultiHash:
@@ -161,9 +162,7 @@ class MultiHash:


 def _new_blake2_hash(algo):
-    """Return a function that initializes a blake2 hash.
-
-    """
+    """Return a function that initializes a blake2 hash."""
    if algo in _blake2_hash_cache:
        return _blake2_hash_cache[algo]()

@@ -184,20 +183,8 @@ def _new_blake2_hash(algo):
                "Digest size for algorithm %s must be a multiple of 8" % algo
            )

-    if lalgo in hashlib.algorithms_available:
-        # Handle the case where OpenSSL ships the given algorithm
-        # (e.g. Python 3.5 on Debian 9 stretch)
-        _blake2_hash_cache[algo] = lambda: hashlib.new(lalgo)
-    else:
-        # Try using the built-in implementation for Python 3.6+
-        if blake_family in hashlib.algorithms_available:
-            blake2 = getattr(hashlib, blake_family)
-        else:
-            import pyblake2
-
-            blake2 = getattr(pyblake2, blake_family)
-
-        _blake2_hash_cache[algo] = lambda: blake2(digest_size=digest_size)
+    blake2 = getattr(hashlib, blake_family)
+    _blake2_hash_cache[algo] = lambda: blake2(digest_size=digest_size)

    return _blake2_hash_cache[algo]()

@@ -213,12 +200,10 @@ def _new_hashlib_hash(algo):
        return hashlib.new(algo)


-def _new_git_hash(base_algo, git_type, length):
-    """Initialize a digest object (as returned by python's hashlib) for the
-    requested algorithm, and feed it with the header for a git object of the
-    given type and length.
+def git_object_header(git_type: str, length: int) -> bytes:
+    """Returns the header for a git object of the given type and length.

-    The header for hashing a git object consists of:
+    The header of a git object consists of:
     - The type of the object (encoded in ASCII)
     - One ASCII space (\x20)
     - The length of the object (decimal encoded in ASCII)
@@ -233,15 +218,26 @@ def _new_git_hash(base_algo, git_type, length):
    Returns:
-        a hashutil.hash object
+        the git object header, as ASCII-encoded bytes
    """
+    git_object_types = {
+        "blob",
+        "tree",
+        "commit",
+        "tag",
+        "snapshot",
+        "raw_extrinsic_metadata",
+        "extid",
+    }

-    h = _new_hashlib_hash(base_algo)
-    git_header = "%s %d\0" % (git_type, length)
-    h.update(git_header.encode("ascii"))
+    if git_type not in git_object_types:
+        raise ValueError(
+            "Unexpected git object type %s, expected one of %s"
+            % (git_type, ", ".join(sorted(git_object_types)))
+        )

-    return h
+    return ("%s %d\0" % (git_type, length)).encode("ascii")


-def _new_hash(algo, length=None):
+def _new_hash(algo: str, length: Optional[int] = None):
    """Initialize a digest object (as returned by python's hashlib) for
    the requested algorithm. See the constant ALGORITHMS for the list
    of supported algorithms. If a git-specific hashing algorithm is
@@ -271,7 +267,9 @@ def _new_hash(algo, length=None):
        if length is None:
            raise ValueError("Missing length for git hashing algorithm")
        base_algo = algo[:-4]
-        return _new_git_hash(base_algo, "blob", length)
+        h = _new_hashlib_hash(base_algo)
+        h.update(git_object_header("blob", length))
+        return h

    return _new_hashlib_hash(algo)

@@ -289,23 +287,15 @@ def hash_git_data(data, git_type, base_algo="sha1"):
    Raises:
        ValueError if the git_type is unexpected.
    """
-
-    git_object_types = {"blob", "tree", "commit", "tag", "snapshot"}
-
-    if git_type not in git_object_types:
-        raise ValueError(
-            "Unexpected git object type %s, expected one of %s"
-            % (git_type, ", ".join(sorted(git_object_types)))
-        )
-
-    h = _new_git_hash(base_algo, git_type, len(data))
+    h = _new_hashlib_hash(base_algo)
+    h.update(git_object_header(git_type, len(data)))
    h.update(data)

    return h.digest()


 @functools.lru_cache()
-def hash_to_hex(hash):
+def hash_to_hex(hash: Union[str, bytes]) -> str:
    """Converts a hash (in hex or bytes form) to its hexadecimal ascii form

    Args:
@@ -321,7 +311,7 @@ def hash_to_hex(hash):


 @functools.lru_cache()
-def hash_to_bytehex(hash):
+def hash_to_bytehex(hash: bytes) -> bytes:
    """Converts a hash to its hexadecimal bytes representation

    Args:
@@ -334,7 +324,7 @@ def hash_to_bytehex(hash):


 @functools.lru_cache()
-def hash_to_bytes(hash):
+def hash_to_bytes(hash: Union[str, bytes]) -> bytes:
    """Converts a hash (in hex or bytes form) to its raw bytes form

    Args:
@@ -350,7 +340,7 @@ def hash_to_bytes(hash):


 @functools.lru_cache()
-def bytehex_to_hash(hex):
+def bytehex_to_hash(hex: bytes) -> bytes:
    """Converts a hexadecimal bytes representation of a hash to that hash

    Args:

--- a/swh/model/hypothesis_strategies.py
+++ b/swh/model/hypothesis_strategies.py
--- a/swh/model/merkle.py
+++ b/swh/model/merkle.py
-# Copyright (C) 2017 The Software Heritage developers
+# Copyright (C) 2017-2022 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 """Merkle tree data structure"""

-import abc
-import collections
-
-from typing import Iterator, List, Optional, Set
-
-
-def deep_update(left, right):
-    """Recursively update the left mapping with deeply nested values from the right
-    mapping.
-
-    This function is useful to merge the results of several calls to
-    :func:`MerkleNode.collect`.
-
-    Arguments:
-      left: a mapping (modified by the update operation)
-      right: a mapping
-
-    Returns:
-      the left mapping, updated with nested values from the right mapping
-
-    Example:
-        >>> a = {
-        ...     'key1': {
-        ...         'key2': {
-        ...              'key3': 'value1/2/3',
-        ...         },
-        ...     },
-        ... }
-        >>> deep_update(a, {
-        ...     'key1': {
-        ...         'key2': {
-        ...              'key4': 'value1/2/4',
-        ...         },
-        ...     },
-        ... }) == {
-        ...     'key1': {
-        ...         'key2': {
-        ...             'key3': 'value1/2/3',
-        ...             'key4': 'value1/2/4',
-        ...         },
-        ...     },
-        ... }
-        True
-        >>> deep_update(a, {
-        ...     'key1': {
-        ...         'key2': {
-        ...              'key3': 'newvalue1/2/3',
-        ...         },
-        ...     },
-        ... }) == {
-        ...     'key1': {
-        ...         'key2': {
-        ...             'key3': 'newvalue1/2/3',
-        ...             'key4': 'value1/2/4',
-        ...         },
-        ...     },
-        ... }
-        True
+from __future__ import annotations

-    """
-    for key, rvalue in right.items():
-        if isinstance(rvalue, collections.Mapping):
-            new_lvalue = deep_update(left.get(key, {}), rvalue)
-            left[key] = new_lvalue
-        else:
-            left[key] = rvalue
-    return left
+import abc
+from typing import Any, Dict, Iterator, List, Set


 class MerkleNode(dict, metaclass=abc.ABCMeta):
@@ -102,17 +39,18 @@ class MerkleNode(dict, metaclass=abc.ABCMeta):
    The collection of updated data from the tree is implemented through the
    :func:`collect` function and associated helpers.

-    Attributes:
-      data (dict): data associated to the current node
-      parents (list): known parents of the current node
-      collected (bool): whether the current node has been collected
-
    """

    __slots__ = ["parents", "data", "__hash", "collected"]

-    type = None  # type: Optional[str]  # TODO: make this an enum
-    """Type of the current node (used as a classifier for :func:`collect`)"""
+    data: Dict
+    """data associated to the current node"""
+
+    parents: List
+    """known parents of the current node"""
+
+    collected: bool
+    """whether the current node has been collected"""

    def __init__(self, data=None):
        super().__init__()
@@ -141,7 +79,7 @@ class MerkleNode(dict, metaclass=abc.ABCMeta):
        for parent in self.parents:
            parent.invalidate_hash()

-    def update_hash(self, *, force=False):
+    def update_hash(self, *, force=False) -> Any:
        """Recursively compute the hash of the current node.

        Args:
@@ -161,14 +99,17 @@ class MerkleNode(dict, metaclass=abc.ABCMeta):
        return self.__hash

    @property
-    def hash(self):
+    def hash(self) -> Any:
        """The hash of the current node, as calculated by
        :func:`compute_hash`.
        """
        return self.update_hash()

+    def __hash__(self):
+        return hash(self.hash)
+
    @abc.abstractmethod
-    def compute_hash(self):
+    def compute_hash(self) -> Any:
        """Compute the hash of the current node.

        The hash should depend on the data of the node, as well as on hashes
@@ -223,47 +164,24 @@ class MerkleNode(dict, metaclass=abc.ABCMeta):
        """
        return self.data

-    def collect_node(self, **kwargs):
-        """Collect the data for the current node, for use by :func:`collect`.
-
-        Arguments:
-          kwargs: passed as-is to :func:`get_data`.
-
-        Returns:
-          A :class:`dict` compatible with :func:`collect`.
-        """
+    def collect_node(self) -> Set[MerkleNode]:
+        """Collect the current node if not yet collected, for use by :func:`collect`."""
        if not self.collected:
            self.collected = True
-            return {self.type: {self.hash: self.get_data(**kwargs)}}
+            return {self}
        else:
-            return {}
-
-    def collect(self, **kwargs):
-        """Collect the data for all nodes in the subtree rooted at `self`.
+            return set()

-        The data is deduplicated by type and by hash.
-
-        Arguments:
-          kwargs: passed as-is to :func:`get_data`.
+    def collect(self) -> Set[MerkleNode]:
+        """Collect the added and modified nodes in the subtree rooted at `self`
+        since the last collect operation.

        Returns:
-           A :class:`dict` with the following structure::
-
-             {
-               'typeA': {
-                 node1.hash: node1.get_data(),
-                 node2.hash: node2.get_data(),
-               },
-               'typeB': {
-                 node3.hash: node3.get_data(),
-                 ...
-               },
-               ...
-             }
+           A :class:`set` of collected nodes
        """
-        ret = self.collect_node(**kwargs)
+        ret = self.collect_node()
        for child in self.values():
-            deep_update(ret, child.collect(**kwargs))
+            ret.update(child.collect())

        return ret

@@ -277,18 +195,20 @@ class MerkleNode(dict, metaclass=abc.ABCMeta):
        for child in self.values():
            child.reset_collect()

-    def iter_tree(self) -> Iterator["MerkleNode"]:
-        """Yields all children nodes, recursively. Common nodes are
-        deduplicated.
+    def iter_tree(self, dedup=True) -> Iterator[MerkleNode]:
+        """Yields this node and all its descendants, recursively. Common nodes are
+        deduplicated by default (deduplication can be turned off by setting the
+        argument 'dedup' to False).
        """
-        yield from self._iter_tree(set())
+        yield from self._iter_tree(seen=set(), dedup=dedup)

-    def _iter_tree(self, seen: Set[bytes]) -> Iterator["MerkleNode"]:
+    def _iter_tree(self, seen: Set[bytes], dedup) -> Iterator[MerkleNode]:
        if self.hash not in seen:
-            seen.add(self.hash)
+            if dedup:
+                seen.add(self.hash)
            yield self
            for child in self.values():
-                yield from child._iter_tree(seen=seen)
+                yield from child._iter_tree(seen=seen, dedup=dedup)


 class MerkleLeaf(MerkleNode):
@@ -297,7 +217,7 @@ class MerkleLeaf(MerkleNode):
    A Merkle leaf is simply a Merkle node with children disabled.
    """

-    __slots__ = []  # type: List[str]
+    __slots__: List[str] = []

    def __setitem__(self, name, child):
        raise ValueError("%s is a leaf" % self.__class__.__name__)

--- a/swh/model/model.py
+++ b/swh/model/model.py
No results found