(last updated 2020-04-28)
Scheme name: swh
Status: Provisional
Applications/protocols that use this scheme name:
Software Heritage: https://www.softwareheritage.org/
Software Package Data Exchange: https://spdx.org/
NTIA: https://www.ntia.doc.gov/SoftwareTransparency
Identifiers.org: http://identifiers.org/
Name-to-Thing (N2T): https://n2t.net/
HAL: https://hal.archives-ouvertes.fr/
Contact: Stefano Zacchiroli <zack@upsilon.cc>
Change controller: Software Heritage <info@softwareheritage.org>
References:
Scheme specification: https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
The Software Heritage project: https://www.softwareheritage.org/
The Software Heritage archive: https://archive.softwareheritage.org/
Publications:
Roberto Di Cosmo, Morane Gruenpeter, Stefano Zacchiroli. Referencing Source
Code Artifacts: a Separate Concern in Software Citation. In Computing in
Science and Engineering, volume 22, issue 2, pp. 33-43. ISSN 1521-9615,
IEEE. March 2020. DOI 10.1109/MCSE.2019.2963148
Roberto Di Cosmo, Morane Gruenpeter, Stefano Zacchiroli. Identifiers for
Digital Objects: the Case of Software Source Code Preservation. In
proceedings of iPRES 2018: 15th International Conference on Digital
Preservation. September 2018. DOI 10.17605/OSF.IO/KDE56
(file created 2020-04-28)
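For illustration, a minimal sketch of parsing an identifier in the swh scheme with the swh.model API referenced by the scheme specification above (the sample SWHID is the one used throughout the SWHID documentation; the exact attribute names are assumptions based on swh.model.swhids):

# Sketch: parsing a "swh" scheme identifier with swh.model.
from swh.model.swhids import CoreSWHID

swhid = CoreSWHID.from_string(
    "swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2"
)
print(swhid.object_type)      # ObjectType.CONTENT
print(swhid.object_id.hex())  # sha1_git of the object, as hex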
@@ -6,20 +6,15 @@ BUILD_TARGETS += $(MERKLE_DAG)
all: $(BUILD_TARGETS)
# dia exporters
%.eps: %.dia
	dia -t eps --export $@ $<
%.svg: %.dia
	dia -t svg --export $@ $<
# generic converters
%.pdf: %.eps
	epstopdf $<
	dia -e $@ $<
%.pdf: %.svg
	set -e; if [ $$(inkscape --version 2>/dev/null | grep -Eo '[0-9]+' | head -1) -gt 0 ]; then \
		inkscape -o $@ $< ; \
	else \
		inkscape -A $@ $< ; \
	fi
clean:
	-rm -f $(BUILD_TARGETS)
.. _swh-model:
Software Heritage - Development Documentation
=============================================
.. include:: README.rst
.. toctree::
:maxdepth: 2
:caption: Contents:
:caption: Overview:
:titlesonly:
data-model
persistent-identifiers
cli
Overview
--------
.. only:: standalone_package_doc
* :ref:`data-model`
* :ref:`persistent-identifiers`
Indices and tables
------------------
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
[project]
name = "swh.model"
authors = [
{name="Software Heritage developers", email="swh-devel@inria.fr"},
]
description = "Software Heritage data model"
readme = {file = "README.rst", content-type = "text/x-rst"}
requires-python = ">=3.7"
classifiers = [
"Programming Language :: Python :: 3",
"Intended Audience :: Developers",
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
"Operating System :: OS Independent",
"Development Status :: 5 - Production/Stable",
]
dynamic = ["version", "dependencies", "optional-dependencies"]
[tool.setuptools.packages.find]
include = ["swh.*"]
[tool.setuptools.dynamic]
dependencies = {file = ["requirements.txt"]}
[tool.setuptools.dynamic.optional-dependencies]
cli = {file = "requirements-cli.txt"}
testing = {file = ["requirements-cli.txt", "requirements-test.txt"]}
testing_minimal = {file = "requirements-test.txt"}
[project.entry-points.console_scripts]
"swh.identify" = "swh.model.cli:identify"
[project.entry-points."swh.cli.subcommands"]
"swh.model" = "swh.model.cli"
[project.urls]
"Homepage" = "https://gitlab.softwareheritage.org/swh/devel/swh-model"
"Bug Reports" = "https://gitlab.softwareheritage.org/swh/devel/swh-model/-/issues"
"Funding" = "https://www.softwareheritage.org/donate"
"Documentation" = "https://docs.softwareheritage.org/devel/swh-model/"
"Source" = "https://gitlab.softwareheritage.org/swh/devel/swh-model.git"
[build-system]
requires = ["setuptools", "setuptools-scm"]
build-backend = "setuptools.build_meta"
[tool.setuptools_scm]
fallback_version = "0.0.1"
[tool.black]
target-version = ['py39', 'py310', 'py311', 'py312']
[tool.isort]
multi_line_output = 3
include_trailing_comma = true
force_grid_wrap = 0
use_parentheses = true
ensure_newline_before_comments = true
line_length = 88
force_sort_within_sections = true
known_first_party = ['swh']
[tool.mypy]
namespace_packages = true
warn_unused_ignores = true
explicit_package_bases = true
# ^ Needed for mypy to detect py.typed from swh packages installed
# in editable mode
plugins = []
# 3rd party libraries without stubs (yet)
# [[tool.mypy.overrides]]
# module = [
# "package1.*",
# "package2.*",
# ]
# ignore_missing_imports = true
[tool.flake8]
select = ["C", "E", "F", "W", "B950"]
ignore = [
"E203", # whitespaces before ':' <https://github.com/psf/black/issues/315>
"E231", # missing whitespace after ','
"E501", # line too long, use B950 warning from flake8-bugbear instead
"W503" # line break before binary operator <https://github.com/psf/black/issues/52>
]
max-line-length = 88
[tool.pytest.ini_options]
addopts = "--doctest-modules -p no:pytest_swh_core"
norecursedirs = "build docs .*"
asyncio_mode = "strict"
consider_namespace_packages = true
markers = [
"requires_optional_deps: tests in test_cli.py that should not run if optional dependencies are not installed",
]
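As a sketch of how the entry points declared above are consumed at runtime (assuming Python >= 3.10 for the group keyword of entry_points; the plugin host shown here is hypothetical):

# Sketch: discovering the "swh.cli.subcommands" entry point from pyproject.toml.
from importlib.metadata import entry_points

for ep in entry_points(group="swh.cli.subcommands"):
    plugin = ep.load()  # imports the swh.model.cli module
    print(ep.name, "->", plugin.__name__)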
swh.core >= 0.3
Click
dulwich
aiohttp
click
pytest >= 8.1
pytz
types-click
types-python-dateutil
types-pytz
types-deprecated
# Add here external Python modules dependencies, one per line. Module names
# should match https://pypi.python.org/pypi names. For the full spec of
# dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html
vcversioner
Click
attrs != 21.1.0 # https://github.com/python-attrs/attrs/issues/804
attrs_strict >= 0.0.7
deprecated
hypothesis
iso8601
python-dateutil
typing_extensions
import hashlib
from setuptools import setup, find_packages
def parse_requirements():
requirements = []
for reqf in ('requirements.txt', 'requirements-swh.txt'):
with open(reqf) as f:
for line in f.readlines():
line = line.strip()
if not line or line.startswith('#'):
continue
requirements.append(line)
return requirements
extra_requirements = []
pyblake2_hash_sets = [
# Built-in implementation in Python 3.6+
{'blake2s', 'blake2b'},
# Potentially shipped by OpenSSL 1.1 (e.g. Python 3.5 in Debian stretch
# has these)
{'blake2s256', 'blake2b512'},
]
for pyblake2_hashes in pyblake2_hash_sets:
if not pyblake2_hashes - set(hashlib.algorithms_available):
# The required blake2 hashes have been found
break
else:
# None of the possible sets of blake2 hashes are available.
# use pyblake2 instead
extra_requirements.append('pyblake2')
setup(
name='swh.model',
description='Software Heritage data model',
author='Software Heritage developers',
author_email='swh-devel@inria.fr',
url='https://forge.softwareheritage.org/diffusion/DMOD/',
packages=find_packages(), # packages's modules
scripts=[], # scripts to package
install_requires=parse_requirements() + extra_requirements,
entry_points='''
[console_scripts]
swh-identify=swh.model.cli:identify
''',
setup_requires=['vcversioner'],
vcversioner={},
include_package_data=True,
)
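The for/else idiom in the blake2 detection above is easy to misread: the else branch runs only when the loop finishes without hitting break. A standalone sketch of the same check:

# Standalone illustration of the blake2 availability check in setup.py.
import hashlib

candidates = [{"blake2s", "blake2b"}, {"blake2s256", "blake2b512"}]
for names in candidates:
    if names <= set(hashlib.algorithms_available):  # all names available?
        print("native blake2 support found:", sorted(names))
        break
else:  # no break: none of the candidate sets is fully available
    print("no native blake2; pyblake2 would be added to the requirements")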
__path__ = __import__('pkgutil').extend_path(__path__, __name__)
# Copyright (C) 2018 The Software Heritage developers
# Copyright (C) 2018-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import click
import os
import sys
from typing import Callable, Dict, Iterable, Optional
from swh.model import identifiers as pids
from swh.model.exceptions import ValidationError
from swh.model.from_disk import Content, Directory
# WARNING: do not import unnecessary things here to keep cli startup time under
# control
try:
import click
except ImportError:
print(
"Cannot run swh-identify; the Click package is not installed."
"Please install 'swh.model[cli]' for full functionality.",
file=sys.stderr,
)
sys.exit(1)
try:
import swh.core.cli
class PidParamType(click.ParamType):
name = 'persistent identifier'
cli_command = swh.core.cli.swh.command
except ImportError:
# stub so that swh-identify can be used when swh-core isn't installed
cli_command = click.command
from swh.model.from_disk import Directory
from swh.model.swhids import CoreSWHID
CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
# Mapping between dulwich types and Software Heritage ones. Used by snapshot ID
# computation.
_DULWICH_TYPES = {
b"blob": "content",
b"tree": "directory",
b"commit": "revision",
b"tag": "release",
}
class CoreSWHIDParamType(click.ParamType):
"""Click argument that accepts a core SWHID and returns them as
:class:`swh.model.swhids.CoreSWHID` instances"""
name = "SWHID"
def convert(self, value, param, ctx) -> CoreSWHID:
from swh.model.exceptions import ValidationError
def convert(self, value, param, ctx):
try:
pids.parse_persistent_identifier(value)
return value # return as string, as we need just that
return CoreSWHID.from_string(value)
except ValidationError as e:
self.fail('%s is not a valid PID. %s.' % (value, e), param, ctx)
self.fail(f'"{value}" is not a valid core SWHID: {e}', param, ctx)
def swhid_of_file(path) -> CoreSWHID:
from swh.model.from_disk import Content
object = Content.from_file(path=path)
return object.swhid()
def swhid_of_file_content(data) -> CoreSWHID:
from swh.model.from_disk import Content
object = Content.from_bytes(mode=644, data=data)
return object.swhid()
def model_of_dir(
path: bytes,
exclude_patterns: Optional[Iterable[bytes]] = None,
update_info: Optional[Callable[[int], None]] = None,
) -> Directory:
from swh.model.from_disk import accept_all_paths, ignore_directories_patterns
path_filter = (
ignore_directories_patterns(path, exclude_patterns)
if exclude_patterns
else accept_all_paths
)
return Directory.from_disk(
path=path, path_filter=path_filter, progress_callback=update_info
)
def swhid_of_dir(
path: bytes, exclude_patterns: Optional[Iterable[bytes]] = None
) -> CoreSWHID:
obj = model_of_dir(path, exclude_patterns)
return obj.swhid()
def swhid_of_origin(url):
from swh.model.model import Origin
return Origin(url).swhid()
def swhid_of_git_repo(path) -> CoreSWHID:
try:
import dulwich.repo
except ImportError:
raise click.ClickException(
"Cannot compute snapshot identifier; the Dulwich package is not installed. "
"Please install 'swh.model[cli]' for full functionality.",
)
def pid_of_file(path):
object = Content.from_file(path=path).get_data()
return pids.persistent_identifier(pids.CONTENT, object)
from swh.model import hashutil
from swh.model.model import Snapshot
repo = dulwich.repo.Repo(path)
def pid_of_dir(path):
object = Directory.from_disk(path=path).get_data()
return pids.persistent_identifier(pids.DIRECTORY, object)
branches: Dict[bytes, Optional[Dict]] = {}
for ref, target in repo.refs.as_dict().items():
obj = repo[target]
if obj:
branches[ref] = {
"target": hashutil.bytehex_to_hash(target),
"target_type": _DULWICH_TYPES[obj.type_name],
}
else:
branches[ref] = None
for ref, target in repo.refs.get_symrefs().items():
branches[ref] = {
"target": target,
"target_type": "alias",
}
snapshot = {"branches": branches}
return Snapshot.from_dict(snapshot).swhid()
def identify_object(
obj_type: str, follow_symlinks: bool, exclude_patterns: Iterable[bytes], obj
) -> str:
from urllib.parse import urlparse
if obj_type == "auto":
if obj == "-" or os.path.isfile(obj):
obj_type = "content"
elif os.path.isdir(obj):
obj_type = "directory"
else:
try: # URL parsing
if urlparse(obj).scheme:
obj_type = "origin"
else:
raise ValueError
except ValueError:
raise click.BadParameter("cannot detect object type for %s" % obj)
if obj == "-":
content = sys.stdin.buffer.read()
swhid = str(swhid_of_file_content(content))
elif obj_type in ["content", "directory"]:
path = obj.encode(sys.getfilesystemencoding())
if follow_symlinks and os.path.islink(obj):
path = os.path.realpath(obj)
if obj_type == "content":
swhid = str(swhid_of_file(path))
elif obj_type == "directory":
swhid = str(swhid_of_dir(path, exclude_patterns))
elif obj_type == "origin":
swhid = str(swhid_of_origin(obj))
elif obj_type == "snapshot":
swhid = str(swhid_of_git_repo(obj))
else: # shouldn't happen, due to option validation
raise click.BadParameter("invalid object type: " + obj_type)
@click.command()
@click.option('--type', '-t', default='auto',
type=click.Choice(['auto', 'content', 'directory']),
help='type of object to identify (default: auto)')
@click.option('--verify', '-v', metavar='PID', type=PidParamType(),
help='reference identifier to be compared with computed one')
@click.argument('object',
type=click.Path(exists=True, readable=True,
allow_dash=True, path_type=bytes))
def identify(type, verify, object):
"""Compute the Software Heritage persistent identifier (PID) for a given
source code object.
# note: we return original obj instead of path here, to preserve user-given
# file name in output
return swhid
For more details about Software Heritage PIDs see:
\b
@cli_command(context_settings=CONTEXT_SETTINGS)
@click.option(
"--dereference/--no-dereference",
"follow_symlinks",
default=True,
help="follow (or not) symlinks for OBJECTS passed as arguments "
+ "(default: follow)",
)
@click.option(
"--filename/--no-filename",
"show_filename",
default=True,
help="show/hide file name (default: show)",
)
@click.option(
"--type",
"-t",
"obj_type",
default="auto",
type=click.Choice(["auto", "content", "directory", "origin", "snapshot"]),
help="type of object to identify (default: auto)",
)
@click.option(
"--exclude",
"-x",
"exclude_patterns",
metavar="PATTERN",
multiple=True,
help="Exclude directories using glob patterns \
(e.g., ``*.git`` to exclude all .git directories)",
)
@click.option(
"--verify",
"-v",
metavar="SWHID",
type=CoreSWHIDParamType(),
help="reference identifier to be compared with computed one",
)
@click.option(
"-r",
"--recursive",
is_flag=True,
help="compute SWHID recursively",
)
@click.argument("objects", nargs=-1, required=True)
def identify(
obj_type,
verify,
show_filename,
follow_symlinks,
objects,
exclude_patterns,
recursive,
):
"""Compute the Software Heritage persistent identifier (SWHID) for the given
source code object(s).
For more details about SWHIDs see:
https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
\b
Examples:
Tip: you can pass "-" to identify the content of standard input.
Examples::
\b
$ swh-identify /usr/src/linux/kernel/
$ swh identify fork.c kmod.c sched/deadline.c
swh:1:cnt:2e391c754ae730bd2d8520c2ab497c403220c6e3 fork.c
swh:1:cnt:0277d1216f80ae1adeed84a686ed34c9b2931fc2 kmod.c
swh:1:cnt:57b939c81bce5d06fa587df8915f05affbe22b82 sched/deadline.c
$ swh identify --no-filename /usr/src/linux/kernel/
swh:1:dir:f9f858a48d663b3809c9e2f336412717496202ab
\b
$ swh-identify /usr/src/linux/kernel/sched/deadline.c
swh:1:cnt:57b939c81bce5d06fa587df8915f05affbe22b82
$ git clone --mirror https://forge.softwareheritage.org/source/helloworld.git
$ swh identify --type snapshot helloworld.git/
swh:1:snp:510aa88bdc517345d258c1fc2babcd0e1f905e93 helloworld.git
"""
if type == 'auto':
if os.path.isfile(object):
type = 'content'
elif os.path.isdir(object):
type = 'directory'
else: # shouldn't happen, due to path validation
raise click.BadParameter('%s is neither a file nor a directory' %
object)
pid = None
if type == 'content':
pid = pid_of_file(object)
elif type == 'directory':
pid = pid_of_dir(object)
else: # shouldn't happen, due to option validation
raise click.BadParameter('invalid object type: ' + type)
from functools import partial
import logging
if verify:
if verify == pid:
click.echo('PID match: %s' % pid)
sys.exit(0)
else:
click.echo('PID mismatch: %s != %s' % (verify, pid))
sys.exit(1)
if exclude_patterns:
exclude_patterns = set(pattern.encode() for pattern in exclude_patterns)
if verify and len(objects) != 1:
raise click.BadParameter("verification requires a single object")
if recursive and not os.path.isdir(objects[0]):
recursive = False
logging.warning("recursive option disabled, input is not a directory object")
if recursive:
if verify:
raise click.BadParameter(
"verification of recursive object identification is not supported"
)
if obj_type not in ("auto", "directory"):
raise click.BadParameter(
"recursive identification is supported only for directories"
)
path = os.fsencode(objects[0])
dir_obj = model_of_dir(path, exclude_patterns)
for sub_obj in dir_obj.iter_tree():
path_name = "path" if "path" in sub_obj.data.keys() else "data"
path = os.fsdecode(sub_obj.data[path_name])
swhid = str(sub_obj.swhid())
msg = f"{swhid}\t{path}" if show_filename else f"{swhid}"
click.echo(msg)
else:
click.echo(pid)
results = zip(
objects,
map(
partial(identify_object, obj_type, follow_symlinks, exclude_patterns),
objects,
),
)
if verify:
swhid = next(results)[1]
if str(verify) == swhid:
click.echo("SWHID match: %s" % swhid)
sys.exit(0)
else:
click.echo("SWHID mismatch: %s != %s" % (verify, swhid))
sys.exit(1)
else:
for obj, swhid in results:
msg = swhid
if show_filename:
msg = "%s\t%s" % (swhid, os.fsdecode(obj))
click.echo(msg)
if __name__ == '__main__':
if __name__ == "__main__":
identify()
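A quick programmatic sketch of the content helper defined above; the expected output assumes content SWHIDs are computed like git blob hashes (as the SWHID specification documents), so the digest below is the well-known git hash of b"hello\n":

# Sketch: using swhid_of_file_content outside the CLI.
from swh.model.cli import swhid_of_file_content

print(swhid_of_file_content(b"hello\n"))
# expected: swh:1:cnt:ce013625030ba8dba906f756967f9e9ca394464a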
# Copyright (C) 2020-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from __future__ import annotations
"""Utility data structures."""
from collections.abc import Mapping
import copy
from typing import Dict, Generic, Iterable, Optional, Tuple, TypeVar, Union
KT = TypeVar("KT")
VT = TypeVar("VT")
class ImmutableDict(Mapping, Generic[KT, VT]):
"""A frozen dictionary.
This class behaves like a dictionary, but internally stores objects in a tuple,
so it is both immutable and hashable."""
_data: Dict[KT, VT]
def __init__(
self,
data: Union[Iterable[Tuple[KT, VT]], ImmutableDict[KT, VT], Dict[KT, VT]] = {},
):
if isinstance(data, dict):
self._data = data
elif isinstance(data, ImmutableDict):
self._data = data._data
else:
self._data = {k: v for k, v in data}
@property
def data(self):
return tuple(self._data.items())
def __repr__(self):
return f"ImmutableDict({dict(self.data)!r})"
def __getitem__(self, key):
return self._data[key]
def __iter__(self):
for k, v in self.data:
yield k
def __len__(self):
return len(self._data)
def items(self):
yield from self.data
def __hash__(self):
return hash(tuple(sorted(self.data)))
def copy_pop(self, popped_key) -> Tuple[Optional[VT], ImmutableDict[KT, VT]]:
"""Returns a copy of this ImmutableDict without the given key,
as well as the value associated to the key."""
new_items = copy.deepcopy(self._data)
popped_value: Optional[VT] = new_items.pop(popped_key, None)
return (popped_value, ImmutableDict(new_items))
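A short usage sketch of ImmutableDict as defined above:

# Usage sketch for ImmutableDict.
d = ImmutableDict({"a": 1, "b": 2})
assert d["a"] == 1
# __hash__ sorts the items first, so insertion order does not matter:
assert hash(d) == hash(ImmutableDict((("b", 2), ("a", 1))))
value, rest = d.copy_pop("a")
assert value == 1 and dict(rest) == {"b": 2}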
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Primitives for finding unknown content efficiently."""
from __future__ import annotations
from collections import namedtuple
import itertools
import logging
import random
from typing import (
Any,
Callable,
Iterable,
List,
Mapping,
NamedTuple,
Optional,
Set,
Union,
)
from typing_extensions import Protocol, runtime_checkable
from .from_disk import model
from .model import Sha1Git
logger = logging.getLogger(__name__)
# Maximum amount when sampling from the undecided set of directory entries
SAMPLE_SIZE = 1000
# Sets of sha1 of contents, skipped contents and directories respectively
Sample: NamedTuple = namedtuple(
"Sample", ["contents", "skipped_contents", "directories"]
)
@runtime_checkable
class ArchiveDiscoveryInterface(Protocol):
"""Interface used in discovery code to abstract over ways of connecting to
the SWH archive (direct storage, web API, etc.) for all methods needed by
discovery algorithms."""
contents: List[model.Content]
skipped_contents: List[model.SkippedContent]
directories: List[model.Directory]
def __init__(
self,
contents: List[model.Content],
skipped_contents: List[model.SkippedContent],
directories: List[model.Directory],
) -> None:
self.contents = contents
self.skipped_contents = skipped_contents
self.directories = directories
def content_missing(self, contents: List[Sha1Git]) -> Iterable[Sha1Git]:
"""List content missing from the archive by sha1"""
def skipped_content_missing(
self, skipped_contents: List[Sha1Git]
) -> Iterable[Sha1Git]:
"""List skipped content missing from the archive by sha1"""
def directory_missing(self, directories: List[Sha1Git]) -> Iterable[Sha1Git]:
"""List directories missing from the archive by sha1"""
class BaseDiscoveryGraph:
"""Creates the base structures and methods needed for discovery algorithms.
Subclasses should override ``get_sample`` to affect how the discovery is made.
The `update_info_callback` is an optional argument that will get called for
each new piece of information we get. The callback arguments are `(content,
known)`.
- content: the relevant model.Content object,
- known: a boolean, True if the file is known to the archive, False otherwise.
"""
def __init__(
self,
contents,
skipped_contents,
directories,
update_info_callback: Optional[Callable[[Any, bool], None]] = None,
):
self._all_contents: Mapping[
Sha1Git, Union[model.Content, model.SkippedContent]
] = {}
self._undecided_directories: Set[Sha1Git] = set()
self._children: Mapping[Sha1Git, Set[Sha1Git]] = {}
self._parents: Mapping[model.DirectoryEntry, Set[Any]] = {}
self.undecided: Set[Sha1Git] = set()
for content in itertools.chain(contents, skipped_contents):
self.undecided.add(content.sha1_git)
self._all_contents[content.sha1_git] = content
for directory in directories:
self.undecided.add(directory.id)
self._undecided_directories.add(directory.id)
self._children[directory.id] = {c.target for c in directory.entries}
for child in directory.entries:
self._parents.setdefault(child.target, set()).add(directory.id)
self.undecided |= self._undecided_directories
self.known: Set[Sha1Git] = set()
self.unknown: Set[Sha1Git] = set()
self._update_info_callback = update_info_callback
self._sha1_to_obj = {}
for content in itertools.chain(contents, skipped_contents):
self._sha1_to_obj[content.sha1_git] = content
for directory in directories:
self._sha1_to_obj[directory.id] = directory
def mark_known(self, entries: Iterable[Sha1Git]):
"""Mark ``entries`` and those they imply as known in the SWH archive"""
self._mark_entries(entries, self._children, self.known)
def mark_unknown(self, entries: Iterable[Sha1Git]):
"""Mark ``entries`` and those they imply as unknown in the SWH archive"""
self._mark_entries(entries, self._parents, self.unknown)
def _mark_entries(
self,
entries: Iterable[Sha1Git],
transitive_mapping: Mapping[Any, Any],
target_set: Set[Any],
):
"""Use Merkle graph properties to mark a directory entry as known or unknown.
If an entry is known, then all of its descendants are known. If it's
unknown, then all of its ancestors are unknown.
- ``entries``: directory entries to mark along with their ancestors/descendants
where applicable.
- ``transitive_mapping``: mapping from an entry to the next entries to mark
in the hierarchy, if any.
- ``target_set``: set where marked entries will be added.
"""
callback = self._update_info_callback
to_process = set(entries)
while to_process:
current = to_process.pop()
target_set.add(current)
new = current in self.undecided
self.undecided.discard(current)
self._undecided_directories.discard(current)
next_entries = transitive_mapping.get(current, set()) & self.undecided
to_process.update(next_entries)
if new and callback is not None:
obj = self._sha1_to_obj[current]
callback(obj, current in self.known)
def get_sample(
self,
) -> Sample:
"""Return a three-tuple of samples from the undecided sets of contents,
skipped contents and directories respectively.
These samples will be queried against the storage which will tell us
which are known."""
raise NotImplementedError()
def do_query(self, archive: ArchiveDiscoveryInterface, sample: Sample) -> None:
"""Given a three-tuple of samples, ask the archive which are known or
unknown and mark them as such."""
methods = (
archive.content_missing,
archive.skipped_content_missing,
archive.directory_missing,
)
for sample_per_type, method in zip(sample, methods):
if not sample_per_type:
continue
known = set(sample_per_type)
unknown = set(method(list(sample_per_type)))
known -= unknown
self.mark_known(known)
self.mark_unknown(unknown)
class RandomDirSamplingDiscoveryGraph(BaseDiscoveryGraph):
"""Use a random sampling using only directories.
This allows us to find a statistically good spread of entries in the graph
with a smaller population than using all types of entries. When there are
no more directories, only contents or skipped contents are undecided if any
are left: we send them directly to the storage since they should be few and
their structure flat."""
def get_sample(self) -> Sample:
if self._undecided_directories:
if len(self._undecided_directories) <= SAMPLE_SIZE:
return Sample(
contents=set(),
skipped_contents=set(),
directories=set(self._undecided_directories),
)
sample = random.sample(tuple(self._undecided_directories), SAMPLE_SIZE)
directories = {o for o in sample}
return Sample(
contents=set(), skipped_contents=set(), directories=directories
)
contents = set()
skipped_contents = set()
for sha1 in self.undecided:
obj = self._all_contents[sha1]
obj_type = obj.object_type
if obj_type == model.Content.object_type:
contents.add(sha1)
elif obj_type == model.SkippedContent.object_type:
skipped_contents.add(sha1)
else:
raise TypeError(f"Unexpected object type {obj_type}")
return Sample(
contents=contents, skipped_contents=skipped_contents, directories=set()
)
def filter_known_objects(
archive: ArchiveDiscoveryInterface,
update_info_callback: Optional[Callable[[Any, bool], None]] = None,
):
"""Filter ``archive``'s ``contents``, ``skipped_contents`` and ``directories``
to only return those that are unknown to the SWH archive using a discovery
algorithm.
The `update_info_callback` is an optional argument that will get called for
each new piece of information we get. The callback arguments are `(content,
known)`.
- content: the relevant model.Content object,
- known: a boolean, True if the file is known to the archive, False otherwise.
"""
contents = archive.contents
skipped_contents = archive.skipped_contents
directories = archive.directories
contents_count = len(contents)
skipped_contents_count = len(skipped_contents)
directories_count = len(directories)
graph = RandomDirSamplingDiscoveryGraph(
contents,
skipped_contents,
directories,
update_info_callback=update_info_callback,
)
while graph.undecided:
sample = graph.get_sample()
graph.do_query(archive, sample)
contents = [c for c in contents if c.sha1_git in graph.unknown]
skipped_contents = [c for c in skipped_contents if c.sha1_git in graph.unknown]
directories = [c for c in directories if c.id in graph.unknown]
logger.debug(
"Filtered out %d contents, %d skipped contents and %d directories",
contents_count - len(contents),
skipped_contents_count - len(skipped_contents),
directories_count - len(directories),
)
return (contents, skipped_contents, directories)
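To make the control flow concrete, here is a sketch running the filter against a stub archive that reports every object as missing (FakeArchive is hypothetical; it only provides the attributes and *_missing methods that ArchiveDiscoveryInterface requires):

# Sketch: exercising filter_known_objects with a stub archive.
from swh.model.model import Content

class FakeArchive:
    """Stub archive that knows nothing: every queried id is reported missing."""

    def __init__(self, contents, skipped_contents, directories):
        self.contents = contents
        self.skipped_contents = skipped_contents
        self.directories = directories

    def content_missing(self, contents):
        return iter(contents)

    def skipped_content_missing(self, skipped_contents):
        return iter(skipped_contents)

    def directory_missing(self, directories):
        return iter(directories)

archive = FakeArchive([Content.from_data(b"hello\n")], [], [])
contents, skipped_contents, directories = filter_known_objects(archive)
assert len(contents) == 1  # nothing is known, so nothing gets filtered out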
@@ -33,11 +33,12 @@
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
NON_FIELD_ERRORS = '__all__'
NON_FIELD_ERRORS = "__all__"
class ValidationError(Exception):
"""An error while validating data."""
def __init__(self, message, code=None, params=None):
"""
The `message` argument can be a single error, a list of errors, or a
@@ -54,16 +55,15 @@ class ValidationError(Exception):
message = message[0]
if isinstance(message, ValidationError):
if hasattr(message, 'error_dict'):
if hasattr(message, "error_dict"):
message = message.error_dict
# PY2 has a `message` property which is always there so we can't
# duck-type on it. It was introduced in Python 2.5 and already
# deprecated in Python 2.6.
elif not hasattr(message, 'message'):
elif not hasattr(message, "message"):
message = message.error_list
else:
message, code, params = (message.message, message.code,
message.params)
message, code, params = (message.message, message.code, message.params)
if isinstance(message, dict):
self.error_dict = {}
@@ -78,9 +78,8 @@ class ValidationError(Exception):
# Normalize plain strings to instances of ValidationError.
if not isinstance(message, ValidationError):
message = ValidationError(message)
if hasattr(message, 'error_dict'):
self.error_list.extend(sum(message.error_dict.values(),
[]))
if hasattr(message, "error_dict"):
self.error_list.extend(sum(message.error_dict.values(), []))
else:
self.error_list.extend(message.error_list)
@@ -94,18 +93,18 @@ class ValidationError(Exception):
def message_dict(self):
# Trigger an AttributeError if this ValidationError
# doesn't have an error_dict.
getattr(self, 'error_dict')
getattr(self, "error_dict")
return dict(self)
@property
def messages(self):
if hasattr(self, 'error_dict'):
if hasattr(self, "error_dict"):
return sum(dict(self).values(), [])
return list(self)
def update_error_dict(self, error_dict):
if hasattr(self, 'error_dict'):
if hasattr(self, "error_dict"):
for field, error_list in self.error_dict.items():
error_dict.setdefault(field, []).extend(error_list)
else:
@@ -113,7 +112,7 @@ class ValidationError(Exception):
return error_dict
def __iter__(self):
if hasattr(self, 'error_dict'):
if hasattr(self, "error_dict"):
for field, errors in self.error_dict.items():
yield field, list(ValidationError(errors))
else:
@@ -124,9 +123,13 @@ class ValidationError(Exception):
yield message
def __str__(self):
if hasattr(self, 'error_dict'):
if hasattr(self, "error_dict"):
return repr(dict(self))
return repr(list(self))
def __repr__(self):
return 'ValidationError(%s)' % self
return "ValidationError(%s)" % self
class InvalidDirectoryPath(Exception):
pass
@@ -6,8 +6,13 @@
# We do our imports here but we don't use them, so flake8 complains
# flake8: noqa
from .simple import (validate_type, validate_int, validate_str, validate_bytes,
validate_datetime, validate_enum)
from .hashes import (validate_sha1, validate_sha1_git, validate_sha256)
from .compound import (validate_against_schema, validate_all_keys,
validate_any_key)
from .compound import validate_against_schema, validate_all_keys, validate_any_key
from .hashes import validate_sha1, validate_sha1_git, validate_sha256
from .simple import (
validate_bytes,
validate_datetime,
validate_enum,
validate_int,
validate_str,
validate_type,
)
@@ -6,7 +6,7 @@
from collections import defaultdict
import itertools
from ..exceptions import ValidationError, NON_FIELD_ERRORS
from ..exceptions import NON_FIELD_ERRORS, ValidationError
def validate_against_schema(model, schema, value):
@@ -26,19 +26,19 @@ def validate_against_schema(model, schema, value):
if not isinstance(value, dict):
raise ValidationError(
'Unexpected type %(type)s for %(model)s, expected dict',
"Unexpected type %(type)s for %(model)s, expected dict",
params={
'model': model,
'type': value.__class__.__name__,
"model": model,
"type": value.__class__.__name__,
},
code='model-unexpected-type',
code="model-unexpected-type",
)
errors = defaultdict(list)
for key, (mandatory, validators) in itertools.chain(
((k, v) for k, v in schema.items() if k != NON_FIELD_ERRORS),
[(NON_FIELD_ERRORS, (False, schema.get(NON_FIELD_ERRORS, [])))]
[(NON_FIELD_ERRORS, (False, schema.get(NON_FIELD_ERRORS, [])))],
):
if not validators:
continue
@@ -54,9 +54,9 @@ def validate_against_schema(model, schema, value):
if mandatory:
errors[key].append(
ValidationError(
'Field %(field)s is mandatory',
params={'field': key},
code='model-field-mandatory',
"Field %(field)s is mandatory",
params={"field": key},
code="model-field-mandatory",
)
)
@@ -74,19 +74,21 @@ def validate_against_schema(model, schema, value):
else:
if not valid:
errdata = {
'validator': validator.__name__,
"validator": validator.__name__,
}
if key == NON_FIELD_ERRORS:
errmsg = 'Validation of model %(model)s failed in ' \
'%(validator)s'
errdata['model'] = model
errcode = 'model-validation-failed'
errmsg = (
"Validation of model %(model)s failed in " "%(validator)s"
)
errdata["model"] = model
errcode = "model-validation-failed"
else:
errmsg = 'Validation of field %(field)s failed in ' \
'%(validator)s'
errdata['field'] = key
errcode = 'field-validation-failed'
errmsg = (
"Validation of field %(field)s failed in " "%(validator)s"
)
errdata["field"] = key
errcode = "field-validation-failed"
errors[key].append(
ValidationError(errmsg, params=errdata, code=errcode)
@@ -102,11 +104,11 @@ def validate_all_keys(value, keys):
"""Validate that all the given keys are present in value"""
missing_keys = set(keys) - set(value)
if missing_keys:
missing_fields = ', '.join(sorted(missing_keys))
missing_fields = ", ".join(sorted(missing_keys))
raise ValidationError(
'Missing mandatory fields %(missing_fields)s',
params={'missing_fields': missing_fields},
code='missing-mandatory-field'
"Missing mandatory fields %(missing_fields)s",
params={"missing_fields": missing_fields},
code="missing-mandatory-field",
)
return True
@@ -116,11 +118,11 @@ def validate_any_key(value, keys):
"""Validate that any of the given keys is present in value"""
present_keys = set(keys) & set(value)
if not present_keys:
missing_fields = ', '.join(sorted(keys))
missing_fields = ", ".join(sorted(keys))
raise ValidationError(
'Must contain one of the alternative fields %(missing_fields)s',
params={'missing_fields': missing_fields},
code='missing-alternative-field',
"Must contain one of the alternative fields %(missing_fields)s",
params={"missing_fields": missing_fields},
code="missing-alternative-field",
)
return True
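From the loop at the top of validate_against_schema above, a schema maps each field name to a (mandatory, validators) pair. A hedged sketch, assuming each validator takes the field value as its only argument (as the single-value validators in this package do):

# Hypothetical sketch of the schema shape consumed by validate_against_schema.
from swh.model.fields import validate_against_schema, validate_int, validate_str

schema = {
    "name": (True, [validate_str]),
    "age": (False, [validate_int]),
}
validate_against_schema("person", schema, {"name": "Ada", "age": 36})  # no error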
@@ -4,6 +4,7 @@
# See top-level LICENSE file for more information
import string
from ..exceptions import ValidationError
@@ -22,22 +23,22 @@ def validate_hash(value, hash_type):
"""
hash_lengths = {
'sha1': 20,
'sha1_git': 20,
'sha256': 32,
"sha1": 20,
"sha1_git": 20,
"sha256": 32,
}
hex_digits = set(string.hexdigits)
if hash_type not in hash_lengths:
raise ValidationError(
'Unexpected hash type %(hash_type)s, expected one of'
' %(hash_types)s',
"Unexpected hash type %(hash_type)s, expected one of" " %(hash_types)s",
params={
'hash_type': hash_type,
'hash_types': ', '.join(sorted(hash_lengths)),
"hash_type": hash_type,
"hash_types": ", ".join(sorted(hash_lengths)),
},
code='unexpected-hash-type')
code="unexpected-hash-type",
)
if isinstance(value, str):
errors = []
@@ -48,10 +49,10 @@ def validate_hash(value, hash_type):
"Unexpected characters `%(unexpected_chars)s' for hash "
"type %(hash_type)s",
params={
'unexpected_chars': ', '.join(sorted(extra_chars)),
'hash_type': hash_type,
"unexpected_chars": ", ".join(sorted(extra_chars)),
"hash_type": hash_type,
},
code='unexpected-hash-contents',
code="unexpected-hash-contents",
)
)
@@ -60,14 +61,14 @@ def validate_hash(value, hash_type):
if length != expected_length:
errors.append(
ValidationError(
'Unexpected length %(length)d for hash type '
'%(hash_type)s, expected %(expected_length)d',
"Unexpected length %(length)d for hash type "
"%(hash_type)s, expected %(expected_length)d",
params={
'length': length,
'expected_length': expected_length,
'hash_type': hash_type,
"length": length,
"expected_length": expected_length,
"hash_type": hash_type,
},
code='unexpected-hash-length',
code="unexpected-hash-length",
)
)
@@ -81,37 +82,37 @@ def validate_hash(value, hash_type):
expected_length = hash_lengths[hash_type]
if length != expected_length:
raise ValidationError(
'Unexpected length %(length)d for hash type '
'%(hash_type)s, expected %(expected_length)d',
"Unexpected length %(length)d for hash type "
"%(hash_type)s, expected %(expected_length)d",
params={
'length': length,
'expected_length': expected_length,
'hash_type': hash_type,
"length": length,
"expected_length": expected_length,
"hash_type": hash_type,
},
code='unexpected-hash-length',
code="unexpected-hash-length",
)
return True
raise ValidationError(
'Unexpected type %(type)s for hash, expected str or bytes',
"Unexpected type %(type)s for hash, expected str or bytes",
params={
'type': value.__class__.__name__,
"type": value.__class__.__name__,
},
code='unexpected-hash-value-type',
code="unexpected-hash-value-type",
)
def validate_sha1(sha1):
"""Validate that sha1 is a valid sha1 hash"""
return validate_hash(sha1, 'sha1')
return validate_hash(sha1, "sha1")
def validate_sha1_git(sha1_git):
"""Validate that sha1_git is a valid sha1_git hash"""
return validate_hash(sha1_git, 'sha1_git')
return validate_hash(sha1_git, "sha1_git")
def validate_sha256(sha256):
"""Validate that sha256 is a valid sha256 hash"""
return validate_hash(sha256, 'sha256')
return validate_hash(sha256, "sha256")
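A short sketch of the validators above; the sample digest is the sha1 of the empty string, i.e. 40 hex characters for a 20-byte hash:

# Sketch: validate_sha1 accepts well-formed digests and raises otherwise.
from swh.model.exceptions import ValidationError
from swh.model.fields import validate_sha1

assert validate_sha1("da39a3ee5e6b4b0d3255bfef95601890afd80709")
try:
    validate_sha1("not-a-hash")
except ValidationError as e:
    print("rejected:", e)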
@@ -13,16 +13,16 @@ def validate_type(value, type):
"""Validate that value is an integer"""
if not isinstance(value, type):
if isinstance(type, tuple):
typestr = 'one of %s' % ', '.join(typ.__name__ for typ in type)
typestr = "one of %s" % ", ".join(typ.__name__ for typ in type)
else:
typestr = type.__name__
raise ValidationError(
'Unexpected type %(type)s, expected %(expected_type)s',
"Unexpected type %(type)s, expected %(expected_type)s",
params={
'type': value.__class__.__name__,
'expected_type': typestr,
"type": value.__class__.__name__,
"expected_type": typestr,
},
code='unexpected-type'
code="unexpected-type",
)
return True
@@ -54,10 +54,12 @@ def validate_datetime(value):
errors.append(e)
if isinstance(value, datetime.datetime) and value.tzinfo is None:
errors.append(ValidationError(
'Datetimes must be timezone-aware in swh',
code='datetime-without-tzinfo',
))
errors.append(
ValidationError(
"Datetimes must be timezone-aware in swh",
code="datetime-without-tzinfo",
)
)
if errors:
raise ValidationError(errors)
@@ -69,12 +71,12 @@ def validate_enum(value, expected_values):
"""Validate that value is contained in expected_values"""
if value not in expected_values:
raise ValidationError(
'Unexpected value %(value)s, expected one of %(expected_values)s',
"Unexpected value %(value)s, expected one of %(expected_values)s",
params={
'value': value,
'expected_values': ', '.join(sorted(expected_values)),
"value": value,
"expected_values": ", ".join(sorted(expected_values)),
},
code='unexpected-value',
code="unexpected-value",
)
return True
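And a one-line sketch for the enum validator above:

# validate_enum returns True on success and raises ValidationError otherwise.
from swh.model.fields import validate_enum

assert validate_enum("visible", {"visible", "hidden"})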