Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Showing 2076 additions and 1124 deletions.
[flake8]
# E203: whitespaces before ':' <https://github.com/psf/black/issues/315>
# E231: missing whitespace after ','
# W503: line break before binary operator <https://github.com/psf/black/issues/52>
ignore = E203,E231,W503
max-line-length = 88
#!/usr/bin/env python3
# Copyright (C) 2015-2020  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from io import open
from os import path

from setuptools import find_packages, setup

here = path.abspath(path.dirname(__file__))

# Get the long description from the README file
with open(path.join(here, "README.md"), encoding="utf-8") as f:
    long_description = f.read()


def parse_requirements(name=None):
    if name:
        reqf = "requirements-%s.txt" % name
    else:
        reqf = "requirements.txt"

    requirements = []
    if not path.exists(reqf):
        return requirements

    with open(reqf) as f:
        for line in f.readlines():
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            requirements.append(line)
    return requirements


setup(
    name="swh.model",
    description="Software Heritage data model",
    long_description=long_description,
    long_description_content_type="text/markdown",
    python_requires=">=3.7",
    author="Software Heritage developers",
    author_email="swh-devel@inria.fr",
    url="https://forge.softwareheritage.org/diffusion/DMOD/",
    packages=find_packages(),
    setup_requires=["setuptools-scm"],
    use_scm_version=True,
    install_requires=parse_requirements() + parse_requirements("swh"),
    extras_require={
        "cli": parse_requirements("cli"),
        "testing-minimal": parse_requirements("test"),
        "testing": parse_requirements("test") + parse_requirements("cli"),
    },
    include_package_data=True,
    entry_points="""
        [console_scripts]
        swh-identify=swh.model.cli:identify
        [swh.cli.subcommands]
        identify=swh.model.cli
    """,
    classifiers=[
        "Programming Language :: Python :: 3",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
        "Operating System :: OS Independent",
        "Development Status :: 5 - Production/Stable",
    ],
    project_urls={
        "Bug Reports": "https://forge.softwareheritage.org/maniphest",
        "Funding": "https://www.softwareheritage.org/donate",
        "Source": "https://forge.softwareheritage.org/source/swh-model",
        "Documentation": "https://docs.softwareheritage.org/devel/swh-model/",
    },
)
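As a quick illustration of the requirements-file convention that parse_requirements() expects (one requirement per line, comments and blank lines skipped), here is a doctest-style sketch; it assumes it runs in the same scope as the setup.py above, and the file name and pins are made up:

from pathlib import Path

# Hypothetical requirements file, only to exercise the parser
Path("requirements-demo.txt").write_text("# a comment\n\nclick\nattrs >= 21.0\n")

assert parse_requirements("demo") == ["click", "attrs >= 21.0"]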
from pkgutil import extend_path

__path__ = extend_path(__path__, __name__)
@@ -5,7 +5,7 @@
 import os
 import sys

-from typing import Dict, Iterable, Optional
+from typing import Callable, Dict, Iterable, Optional

 # WARNING: do not import unnecessary things here to keep cli startup time under
 # control
@@ -20,10 +20,12 @@ except ImportError:
     exit(1)

 try:
-    from swh.core.cli import swh as swh_cli_group
+    import swh.core.cli
+
+    cli_command = swh.core.cli.swh.command
 except ImportError:
     # stub so that swh-identify can be used when swh-core isn't installed
-    swh_cli_group = click  # type: ignore
+    cli_command = click.command

 from swh.model.from_disk import Directory
 from swh.model.swhids import CoreSWHID
@@ -42,7 +44,7 @@ _DULWICH_TYPES = {
 class CoreSWHIDParamType(click.ParamType):
     """Click argument that accepts a core SWHID and returns them as
-    :class:`swh.model.swhids.CoreSWHID` instances """
+    :class:`swh.model.swhids.CoreSWHID` instances"""

     name = "SWHID"
@@ -69,19 +71,27 @@ def swhid_of_file_content(data) -> CoreSWHID:
     return object.swhid()


-def model_of_dir(path: bytes, exclude_patterns: Iterable[bytes] = None) -> Directory:
-    from swh.model.from_disk import accept_all_directories, ignore_directories_patterns
+def model_of_dir(
+    path: bytes,
+    exclude_patterns: Optional[Iterable[bytes]] = None,
+    update_info: Optional[Callable[[int], None]] = None,
+) -> Directory:
+    from swh.model.from_disk import accept_all_paths, ignore_directories_patterns

-    dir_filter = (
+    path_filter = (
         ignore_directories_patterns(path, exclude_patterns)
         if exclude_patterns
-        else accept_all_directories
+        else accept_all_paths
     )

-    return Directory.from_disk(path=path, dir_filter=dir_filter)
+    return Directory.from_disk(
+        path=path, path_filter=path_filter, progress_callback=update_info
+    )


-def swhid_of_dir(path: bytes, exclude_patterns: Iterable[bytes] = None) -> CoreSWHID:
+def swhid_of_dir(
+    path: bytes, exclude_patterns: Optional[Iterable[bytes]] = None
+) -> CoreSWHID:
     obj = model_of_dir(path, exclude_patterns)
     return obj.swhid()
@@ -170,7 +180,7 @@ def identify_object(
     return swhid


-@swh_cli_group.command(context_settings=CONTEXT_SETTINGS)
+@cli_command(context_settings=CONTEXT_SETTINGS)
 @click.option(
     "--dereference/--no-dereference",
     "follow_symlinks",
@@ -209,7 +219,10 @@ def identify_object(
     help="reference identifier to be compared with computed one",
 )
 @click.option(
-    "-r", "--recursive", is_flag=True, help="compute SWHID recursively",
+    "-r",
+    "--recursive",
+    is_flag=True,
+    help="compute SWHID recursively",
 )
 @click.argument("objects", nargs=-1, required=True)
 def identify(
@@ -226,30 +239,26 @@ def identify(
     For more details about SWHIDs see:

     \b
     https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html

     Tip: you can pass "-" to identify the content of standard input.

     \b
     Examples::

     \b
       $ swh identify fork.c kmod.c sched/deadline.c
       swh:1:cnt:2e391c754ae730bd2d8520c2ab497c403220c6e3    fork.c
       swh:1:cnt:0277d1216f80ae1adeed84a686ed34c9b2931fc2    kmod.c
       swh:1:cnt:57b939c81bce5d06fa587df8915f05affbe22b82    sched/deadline.c

     \b
       $ swh identify --no-filename /usr/src/linux/kernel/
       swh:1:dir:f9f858a48d663b3809c9e2f336412717496202ab

     \b
       $ git clone --mirror https://forge.softwareheritage.org/source/helloworld.git
       $ swh identify --type snapshot helloworld.git/
       swh:1:snp:510aa88bdc517345d258c1fc2babcd0e1f905e93    helloworld.git

-    """  # NoQA  # overlong lines in shell examples are fine
+    """
     from functools import partial
     import logging
@@ -300,7 +309,7 @@ def identify(
             click.echo("SWHID mismatch: %s != %s" % (verify, swhid))
             sys.exit(1)
     else:
-        for (obj, swhid) in results:
+        for obj, swhid in results:
             msg = swhid
             if show_filename:
                 msg = "%s\t%s" % (swhid, os.fsdecode(obj))
......
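A minimal sketch of how the reworked CLI helpers above compose, assuming a swh.model build that has the new path_filter/progress_callback API; the directory path, glob pattern, and the exact meaning of the callback's count are illustrative assumptions, not part of the diff:

from swh.model.cli import model_of_dir, swhid_of_dir

def progress(count: int) -> None:
    # assumed semantics: number of entries processed since the last call
    print(f"+{count} entries")

tree = model_of_dir(b"/srv/src", exclude_patterns=[b"*.git"], update_info=progress)
print(tree.swhid())               # swh:1:dir:... with the exclusions applied
print(swhid_of_dir(b"/srv/src"))  # directory SWHID without exclusions or progress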
-# Copyright (C) 2020  The Software Heritage developers
+# Copyright (C) 2020-2023  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

+from __future__ import annotations
+
 """Utility data structures."""

 from collections.abc import Mapping
+import copy
 from typing import Dict, Generic, Iterable, Optional, Tuple, TypeVar, Union

 KT = TypeVar("KT")
@@ -18,36 +21,35 @@ class ImmutableDict(Mapping, Generic[KT, VT]):
     This class behaves like a dictionary, but internally stores objects in a tuple,
     so it is both immutable and hashable."""

-    data: Tuple[Tuple[KT, VT], ...]
+    _data: Dict[KT, VT]

     def __init__(
         self,
-        data: Union[
-            Iterable[Tuple[KT, VT]], "ImmutableDict[KT, VT]", Dict[KT, VT]
-        ] = {},
+        data: Union[Iterable[Tuple[KT, VT]], ImmutableDict[KT, VT], Dict[KT, VT]] = {},
     ):
         if isinstance(data, dict):
-            self.data = tuple(item for item in data.items())
+            self._data = data
         elif isinstance(data, ImmutableDict):
-            self.data = data.data
+            self._data = data._data
         else:
-            self.data = tuple(data)
+            self._data = {k: v for k, v in data}

+    @property
+    def data(self):
+        return tuple(self._data.items())

     def __repr__(self):
         return f"ImmutableDict({dict(self.data)!r})"

     def __getitem__(self, key):
-        for (k, v) in self.data:
-            if k == key:
-                return v
-        raise KeyError(key)
+        return self._data[key]

     def __iter__(self):
-        for (k, v) in self.data:
+        for k, v in self.data:
             yield k

     def __len__(self):
-        return len(self.data)
+        return len(self._data)

     def items(self):
         yield from self.data
@@ -55,15 +57,9 @@ class ImmutableDict(Mapping, Generic[KT, VT]):
     def __hash__(self):
         return hash(tuple(sorted(self.data)))

-    def copy_pop(self, popped_key) -> Tuple[Optional[VT], "ImmutableDict[KT, VT]"]:
+    def copy_pop(self, popped_key) -> Tuple[Optional[VT], ImmutableDict[KT, VT]]:
         """Returns a copy of this ImmutableDict without the given key,
         as well as the value associated to the key."""
-        popped_value = None
-        new_items = []
-        for (key, value) in self.data:
-            if key == popped_key:
-                popped_value = value
-            else:
-                new_items.append((key, value))
+        new_items = copy.deepcopy(self._data)
+        popped_value: Optional[VT] = new_items.pop(popped_key, None)
         return (popped_value, ImmutableDict(new_items))
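The net effect of the ImmutableDict rework above is that lookups go through a real dict while .data keeps the historical tuple-of-pairs shape; an illustrative check:

from swh.model.collections import ImmutableDict

d = ImmutableDict({"a": 1, "b": 2})
assert d["a"] == 1                        # now an O(1) dict lookup, not a scan
assert d.data == (("a", 1), ("b", 2))     # .data is a computed property
assert hash(d) == hash(ImmutableDict(d))  # still hashable

popped, rest = d.copy_pop("a")
assert popped == 1 and dict(rest) == {"b": 2}  # d itself is unchanged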
# Copyright (C) 2022  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

"""Primitives for finding unknown content efficiently."""

from __future__ import annotations

from collections import namedtuple
import itertools
import logging
import random
from typing import (
    Any,
    Callable,
    Iterable,
    List,
    Mapping,
    NamedTuple,
    Optional,
    Set,
    Union,
)

from typing_extensions import Protocol, runtime_checkable

from .from_disk import model
from .model import Sha1Git

logger = logging.getLogger(__name__)

# Maximum amount when sampling from the undecided set of directory entries
SAMPLE_SIZE = 1000

# Sets of sha1 of contents, skipped contents and directories respectively
Sample: NamedTuple = namedtuple(
    "Sample", ["contents", "skipped_contents", "directories"]
)


@runtime_checkable
class ArchiveDiscoveryInterface(Protocol):
    """Interface used in discovery code to abstract over ways of connecting to
    the SWH archive (direct storage, web API, etc.) for all methods needed by
    discovery algorithms."""

    contents: List[model.Content]
    skipped_contents: List[model.SkippedContent]
    directories: List[model.Directory]

    def __init__(
        self,
        contents: List[model.Content],
        skipped_contents: List[model.SkippedContent],
        directories: List[model.Directory],
    ) -> None:
        self.contents = contents
        self.skipped_contents = skipped_contents
        self.directories = directories

    def content_missing(self, contents: List[Sha1Git]) -> Iterable[Sha1Git]:
        """List content missing from the archive by sha1"""

    def skipped_content_missing(
        self, skipped_contents: List[Sha1Git]
    ) -> Iterable[Sha1Git]:
        """List skipped content missing from the archive by sha1"""

    def directory_missing(self, directories: List[Sha1Git]) -> Iterable[Sha1Git]:
        """List directories missing from the archive by sha1"""


class BaseDiscoveryGraph:
    """Creates the base structures and methods needed for discovery algorithms.
    Subclasses should override ``get_sample`` to affect how the discovery is made.

    The `update_info_callback` is an optional argument that will get called for
    each new piece of information we get. The callback arguments are `(content,
    known)`.

    - content: the relevant model.Content object,
    - known: a boolean, True if the file is known to the archive False otherwise.
    """

    def __init__(
        self,
        contents,
        skipped_contents,
        directories,
        update_info_callback: Optional[Callable[[Any, bool], None]] = None,
    ):
        self._all_contents: Mapping[
            Sha1Git, Union[model.Content, model.SkippedContent]
        ] = {}
        self._undecided_directories: Set[Sha1Git] = set()
        self._children: Mapping[Sha1Git, Set[Sha1Git]] = {}
        self._parents: Mapping[model.DirectoryEntry, Set[Any]] = {}
        self.undecided: Set[Sha1Git] = set()

        for content in itertools.chain(contents, skipped_contents):
            self.undecided.add(content.sha1_git)
            self._all_contents[content.sha1_git] = content

        for directory in directories:
            self.undecided.add(directory.id)
            self._undecided_directories.add(directory.id)
            self._children[directory.id] = {c.target for c in directory.entries}
            for child in directory.entries:
                self._parents.setdefault(child.target, set()).add(directory.id)

        self.undecided |= self._undecided_directories
        self.known: Set[Sha1Git] = set()
        self.unknown: Set[Sha1Git] = set()

        self._update_info_callback = update_info_callback
        self._sha1_to_obj = {}
        for content in itertools.chain(contents, skipped_contents):
            self._sha1_to_obj[content.sha1_git] = content
        for directory in directories:
            self._sha1_to_obj[directory.id] = directory

    def mark_known(self, entries: Iterable[Sha1Git]):
        """Mark ``entries`` and those they imply as known in the SWH archive"""
        self._mark_entries(entries, self._children, self.known)

    def mark_unknown(self, entries: Iterable[Sha1Git]):
        """Mark ``entries`` and those they imply as unknown in the SWH archive"""
        self._mark_entries(entries, self._parents, self.unknown)

    def _mark_entries(
        self,
        entries: Iterable[Sha1Git],
        transitive_mapping: Mapping[Any, Any],
        target_set: Set[Any],
    ):
        """Use Merkle graph properties to mark a directory entry as known or unknown.

        If an entry is known, then all of its descendants are known. If it's
        unknown, then all of its ancestors are unknown.

        - ``entries``: directory entries to mark along with their ancestors/descendants
          where applicable.
        - ``transitive_mapping``: mapping from an entry to the next entries to mark
          in the hierarchy, if any.
        - ``target_set``: set where marked entries will be added.
        """
        callback = self._update_info_callback
        to_process = set(entries)
        while to_process:
            current = to_process.pop()
            target_set.add(current)
            new = current in self.undecided
            self.undecided.discard(current)
            self._undecided_directories.discard(current)
            next_entries = transitive_mapping.get(current, set()) & self.undecided
            to_process.update(next_entries)
            if new and callback is not None:
                obj = self._sha1_to_obj[current]
                callback(obj, current in self.known)

    def get_sample(
        self,
    ) -> Sample:
        """Return a three-tuple of samples from the undecided sets of contents,
        skipped contents and directories respectively.

        These samples will be queried against the storage which will tell us
        which are known."""
        raise NotImplementedError()

    def do_query(self, archive: ArchiveDiscoveryInterface, sample: Sample) -> None:
        """Given a three-tuple of samples, ask the archive which are known or
        unknown and mark them as such."""
        methods = (
            archive.content_missing,
            archive.skipped_content_missing,
            archive.directory_missing,
        )
        for sample_per_type, method in zip(sample, methods):
            if not sample_per_type:
                continue
            known = set(sample_per_type)
            unknown = set(method(list(sample_per_type)))
            known -= unknown
            self.mark_known(known)
            self.mark_unknown(unknown)


class RandomDirSamplingDiscoveryGraph(BaseDiscoveryGraph):
    """Use a random sampling using only directories.

    This allows us to find a statistically good spread of entries in the graph
    with a smaller population than using all types of entries. When there are
    no more directories, only contents or skipped contents are undecided if any
    are left: we send them directly to the storage since they should be few and
    their structure flat."""

    def get_sample(self) -> Sample:
        if self._undecided_directories:
            if len(self._undecided_directories) <= SAMPLE_SIZE:
                return Sample(
                    contents=set(),
                    skipped_contents=set(),
                    directories=set(self._undecided_directories),
                )
            sample = random.sample(tuple(self._undecided_directories), SAMPLE_SIZE)
            directories = {o for o in sample}
            return Sample(
                contents=set(), skipped_contents=set(), directories=directories
            )

        contents = set()
        skipped_contents = set()

        for sha1 in self.undecided:
            obj = self._all_contents[sha1]
            obj_type = obj.object_type
            if obj_type == model.Content.object_type:
                contents.add(sha1)
            elif obj_type == model.SkippedContent.object_type:
                skipped_contents.add(sha1)
            else:
                raise TypeError(f"Unexpected object type {obj_type}")

        return Sample(
            contents=contents, skipped_contents=skipped_contents, directories=set()
        )


def filter_known_objects(
    archive: ArchiveDiscoveryInterface,
    update_info_callback: Optional[Callable[[Any, bool], None]] = None,
):
    """Filter ``archive``'s ``contents``, ``skipped_contents`` and ``directories``
    to only return those that are unknown to the SWH archive using a discovery
    algorithm.

    The `update_info_callback` is an optional argument that will get called for
    each new piece of information we get. The callback arguments are `(content,
    known)`.

    - content: the relevant model.Content object,
    - known: a boolean, True if the file is known to the archive False otherwise.
    """
    contents = archive.contents
    skipped_contents = archive.skipped_contents
    directories = archive.directories

    contents_count = len(contents)
    skipped_contents_count = len(skipped_contents)
    directories_count = len(directories)

    graph = RandomDirSamplingDiscoveryGraph(
        contents,
        skipped_contents,
        directories,
        update_info_callback=update_info_callback,
    )

    while graph.undecided:
        sample = graph.get_sample()
        graph.do_query(archive, sample)

    contents = [c for c in contents if c.sha1_git in graph.unknown]
    skipped_contents = [c for c in skipped_contents if c.sha1_git in graph.unknown]
    directories = [c for c in directories if c.id in graph.unknown]

    logger.debug(
        "Filtered out %d contents, %d skipped contents and %d directories",
        contents_count - len(contents),
        skipped_contents_count - len(skipped_contents),
        directories_count - len(directories),
    )
    return (contents, skipped_contents, directories)
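To make the flow of filter_known_objects concrete, a sketch that drives it with an in-memory stand-in for the archive; StubArchive and the sample content are hypothetical, only the entry point comes from the module above:

from swh.model import model
from swh.model.discovery import filter_known_objects

class StubArchive:
    """Structurally satisfies ArchiveDiscoveryInterface, with a fixed set of
    sha1_gits the "archive" already knows."""

    def __init__(self, contents, skipped_contents, directories, known):
        self.contents = contents
        self.skipped_contents = skipped_contents
        self.directories = directories
        self._known = known

    def content_missing(self, ids):
        return [i for i in ids if i not in self._known]

    def skipped_content_missing(self, ids):
        return [i for i in ids if i not in self._known]

    def directory_missing(self, ids):
        return [i for i in ids if i not in self._known]

content = model.Content.from_data(b"hello\n")
archive = StubArchive([content], [], [], known=set())
unknown, _, _ = filter_known_objects(archive)
assert unknown == [content]  # the stub knows nothing, so the content is unknown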
@@ -27,7 +27,10 @@ def validate_against_schema(model, schema, value):
     if not isinstance(value, dict):
         raise ValidationError(
             "Unexpected type %(type)s for %(model)s, expected dict",
-            params={"model": model, "type": value.__class__.__name__,},
+            params={
+                "model": model,
+                "type": value.__class__.__name__,
+            },
             code="model-unexpected-type",
         )
......
@@ -96,7 +96,9 @@ def validate_hash(value, hash_type):
         raise ValidationError(
             "Unexpected type %(type)s for hash, expected str or bytes",
-            params={"type": value.__class__.__name__,},
+            params={
+                "type": value.__class__.__name__,
+            },
             code="unexpected-hash-value-type",
         )
......
@@ -18,7 +18,10 @@ def validate_type(value, type):
         typestr = type.__name__
     raise ValidationError(
         "Unexpected type %(type)s, expected %(expected_type)s",
-        params={"type": value.__class__.__name__, "expected_type": typestr,},
+        params={
+            "type": value.__class__.__name__,
+            "expected_type": typestr,
+        },
         code="unexpected-type",
     )
......
This diff is collapsed.
@@ -39,15 +39,32 @@ from .collections import ImmutableDict
 from .hashutil import git_object_header, hash_to_bytehex


+def content_git_object(content: model.Content) -> bytes:
+    """Formats a content as a git blob.
+
+    A content's identifier is the blob sha1 à la git of the tagged content.
+    """
+    content = cast(model.Content, content)
+    if content.data is None:
+        raise model.MissingData("Content data is None, cannot format.")
+
+    return git_object_header("blob", len(content.data)) + content.data
+
+
 def directory_entry_sort_key(entry: model.DirectoryEntry):
     """The sorting key for tree entries"""
     if isinstance(entry, dict):
         # For backward compatibility
-        entry = model.DirectoryEntry.from_dict(entry)
-    if entry.type == "dir":
-        return entry.name + b"/"
+        type_ = entry["type"]
+        name = entry["name"]
     else:
-        return entry.name
+        type_ = entry.type
+        name = entry.name
+
+    if type_ == "dir":
+        return name + b"/"
+    else:
+        return name


 @lru_cache()
@@ -177,7 +194,13 @@ def directory_git_object(directory: Union[Dict, model.Directory]) -> bytes:
     for entry in sorted(directory.entries, key=directory_entry_sort_key):
         components.extend(
-            [_perms_to_bytes(entry.perms), b"\x20", entry.name, b"\x00", entry.target,]
+            [
+                _perms_to_bytes(entry.perms),
+                b"\x20",
+                entry.name,
+                b"\x00",
+                entry.target,
+            ]
         )

     return format_git_object_from_parts("tree", components)
@@ -221,10 +244,7 @@ def format_git_object_from_headers(
     if message is not None:
         entries.extend((b"\n", message))

-    concatenated_entries = b"".join(entries)
-
-    header = git_object_header(git_type, len(concatenated_entries))
-    return header + concatenated_entries
+    return format_git_object_from_parts(git_type, entries)


 def format_git_object_from_parts(git_type: str, parts: Iterable[bytes]) -> bytes:
@@ -340,10 +360,15 @@ def revision_git_object(revision: Union[Dict, model.Revision]) -> bytes:
         if parent:
             headers.append((b"parent", hash_to_bytehex(parent)))

-    headers.append((b"author", format_author_data(revision.author, revision.date)))
-    headers.append(
-        (b"committer", format_author_data(revision.committer, revision.committer_date),)
-    )
+    if revision.author is not None:
+        headers.append((b"author", format_author_data(revision.author, revision.date)))
+    if revision.committer is not None:
+        headers.append(
+            (
+                b"committer",
+                format_author_data(revision.committer, revision.committer_date),
+            )
+        )

     # Handle extra headers
     metadata = revision.metadata or ImmutableDict()
@@ -356,14 +381,14 @@ def revision_git_object(revision: Union[Dict, model.Revision]) -> bytes:
     return format_git_object_from_headers("commit", headers, revision.message)


-def target_type_to_git(target_type: model.ObjectType) -> bytes:
+def target_type_to_git(target_type: model.ReleaseTargetType) -> bytes:
     """Convert a software heritage target type to a git object type"""
     return {
-        model.ObjectType.CONTENT: b"blob",
-        model.ObjectType.DIRECTORY: b"tree",
-        model.ObjectType.REVISION: b"commit",
-        model.ObjectType.RELEASE: b"tag",
-        model.ObjectType.SNAPSHOT: b"refs",
+        model.ReleaseTargetType.CONTENT: b"blob",
+        model.ReleaseTargetType.DIRECTORY: b"tree",
+        model.ReleaseTargetType.REVISION: b"commit",
+        model.ReleaseTargetType.RELEASE: b"tag",
+        model.ReleaseTargetType.SNAPSHOT: b"refs",
     }[target_type]
@@ -391,7 +416,9 @@ def release_git_object(release: Union[Dict, model.Release]) -> bytes:
     return format_git_object_from_headers("tag", headers, release.message)


-def snapshot_git_object(snapshot: Union[Dict, model.Snapshot]) -> bytes:
+def snapshot_git_object(
+    snapshot: Union[Dict, model.Snapshot], *, ignore_unresolved: bool = False
+) -> bytes:
     """Formats a snapshot as a git-like object.

     Snapshots are a set of named branches, which are pointers to objects at any
@@ -435,6 +462,10 @@ def snapshot_git_object(snapshot: Union[Dict, model.Snapshot]) -> bytes:
     Note that, akin to directory manifests, there is no separator between
     entries. Because of symbolic branches, identifiers are of arbitrary
     length but are length-encoded to avoid ambiguity.

+    Args:
+        ignore_unresolved: if False (the default), raises an exception when
+            alias branches point to non-existing branches
+
     """
     if isinstance(snapshot, dict):
         # For backward compatibility
@@ -454,7 +485,7 @@ def snapshot_git_object(snapshot: Union[Dict, model.Snapshot]) -> bytes:
         if not target:
             target_type = b"dangling"
             target_id = b""
-        elif target.target_type == model.TargetType.ALIAS:
+        elif target.target_type == model.SnapshotTargetType.ALIAS:
             target_type = b"alias"
             target_id = target.target
             if target_id not in snapshot.branches or target_id == name:
@@ -474,7 +505,7 @@ def snapshot_git_object(snapshot: Union[Dict, model.Snapshot]) -> bytes:
             ]
         )

-    if unresolved:
+    if unresolved and not ignore_unresolved:
         raise ValueError(
             "Branch aliases unresolved: %s"
             % ", ".join("%r -> %r" % x for x in unresolved),
@@ -485,7 +516,7 @@ def snapshot_git_object(snapshot: Union[Dict, model.Snapshot]) -> bytes:
 def raw_extrinsic_metadata_git_object(
-    metadata: Union[Dict, model.RawExtrinsicMetadata]
+    metadata: Union[Dict, model.RawExtrinsicMetadata],
 ) -> bytes:
     """Formats RawExtrinsicMetadata as a git-like object.
@@ -559,7 +590,10 @@ def raw_extrinsic_metadata_git_object(
             b"authority",
             f"{metadata.authority.type.value} {metadata.authority.url}".encode(),
         ),
-        (b"fetcher", f"{metadata.fetcher.name} {metadata.fetcher.version}".encode(),),
+        (
+            b"fetcher",
+            f"{metadata.fetcher.name} {metadata.fetcher.version}".encode(),
+        ),
         (b"format", metadata.format.encode()),
     ]
@@ -597,6 +631,8 @@ def extid_git_object(extid: model.ExtID) -> bytes:
         [extid_version $Str]
         extid $Bytes
         target $CoreSwhid
+        [payload_type $StrWithoutSpaces]
+        [payload $ContentIdentifier]

     ```

     $StrWithoutSpaces is an ASCII string, and may not contain spaces.
@@ -605,6 +641,10 @@ def extid_git_object(extid: model.ExtID) -> bytes:
     space after them.

     The extid_version line is only generated if the version is non-zero.

+    The payload_type and payload lines are only generated if they are not
+    :const:`None`. $ContentIdentifier is the object ID of a content object.
+
     """

     headers = [
@@ -615,7 +655,18 @@ def extid_git_object(extid: model.ExtID) -> bytes:
         headers.append((b"extid_version", str(extid_version).encode("ascii")))

     headers.extend(
-        [(b"extid", extid.extid), (b"target", str(extid.target).encode("ascii")),]
+        [
+            (b"extid", extid.extid),
+            (b"target", str(extid.target).encode("ascii")),
+        ]
     )

+    payload_type = extid.payload_type
+    if payload_type is not None:
+        headers.append((b"payload_type", payload_type.encode("ascii")))
+
+    payload = extid.payload
+    if payload is not None:
+        headers.append((b"payload", payload))
+
     return format_git_object_from_headers("extid", headers)
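As a sanity check on the tree manifest format handled above, a small sketch that formats a one-entry directory and prints its id; the entry's name, target, and permissions are made-up values:

from swh.model import model
from swh.model.git_objects import directory_git_object
from swh.model.hashutil import hash_to_hex

d = model.Directory(
    entries=(
        model.DirectoryEntry(
            name=b"README", type="file", target=bytes(20), perms=0o100644
        ),
    )
)
manifest = directory_git_object(d)
assert manifest.startswith(b"tree ")  # git-style header: b"tree <length>\x00"
print(hash_to_hex(d.id))              # sha1 of the manifest, the SWHID's hash part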
@@ -56,9 +56,11 @@ import functools
 import hashlib
 from io import BytesIO
 import os
-from typing import Callable, Dict, Optional
+from typing import Callable, Dict, Optional, Union

-ALGORITHMS = set(["sha1", "sha256", "sha1_git", "blake2s256", "blake2b512", "md5"])
+ALGORITHMS = set(
+    ["sha1", "sha256", "sha1_git", "blake2s256", "blake2b512", "md5", "sha512"]
+)
 """Hashing algorithms supported by this module"""

 DEFAULT_ALGORITHMS = set(["sha1", "sha256", "sha1_git", "blake2s256"])
@@ -70,7 +72,7 @@ Subset of :const:`ALGORITHMS`.
 HASH_BLOCK_SIZE = 32768
 """Block size for streaming hash computations made in this module"""

-_blake2_hash_cache = {}  # type: Dict[str, Callable]
+_blake2_hash_cache: Dict[str, Callable] = {}


 class MultiHash:
@@ -160,9 +162,7 @@ class MultiHash:
 def _new_blake2_hash(algo):
-    """Return a function that initializes a blake2 hash.
-
-    """
+    """Return a function that initializes a blake2 hash."""
     if algo in _blake2_hash_cache:
         return _blake2_hash_cache[algo]()
@@ -295,7 +295,7 @@ def hash_git_data(data, git_type, base_algo="sha1"):
 @functools.lru_cache()
-def hash_to_hex(hash):
+def hash_to_hex(hash: Union[str, bytes]) -> str:
     """Converts a hash (in hex or bytes form) to its hexadecimal ascii form

     Args:
@@ -311,7 +311,7 @@ def hash_to_hex(hash):
 @functools.lru_cache()
-def hash_to_bytehex(hash):
+def hash_to_bytehex(hash: bytes) -> bytes:
     """Converts a hash to its hexadecimal bytes representation

     Args:
@@ -324,7 +324,7 @@ def hash_to_bytehex(hash):
 @functools.lru_cache()
-def hash_to_bytes(hash):
+def hash_to_bytes(hash: Union[str, bytes]) -> bytes:
     """Converts a hash (in hex or bytes form) to its raw bytes form

     Args:
@@ -340,7 +340,7 @@ def hash_to_bytes(hash):
 @functools.lru_cache()
-def bytehex_to_hash(hex):
+def bytehex_to_hash(hex: bytes) -> bytes:
     """Converts a hexadecimal bytes representation of a hash to that hash

     Args:
......
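A round-trip sketch for the hashutil helpers annotated in this hunk; the input bytes are arbitrary:

from swh.model.hashutil import MultiHash, hash_to_bytes, hash_to_hex

digests = MultiHash.from_data(b"foo\n").digest()
assert {"sha1", "sha1_git", "sha256", "blake2s256"} <= set(digests)

hex_sha1 = hash_to_hex(digests["sha1"])            # bytes -> hex str
assert hash_to_bytes(hex_sha1) == digests["sha1"]  # hex str -> raw bytes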
This diff is collapsed.
# Copyright (C) 2015-2021  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from typing import Any, Dict
import warnings

from . import model

# Reexport for backward compatibility
from .git_objects import *  # noqa
from .hashutil import MultiHash, hash_to_hex

# Reexport for backward compatibility
from .swhids import *  # noqa

warnings.warn(
    "The swh.model.identifiers module is deprecated. "
    "SWHID-related classes were moved to swh.model.swhids, and identifier "
    "computation is now done directly with swh.model.model classes.",
    DeprecationWarning,
    stacklevel=2,
)

# The following are deprecated aliases of the variants defined in ObjectType
# while transitioning from SWHID to QualifiedSWHID
ORIGIN = "origin"
SNAPSHOT = "snapshot"
REVISION = "revision"
RELEASE = "release"
DIRECTORY = "directory"
CONTENT = "content"
RAW_EXTRINSIC_METADATA = "raw_extrinsic_metadata"


def content_identifier(content: Dict[str, Any]) -> Dict[str, bytes]:
    """Deprecated, use :class:`swh.model.Content` instead:
    ``content_identifier(d)`` is equivalent to:
    ``{k: hash_to_hex(v) for (k, v) in Content.from_data(d["data"]).hashes().items()}``
    """
    return MultiHash.from_data(content["data"]).digest()


def directory_identifier(directory: Dict[str, Any]) -> str:
    """Deprecated, use :class:`swh.model.Directory` instead:
    ``directory_identifier(d)`` is equivalent to:
    ``hash_to_hex(Directory.from_dict(d).id)``.

    See :func:`swh.model.git_objects.directory_git_object` for details of the
    format used to generate this identifier."""
    return hash_to_hex(model.Directory.from_dict(directory).id)


def revision_identifier(revision: Dict[str, Any]) -> str:
    """Deprecated, use :class:`swh.model.Revision` instead:
    ``revision_identifier(d)`` is equivalent to:
    ``hash_to_hex(Revision.from_dict(d).id)``.

    See :func:`swh.model.git_objects.revision_git_object` for details of the
    format used to generate this identifier."""
    return hash_to_hex(model.Revision.from_dict(revision).id)


def release_identifier(release: Dict[str, Any]) -> str:
    """Deprecated, use :class:`swh.model.Release` instead:
    ``release_identifier(d)`` is equivalent to:
    ``hash_to_hex(Release.from_dict(d).id)``.

    See :func:`swh.model.git_objects.release_git_object` for details of the
    format used to generate this identifier."""
    return hash_to_hex(model.Release.from_dict(release).id)


def snapshot_identifier(snapshot: Dict[str, Any]) -> str:
    """Deprecated, use :class:`swh.model.Snapshot` instead:
    ``snapshot_identifier(d)`` is equivalent to:
    ``hash_to_hex(Snapshot.from_dict(d).id)``.

    See :func:`swh.model.git_objects.snapshot_git_object` for details of the
    format used to generate this identifier."""
    return hash_to_hex(model.Snapshot.from_dict(snapshot).id)


def origin_identifier(origin):
    """Deprecated, use :class:`swh.model.Origin` instead:
    ``origin_identifier(url)`` is equivalent to:
    ``hash_to_hex(Origin(url=url).id)``.
    """
    return hash_to_hex(model.Origin.from_dict(origin).id)
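The shim above maps each deprecated helper onto the new model classes; the equivalences are stated in the docstrings. A quick check for the directory case (importing the module emits the DeprecationWarning):

from swh.model import model
from swh.model.hashutil import hash_to_hex
from swh.model.identifiers import directory_identifier

empty_dir = {"entries": []}
assert directory_identifier(empty_dir) == hash_to_hex(
    model.Directory.from_dict(empty_dir).id
)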
-# Copyright (C) 2017-2020  The Software Heritage developers
+# Copyright (C) 2017-2022  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 """Merkle tree data structure"""

+from __future__ import annotations
+
 import abc
-from collections.abc import Mapping
-from typing import Dict, Iterator, List, Set
-
-
-def deep_update(left, right):
-    """Recursively update the left mapping with deeply nested values from the right
-    mapping.
-
-    This function is useful to merge the results of several calls to
-    :func:`MerkleNode.collect`.
-
-    Arguments:
-      left: a mapping (modified by the update operation)
-      right: a mapping
-
-    Returns:
-      the left mapping, updated with nested values from the right mapping
-
-    Example:
-        >>> a = {
-        ...     'key1': {
-        ...         'key2': {
-        ...             'key3': 'value1/2/3',
-        ...         },
-        ...     },
-        ... }
-        >>> deep_update(a, {
-        ...     'key1': {
-        ...         'key2': {
-        ...             'key4': 'value1/2/4',
-        ...         },
-        ...     },
-        ... }) == {
-        ...     'key1': {
-        ...         'key2': {
-        ...             'key3': 'value1/2/3',
-        ...             'key4': 'value1/2/4',
-        ...         },
-        ...     },
-        ... }
-        True
-        >>> deep_update(a, {
-        ...     'key1': {
-        ...         'key2': {
-        ...             'key3': 'newvalue1/2/3',
-        ...         },
-        ...     },
-        ... }) == {
-        ...     'key1': {
-        ...         'key2': {
-        ...             'key3': 'newvalue1/2/3',
-        ...             'key4': 'value1/2/4',
-        ...         },
-        ...     },
-        ... }
-        True
-    """
-    for key, rvalue in right.items():
-        if isinstance(rvalue, Mapping):
-            new_lvalue = deep_update(left.get(key, {}), rvalue)
-            left[key] = new_lvalue
-        else:
-            left[key] = rvalue
-    return left
+from typing import Any, Dict, Iterator, List, Set


 class MerkleNode(dict, metaclass=abc.ABCMeta):
@@ -141,7 +79,7 @@ class MerkleNode(dict, metaclass=abc.ABCMeta):
         for parent in self.parents:
             parent.invalidate_hash()

-    def update_hash(self, *, force=False):
+    def update_hash(self, *, force=False) -> Any:
         """Recursively compute the hash of the current node.

         Args:
@@ -161,14 +99,17 @@ class MerkleNode(dict, metaclass=abc.ABCMeta):
         return self.__hash

     @property
-    def hash(self):
+    def hash(self) -> Any:
         """The hash of the current node, as calculated by
         :func:`compute_hash`.
         """
         return self.update_hash()

+    def __hash__(self):
+        return hash(self.hash)
+
     @abc.abstractmethod
-    def compute_hash(self):
+    def compute_hash(self) -> Any:
         """Compute the hash of the current node.

         The hash should depend on the data of the node, as well as on hashes
@@ -223,47 +164,24 @@ class MerkleNode(dict, metaclass=abc.ABCMeta):
         """
         return self.data

-    def collect_node(self, **kwargs):
-        """Collect the data for the current node, for use by :func:`collect`.
-
-        Arguments:
-          kwargs: passed as-is to :func:`get_data`.
-
-        Returns:
-          A :class:`dict` compatible with :func:`collect`.
-        """
+    def collect_node(self) -> Set[MerkleNode]:
+        """Collect the current node if it has not been yet, for use by :func:`collect`."""
         if not self.collected:
             self.collected = True
-            return {self.object_type: {self.hash: self.get_data(**kwargs)}}
+            return {self}
         else:
-            return {}
+            return set()

-    def collect(self, **kwargs):
-        """Collect the data for all nodes in the subtree rooted at `self`.
-
-        The data is deduplicated by type and by hash.
-
-        Arguments:
-          kwargs: passed as-is to :func:`get_data`.
+    def collect(self) -> Set[MerkleNode]:
+        """Collect the added and modified nodes in the subtree rooted at `self`
+        since the last collect operation.

         Returns:
-          A :class:`dict` with the following structure::
-
-            {
-              'typeA': {
-                node1.hash: node1.get_data(),
-                node2.hash: node2.get_data(),
-              },
-              'typeB': {
-                node3.hash: node3.get_data(),
-                ...
-              },
-              ...
-            }
+          A :class:`set` of collected nodes
         """
-        ret = self.collect_node(**kwargs)
+        ret = self.collect_node()
         for child in self.values():
-            deep_update(ret, child.collect(**kwargs))
+            ret.update(child.collect())

         return ret
@@ -277,14 +195,14 @@ class MerkleNode(dict, metaclass=abc.ABCMeta):
         for child in self.values():
             child.reset_collect()

-    def iter_tree(self, dedup=True) -> Iterator["MerkleNode"]:
+    def iter_tree(self, dedup=True) -> Iterator[MerkleNode]:
         """Yields all children nodes, recursively. Common nodes are deduplicated
         by default (deduplication can be turned off setting the given argument
         'dedup' to False).
         """
-        yield from self._iter_tree(set(), dedup)
+        yield from self._iter_tree(seen=set(), dedup=dedup)

-    def _iter_tree(self, seen: Set[bytes], dedup) -> Iterator["MerkleNode"]:
+    def _iter_tree(self, seen: Set[bytes], dedup) -> Iterator[MerkleNode]:
         if self.hash not in seen:
             if dedup:
                 seen.add(self.hash)
@@ -299,7 +217,7 @@ class MerkleLeaf(MerkleNode):
     A Merkle leaf is simply a Merkle node with children disabled.
     """

-    __slots__ = []  # type: List[str]
+    __slots__: List[str] = []

     def __setitem__(self, name, child):
         raise ValueError("%s is a leaf" % self.__class__.__name__)
......
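A minimal sketch of the new collect() contract (sets of nodes rather than nested dicts); the Leaf/Tree subclasses and their hashing scheme are invented for illustration:

import hashlib

from swh.model.merkle import MerkleLeaf, MerkleNode

class Leaf(MerkleLeaf):
    object_type = "leaf"

    def compute_hash(self):
        return hashlib.sha1(self.data["content"]).digest()

class Tree(MerkleNode):
    object_type = "tree"

    def compute_hash(self):
        # a node's hash covers its children's hashes, Merkle-style
        return hashlib.sha1(b"".join(c.hash for c in self.values())).digest()

root = Tree()
root[b"a"] = Leaf({"content": b"foo"})
assert root.collect() == {root, root[b"a"]}  # a set of nodes
assert root.collect() == set()               # nothing new since last collect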
This diff is collapsed.
@@ -123,6 +123,9 @@ class _BaseSWHID(Generic[_TObjectType]):
         )

     def __str__(self) -> str:
+        return self._format_core_swhid()
+
+    def _format_core_swhid(self) -> str:
         return SWHID_SEP.join(
             [
                 self.namespace,
@@ -194,6 +197,17 @@ class CoreSWHID(_BaseSWHID[ObjectType]):
             object_id=self.object_id,
         )

+    def to_qualified(self) -> QualifiedSWHID:
+        """Converts this CoreSWHID into a QualifiedSWHID.
+
+        As QualifiedSWHID is a superset of CoreSWHID, this is lossless."""
+        return QualifiedSWHID(
+            namespace=self.namespace,
+            scheme_version=self.scheme_version,
+            object_type=self.object_type,
+            object_id=self.object_id,
+        )
+

 def _parse_core_swhid(swhid: Union[str, CoreSWHID, None]) -> Optional[CoreSWHID]:
     if swhid is None or isinstance(swhid, CoreSWHID):
@@ -203,7 +217,7 @@ def _parse_core_swhid(swhid: Union[str, CoreSWHID, None]) -> Optional[CoreSWHID]
 def _parse_lines_qualifier(
-    lines: Union[str, Tuple[int, Optional[int]], None]
+    lines: Union[str, Tuple[int, Optional[int]], None],
 ) -> Optional[Tuple[int, Optional[int]]]:
     try:
         if lines is None or isinstance(lines, tuple):
@@ -291,8 +305,9 @@ class QualifiedSWHID(_BaseSWHID[ObjectType]):
     when the anchor denotes a snapshot, the root directory is the one pointed to by HEAD
     (possibly indirectly), and undefined if such a reference is missing"""

+    Lines = Tuple[int, Optional[int]]
     lines = attr.ib(
-        type=Optional[Tuple[int, Optional[int]]],
+        type=Optional[Lines],
         default=None,
         validator=type_validator(),
         converter=_parse_lines_qualifier,
@@ -321,15 +336,26 @@ class QualifiedSWHID(_BaseSWHID[ObjectType]):
                 params={"type": value.object_type.value},
             )

+    def to_dict(self) -> Dict[str, Optional[str | bytes | CoreSWHID | Lines]]:
+        """Returns a dictionary version of this QSWHID for json serialization"""
+        return {
+            "swhid": self._format_core_swhid(),
+            "origin": self.origin,
+            "visit": self.visit,
+            "anchor": self.anchor,
+            "path": self.path,
+            "lines": self.lines,
+        }
+
     def qualifiers(self) -> Dict[str, str]:
         """Returns URL-escaped qualifiers of this SWHID, for use in serialization"""
         origin = self.origin
         if origin:
             unescaped_origin = origin
             origin = origin.replace("%", "%25")
             origin = origin.replace(";", "%3B")
-            assert urllib.parse.unquote_to_bytes(
-                origin
-            ) == urllib.parse.unquote_to_bytes(
-                unescaped_origin
+            assert (
+                urllib.parse.unquote(origin) == unescaped_origin
             ), "Escaping ';' in the origin qualifier corrupted the origin URL."

         d: Dict[str, Optional[str]] = {
@@ -350,14 +376,7 @@ class QualifiedSWHID(_BaseSWHID[ObjectType]):
         return {k: v for (k, v) in d.items() if v is not None}

     def __str__(self) -> str:
-        swhid = SWHID_SEP.join(
-            [
-                self.namespace,
-                str(self.scheme_version),
-                self.object_type.value,
-                hash_to_hex(self.object_id),
-            ]
-        )
+        swhid = self._format_core_swhid()
         qualifiers = self.qualifiers()
         if qualifiers:
             for k, v in qualifiers.items():
@@ -377,6 +396,9 @@ class QualifiedSWHID(_BaseSWHID[ObjectType]):
                 "Invalid qualifier(s): %(qualifiers)s",
                 params={"qualifiers": ", ".join(invalid_qualifiers)},
             )

+        if "origin" in qualifiers:
+            qualifiers["origin"] = urllib.parse.unquote(qualifiers["origin"])
+
         try:
             return QualifiedSWHID(**parts, **qualifiers)
         except ValueError as e:
......
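A usage sketch for the two additions above, to_qualified() and origin unquoting at parse time; the SWHID value and origin URL are arbitrary:

from swh.model.swhids import CoreSWHID, QualifiedSWHID

core = CoreSWHID.from_string("swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2")
qualified = core.to_qualified()
assert str(qualified) == str(core)  # lossless: no qualifiers yet

parsed = QualifiedSWHID.from_string(f"{core};origin=https://example.com/repo%3Bname")
assert parsed.origin == "https://example.com/repo;name"         # unquoted on parse
assert "origin=https://example.com/repo%3Bname" in str(parsed)  # re-escaped on print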
@@ -157,7 +157,9 @@ class ValidateCompound(unittest.TestCase):
     def test_validate_whole_schema_shortcut_previous_error(self):
         with self.assertRaises(ValidationError) as cm:
             compound.validate_against_schema(
-                self.test_model, self.test_schema_shortcut, self.test_value_missing,
+                self.test_model,
+                self.test_schema_shortcut,
+                self.test_value_missing,
             )

         exc = cm.exception
@@ -167,7 +169,9 @@ class ValidateCompound(unittest.TestCase):
     def test_validate_whole_schema(self):
         with self.assertRaises(ValidationError) as cm:
             compound.validate_against_schema(
-                self.test_model, self.test_schema_shortcut, self.test_value,
+                self.test_model,
+                self.test_schema_shortcut,
+                self.test_value,
             )

         # The exception should be of the form:
......
This diff is collapsed.
@@ -19,7 +19,6 @@ from swh.model.tests.swh_model_data import SAMPLE_FOLDER_SWHIDS
 from swh.model.tests.test_from_disk import DataMixin


-@pytest.mark.fs
 class TestIdentify(DataMixin, unittest.TestCase):
     def setUp(self):
         super().setUp()
@@ -78,7 +77,9 @@ class TestIdentify(DataMixin, unittest.TestCase):
         with unittest.mock.patch.dict(sys.modules, {"dulwich": None}):
             with tempfile.TemporaryDirectory(prefix="swh.model.cli") as d:
                 result = self.runner.invoke(
-                    cli.identify, ["--type", "snapshot", d], catch_exceptions=False,
+                    cli.identify,
+                    ["--type", "snapshot", d],
+                    catch_exceptions=False,
                 )
         assert result.exit_code == 1
@@ -94,7 +95,8 @@ class TestIdentify(DataMixin, unittest.TestCase):
         """identify symlink --- both itself and target"""
         regular = os.path.join(self.tmpdir_name, b"foo.txt")
         link = os.path.join(self.tmpdir_name, b"bar.txt")
-        open(regular, "w").write("foo\n")
+        with open(regular, "w") as f:
+            f.write("foo\n")

         os.symlink(os.path.basename(regular), link)
         result = self.runner.invoke(cli.identify, [link])
......
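In the same spirit as the tests above, a self-contained sketch of invoking the identify command in-process; the expected value is git's well-known blob hash for b"foo\n":

from click.testing import CliRunner

from swh.model import cli

runner = CliRunner()
with runner.isolated_filesystem():
    with open("foo.txt", "w") as f:
        f.write("foo\n")
    result = runner.invoke(cli.identify, ["--no-filename", "foo.txt"])

assert result.exit_code == 0
assert result.output.strip() == "swh:1:cnt:257cc5642cb1a054f08cc83f2d943e56fd3ebe99"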