Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Showing 4851 additions and 989 deletions.
swh.core >= 0.3
Click
dulwich
pytest
aiohttp
click
pytest >= 8.1
pytz
types-click
types-python-dateutil
types-pytz
types-deprecated
# Add here external Python modules dependencies, one per line. Module names
# should match https://pypi.python.org/pypi names. For the full spec of
# dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html
vcversioner
attrs
attrs != 21.1.0 # https://github.com/python-attrs/attrs/issues/804
attrs_strict >= 0.0.7
deprecated
hypothesis
iso8601
python-dateutil
typing_extensions
#!/usr/bin/env python3
# Copyright (C) 2015-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from setuptools import setup, find_packages
from os import path
from io import open
here = path.abspath(path.dirname(__file__))
# Get the long description from the README file
with open(path.join(here, 'README.md'), encoding='utf-8') as f:
long_description = f.read()
def parse_requirements(name=None):
if name:
reqf = 'requirements-%s.txt' % name
else:
reqf = 'requirements.txt'
requirements = []
if not path.exists(reqf):
return requirements
with open(reqf) as f:
for line in f.readlines():
line = line.strip()
if not line or line.startswith('#'):
continue
requirements.append(line)
return requirements
blake2_requirements = ['pyblake2;python_version<"3.6"']
setup(
name='swh.model',
description='Software Heritage data model',
long_description=long_description,
long_description_content_type='text/markdown',
author='Software Heritage developers',
author_email='swh-devel@inria.fr',
url='https://forge.softwareheritage.org/diffusion/DMOD/',
packages=find_packages(),
setup_requires=['vcversioner'],
install_requires=(parse_requirements() + parse_requirements('swh') +
blake2_requirements),
extras_require={
'cli': parse_requirements('cli'),
'testing': parse_requirements('test'),
},
vcversioner={},
include_package_data=True,
entry_points='''
[console_scripts]
swh-identify=swh.model.cli:identify
[swh.cli.subcommands]
identify=swh.model.cli:identify
''',
classifiers=[
"Programming Language :: Python :: 3",
"Intended Audience :: Developers",
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
"Operating System :: OS Independent",
"Development Status :: 5 - Production/Stable",
],
project_urls={
'Bug Reports': 'https://forge.softwareheritage.org/maniphest',
'Funding': 'https://www.softwareheritage.org/donate',
'Source': 'https://forge.softwareheritage.org/source/swh-model',
},
)
from pkgutil import extend_path
from typing import Iterable
__path__ = extend_path(__path__, __name__) # type: Iterable[str]
# Copyright (C) 2018-2019 The Software Heritage developers
# Copyright (C) 2018-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import click
import dulwich.repo
import os
import sys
from typing import Callable, Dict, Iterable, Optional
from functools import partial
from urllib.parse import urlparse
# WARNING: do not import unnecessary things here to keep cli startup time under
# control
try:
import click
except ImportError:
print(
"Cannot run swh-identify; the Click package is not installed."
"Please install 'swh.model[cli]' for full functionality.",
file=sys.stderr,
)
sys.exit(1)
from swh.model import hashutil
from swh.model import identifiers as pids
from swh.model.exceptions import ValidationError
from swh.model.from_disk import Content, Directory
try:
import swh.core.cli
cli_command = swh.core.cli.swh.command
except ImportError:
# stub so that swh-identify can be used when swh-core isn't installed
cli_command = click.command
CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])
from swh.model.from_disk import Directory
from swh.model.swhids import CoreSWHID
CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
# Mapping between dulwich types and Software Heritage ones. Used by snapshot ID
# computation.
_DULWICH_TYPES = {
b'blob': 'content',
b'tree': 'directory',
b'commit': 'revision',
b'tag': 'release',
b"blob": "content",
b"tree": "directory",
b"commit": "revision",
b"tag": "release",
}
class PidParamType(click.ParamType):
name = 'persistent identifier'
class CoreSWHIDParamType(click.ParamType):
"""Click argument that accepts a core SWHID and returns them as
:class:`swh.model.swhids.CoreSWHID` instances"""
name = "SWHID"
def convert(self, value, param, ctx) -> CoreSWHID:
from swh.model.exceptions import ValidationError
def convert(self, value, param, ctx):
try:
pids.parse_persistent_identifier(value)
return value # return as string, as we need just that
return CoreSWHID.from_string(value)
except ValidationError as e:
self.fail('%s is not a valid PID. %s.' % (value, e), param, ctx)
self.fail(f'"{value}" is not a valid core SWHID: {e}', param, ctx)
def swhid_of_file(path) -> CoreSWHID:
from swh.model.from_disk import Content
object = Content.from_file(path=path)
return object.swhid()
def swhid_of_file_content(data) -> CoreSWHID:
from swh.model.from_disk import Content
object = Content.from_bytes(mode=644, data=data)
return object.swhid()
def model_of_dir(
path: bytes,
exclude_patterns: Optional[Iterable[bytes]] = None,
update_info: Optional[Callable[[int], None]] = None,
) -> Directory:
from swh.model.from_disk import accept_all_paths, ignore_directories_patterns
path_filter = (
ignore_directories_patterns(path, exclude_patterns)
if exclude_patterns
else accept_all_paths
)
return Directory.from_disk(
path=path, path_filter=path_filter, progress_callback=update_info
)
def pid_of_file(path):
object = Content.from_file(path=path).get_data()
return pids.persistent_identifier(pids.CONTENT, object)
def swhid_of_dir(
path: bytes, exclude_patterns: Optional[Iterable[bytes]] = None
) -> CoreSWHID:
obj = model_of_dir(path, exclude_patterns)
return obj.swhid()
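# Illustrative usage (not part of the diff) of the SWHID helpers above; the
# directory path is hypothetical.
from swh.model.from_disk import Content, Directory

print(Content.from_bytes(mode=644, data=b"hello\n").swhid())
# -> swh:1:cnt:ce013625030ba8dba906f756967f9e9ca394464a (git blob sha1 of b"hello\n")
print(Directory.from_disk(path=b"/tmp/project").swhid())
# -> swh:1:dir:... (depends on the directory contents)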
def pid_of_file_content(data):
object = Content.from_bytes(mode=644, data=data).get_data()
return pids.persistent_identifier(pids.CONTENT, object)
def swhid_of_origin(url):
from swh.model.model import Origin
def pid_of_dir(path):
object = Directory.from_disk(path=path).get_data()
return pids.persistent_identifier(pids.DIRECTORY, object)
return Origin(url).swhid()
def pid_of_origin(url):
pid = pids.PersistentId(object_type='origin',
object_id=pids.origin_identifier({'url': url}))
return str(pid)
def swhid_of_git_repo(path) -> CoreSWHID:
try:
import dulwich.repo
except ImportError:
raise click.ClickException(
"Cannot compute snapshot identifier; the Dulwich package is not installed. "
"Please install 'swh.model[cli]' for full functionality.",
)
from swh.model import hashutil
from swh.model.model import Snapshot
def pid_of_git_repo(path):
repo = dulwich.repo.Repo(path)
branches = {}
branches: Dict[bytes, Optional[Dict]] = {}
for ref, target in repo.refs.as_dict().items():
obj = repo[target]
if obj:
branches[ref] = {
'target': hashutil.bytehex_to_hash(target),
'target_type': _DULWICH_TYPES[obj.type_name],
"target": hashutil.bytehex_to_hash(target),
"target_type": _DULWICH_TYPES[obj.type_name],
}
else:
branches[ref] = None
for ref, target in repo.refs.get_symrefs().items():
branches[ref] = {
'target': target,
'target_type': 'alias',
"target": target,
"target_type": "alias",
}
snapshot = {'branches': branches}
snapshot = {"branches": branches}
pid = pids.PersistentId(object_type='snapshot',
object_id=pids.snapshot_identifier(snapshot))
return str(pid)
return Snapshot.from_dict(snapshot).swhid()
def identify_object(obj_type, follow_symlinks, obj):
if obj_type == 'auto':
if obj == '-' or os.path.isfile(obj):
obj_type = 'content'
def identify_object(
obj_type: str, follow_symlinks: bool, exclude_patterns: Iterable[bytes], obj
) -> str:
from urllib.parse import urlparse
if obj_type == "auto":
if obj == "-" or os.path.isfile(obj):
obj_type = "content"
elif os.path.isdir(obj):
obj_type = 'directory'
obj_type = "directory"
else:
try: # URL parsing
if urlparse(obj).scheme:
obj_type = 'origin'
obj_type = "origin"
else:
raise ValueError
except ValueError:
raise click.BadParameter('cannot detect object type for %s' %
obj)
pid = None
raise click.BadParameter("cannot detect object type for %s" % obj)
if obj == '-':
if obj == "-":
content = sys.stdin.buffer.read()
pid = pid_of_file_content(content)
elif obj_type in ['content', 'directory']:
swhid = str(swhid_of_file_content(content))
elif obj_type in ["content", "directory"]:
path = obj.encode(sys.getfilesystemencoding())
if follow_symlinks and os.path.islink(obj):
path = os.path.realpath(obj)
if obj_type == 'content':
pid = pid_of_file(path)
elif obj_type == 'directory':
pid = pid_of_dir(path)
elif obj_type == 'origin':
pid = pid_of_origin(obj)
elif obj_type == 'snapshot':
pid = pid_of_git_repo(obj)
if obj_type == "content":
swhid = str(swhid_of_file(path))
elif obj_type == "directory":
swhid = str(swhid_of_dir(path, exclude_patterns))
elif obj_type == "origin":
swhid = str(swhid_of_origin(obj))
elif obj_type == "snapshot":
swhid = str(swhid_of_git_repo(obj))
else: # shouldn't happen, due to option validation
raise click.BadParameter('invalid object type: ' + obj_type)
raise click.BadParameter("invalid object type: " + obj_type)
# note: we return original obj instead of path here, to preserve user-given
# file name in output
return (obj, pid)
@click.command(context_settings=CONTEXT_SETTINGS)
@click.option('--dereference/--no-dereference', 'follow_symlinks',
default=True,
help='follow (or not) symlinks for OBJECTS passed as arguments '
+ '(default: follow)')
@click.option('--filename/--no-filename', 'show_filename', default=True,
help='show/hide file name (default: show)')
@click.option('--type', '-t', 'obj_type', default='auto',
type=click.Choice(['auto', 'content', 'directory', 'origin',
'snapshot']),
help='type of object to identify (default: auto)')
@click.option('--verify', '-v', metavar='PID', type=PidParamType(),
help='reference identifier to be compared with computed one')
@click.argument('objects', nargs=-1)
def identify(obj_type, verify, show_filename, follow_symlinks, objects):
"""Compute the Software Heritage persistent identifier (PID) for the given
return swhid
@cli_command(context_settings=CONTEXT_SETTINGS)
@click.option(
"--dereference/--no-dereference",
"follow_symlinks",
default=True,
help="follow (or not) symlinks for OBJECTS passed as arguments "
+ "(default: follow)",
)
@click.option(
"--filename/--no-filename",
"show_filename",
default=True,
help="show/hide file name (default: show)",
)
@click.option(
"--type",
"-t",
"obj_type",
default="auto",
type=click.Choice(["auto", "content", "directory", "origin", "snapshot"]),
help="type of object to identify (default: auto)",
)
@click.option(
"--exclude",
"-x",
"exclude_patterns",
metavar="PATTERN",
multiple=True,
help="Exclude directories using glob patterns \
(e.g., ``*.git`` to exclude all .git directories)",
)
@click.option(
"--verify",
"-v",
metavar="SWHID",
type=CoreSWHIDParamType(),
help="reference identifier to be compared with computed one",
)
@click.option(
"-r",
"--recursive",
is_flag=True,
help="compute SWHID recursively",
)
@click.argument("objects", nargs=-1, required=True)
def identify(
obj_type,
verify,
show_filename,
follow_symlinks,
objects,
exclude_patterns,
recursive,
):
"""Compute the Software Heritage persistent identifier (SWHID) for the given
source code object(s).
For more details about Software Heritage PIDs see:
For more details about SWHIDs see:
\b
https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
\b
Examples:
Tip: you can pass "-" to identify the content of standard input.
Examples::
\b
$ swh identify fork.c kmod.c sched/deadline.c
swh:1:cnt:2e391c754ae730bd2d8520c2ab497c403220c6e3 fork.c
swh:1:cnt:0277d1216f80ae1adeed84a686ed34c9b2931fc2 kmod.c
swh:1:cnt:57b939c81bce5d06fa587df8915f05affbe22b82 sched/deadline.c
\b
$ swh identify --no-filename /usr/src/linux/kernel/
swh:1:dir:f9f858a48d663b3809c9e2f336412717496202ab
\b
$ git clone --mirror https://forge.softwareheritage.org/source/helloworld.git
$ swh identify --type snapshot helloworld.git/
swh:1:snp:510aa88bdc517345d258c1fc2babcd0e1f905e93 helloworld.git
""" # NoQA # overlong lines in shell examples are fine
if not objects:
objects = ['-']
"""
if verify and len(objects) != 1:
raise click.BadParameter('verification requires a single object')
from functools import partial
import logging
results = map(partial(identify_object, obj_type, follow_symlinks), objects)
if exclude_patterns:
exclude_patterns = set(pattern.encode() for pattern in exclude_patterns)
if verify:
pid = next(results)[1]
if verify == pid:
click.echo('PID match: %s' % pid)
sys.exit(0)
else:
click.echo('PID mismatch: %s != %s' % (verify, pid))
sys.exit(1)
else:
for (obj, pid) in results:
msg = pid
if show_filename:
msg = '%s\t%s' % (pid, os.fsdecode(obj))
if verify and len(objects) != 1:
raise click.BadParameter("verification requires a single object")
if recursive and not os.path.isdir(objects[0]):
recursive = False
logging.warning("recursive option disabled, input is not a directory object")
if recursive:
if verify:
raise click.BadParameter(
"verification of recursive object identification is not supported"
)
if obj_type not in ("auto", "directory"):
raise click.BadParameter(
"recursive identification is supported only for directories"
)
path = os.fsencode(objects[0])
dir_obj = model_of_dir(path, exclude_patterns)
for sub_obj in dir_obj.iter_tree():
path_name = "path" if "path" in sub_obj.data.keys() else "data"
path = os.fsdecode(sub_obj.data[path_name])
swhid = str(sub_obj.swhid())
msg = f"{swhid}\t{path}" if show_filename else f"{swhid}"
click.echo(msg)
else:
results = zip(
objects,
map(
partial(identify_object, obj_type, follow_symlinks, exclude_patterns),
objects,
),
)
if verify:
swhid = next(results)[1]
if str(verify) == swhid:
click.echo("SWHID match: %s" % swhid)
sys.exit(0)
else:
click.echo("SWHID mismatch: %s != %s" % (verify, swhid))
sys.exit(1)
else:
for obj, swhid in results:
msg = swhid
if show_filename:
msg = "%s\t%s" % (swhid, os.fsdecode(obj))
click.echo(msg)
if __name__ == '__main__':
if __name__ == "__main__":
identify()
# Copyright (C) 2020-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Utility data structures."""

from __future__ import annotations
from collections.abc import Mapping
import copy
from typing import Dict, Generic, Iterable, Optional, Tuple, TypeVar, Union
KT = TypeVar("KT")
VT = TypeVar("VT")
class ImmutableDict(Mapping, Generic[KT, VT]):
"""A frozen dictionary.
This class behaves like a dictionary, but internally stores objects in a tuple,
so it is both immutable and hashable."""
_data: Dict[KT, VT]
def __init__(
self,
data: Union[Iterable[Tuple[KT, VT]], ImmutableDict[KT, VT], Dict[KT, VT]] = {},
):
if isinstance(data, dict):
self._data = data
elif isinstance(data, ImmutableDict):
self._data = data._data
else:
self._data = {k: v for k, v in data}
@property
def data(self):
return tuple(self._data.items())
def __repr__(self):
return f"ImmutableDict({dict(self.data)!r})"
def __getitem__(self, key):
return self._data[key]
def __iter__(self):
for k, v in self.data:
yield k
def __len__(self):
return len(self._data)
def items(self):
yield from self.data
def __hash__(self):
return hash(tuple(sorted(self.data)))
def copy_pop(self, popped_key) -> Tuple[Optional[VT], ImmutableDict[KT, VT]]:
"""Returns a copy of this ImmutableDict without the given key,
as well as the value associated to the key."""
new_items = copy.deepcopy(self._data)
popped_value: Optional[VT] = new_items.pop(popped_key, None)
return (popped_value, ImmutableDict(new_items))
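# Illustrative usage (not part of the diff) of ImmutableDict:
d = ImmutableDict({"a": 1, "b": 2})
assert d["a"] == 1 and len(d) == 2
# hashing sorts the items, so insertion order does not matter:
assert hash(d) == hash(ImmutableDict((("b", 2), ("a", 1))))
# copy_pop returns the popped value plus a new ImmutableDict without the key:
value, rest = d.copy_pop("a")
assert value == 1 and "a" not in rest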
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Primitives for finding unknown content efficiently."""
from __future__ import annotations
from collections import namedtuple
import itertools
import logging
import random
from typing import (
Any,
Callable,
Iterable,
List,
Mapping,
NamedTuple,
Optional,
Set,
Union,
)
from typing_extensions import Protocol, runtime_checkable
from .from_disk import model
from .model import Sha1Git
logger = logging.getLogger(__name__)
# Maximum number of entries to sample from the undecided set of directory entries
SAMPLE_SIZE = 1000
# Sets of sha1 of contents, skipped contents and directories respectively
Sample: NamedTuple = namedtuple(
"Sample", ["contents", "skipped_contents", "directories"]
)
@runtime_checkable
class ArchiveDiscoveryInterface(Protocol):
"""Interface used in discovery code to abstract over ways of connecting to
the SWH archive (direct storage, web API, etc.) for all methods needed by
discovery algorithms."""
contents: List[model.Content]
skipped_contents: List[model.SkippedContent]
directories: List[model.Directory]
def __init__(
self,
contents: List[model.Content],
skipped_contents: List[model.SkippedContent],
directories: List[model.Directory],
) -> None:
self.contents = contents
self.skipped_contents = skipped_contents
self.directories = directories
def content_missing(self, contents: List[Sha1Git]) -> Iterable[Sha1Git]:
"""List content missing from the archive by sha1"""
def skipped_content_missing(
self, skipped_contents: List[Sha1Git]
) -> Iterable[Sha1Git]:
"""List skipped content missing from the archive by sha1"""
def directory_missing(self, directories: List[Sha1Git]) -> Iterable[Sha1Git]:
"""List directories missing from the archive by sha1"""
class BaseDiscoveryGraph:
"""Creates the base structures and methods needed for discovery algorithms.
Subclasses should override ``get_sample`` to affect how the discovery is made.
The `update_info_callback` is an optional argument that will get called for
each new piece of information we get. The callback arguments are `(content,
known)`.
- content: the relevant model.Content object,
- known: a boolean, True if the file is known to the archive, False otherwise.
"""
def __init__(
self,
contents,
skipped_contents,
directories,
update_info_callback: Optional[Callable[[Any, bool], None]] = None,
):
self._all_contents: Mapping[
Sha1Git, Union[model.Content, model.SkippedContent]
] = {}
self._undecided_directories: Set[Sha1Git] = set()
self._children: Mapping[Sha1Git, Set[Sha1Git]] = {}
self._parents: Mapping[model.DirectoryEntry, Set[Any]] = {}
self.undecided: Set[Sha1Git] = set()
for content in itertools.chain(contents, skipped_contents):
self.undecided.add(content.sha1_git)
self._all_contents[content.sha1_git] = content
for directory in directories:
self.undecided.add(directory.id)
self._undecided_directories.add(directory.id)
self._children[directory.id] = {c.target for c in directory.entries}
for child in directory.entries:
self._parents.setdefault(child.target, set()).add(directory.id)
self.undecided |= self._undecided_directories
self.known: Set[Sha1Git] = set()
self.unknown: Set[Sha1Git] = set()
self._update_info_callback = update_info_callback
self._sha1_to_obj = {}
for content in itertools.chain(contents, skipped_contents):
self._sha1_to_obj[content.sha1_git] = content
for directory in directories:
self._sha1_to_obj[directory.id] = directory
def mark_known(self, entries: Iterable[Sha1Git]):
"""Mark ``entries`` and those they imply as known in the SWH archive"""
self._mark_entries(entries, self._children, self.known)
def mark_unknown(self, entries: Iterable[Sha1Git]):
"""Mark ``entries`` and those they imply as unknown in the SWH archive"""
self._mark_entries(entries, self._parents, self.unknown)
def _mark_entries(
self,
entries: Iterable[Sha1Git],
transitive_mapping: Mapping[Any, Any],
target_set: Set[Any],
):
"""Use Merkle graph properties to mark a directory entry as known or unknown.
If an entry is known, then all of its descendants are known. If it's
unknown, then all of its ancestors are unknown.
- ``entries``: directory entries to mark along with their ancestors/descendants
where applicable.
- ``transitive_mapping``: mapping from an entry to the next entries to mark
in the hierarchy, if any.
- ``target_set``: set where marked entries will be added.
"""
callback = self._update_info_callback
to_process = set(entries)
while to_process:
current = to_process.pop()
target_set.add(current)
new = current in self.undecided
self.undecided.discard(current)
self._undecided_directories.discard(current)
next_entries = transitive_mapping.get(current, set()) & self.undecided
to_process.update(next_entries)
if new and callback is not None:
obj = self._sha1_to_obj[current]
callback(obj, current in self.known)
def get_sample(
self,
) -> Sample:
"""Return a three-tuple of samples from the undecided sets of contents,
skipped contents and directories respectively.
These samples will be queried against the storage which will tell us
which are known."""
raise NotImplementedError()
def do_query(self, archive: ArchiveDiscoveryInterface, sample: Sample) -> None:
"""Given a three-tuple of samples, ask the archive which are known or
unknown and mark them as such."""
methods = (
archive.content_missing,
archive.skipped_content_missing,
archive.directory_missing,
)
for sample_per_type, method in zip(sample, methods):
if not sample_per_type:
continue
known = set(sample_per_type)
unknown = set(method(list(sample_per_type)))
known -= unknown
self.mark_known(known)
self.mark_unknown(unknown)
class RandomDirSamplingDiscoveryGraph(BaseDiscoveryGraph):
"""Use a random sampling using only directories.
This allows us to find a statistically good spread of entries in the graph
with a smaller population than using all types of entries. When no
directories remain, only contents or skipped contents (if any) are left
undecided; we send them directly to the storage since they should be few
and their structure flat."""
def get_sample(self) -> Sample:
if self._undecided_directories:
if len(self._undecided_directories) <= SAMPLE_SIZE:
return Sample(
contents=set(),
skipped_contents=set(),
directories=set(self._undecided_directories),
)
sample = random.sample(tuple(self._undecided_directories), SAMPLE_SIZE)
directories = {o for o in sample}
return Sample(
contents=set(), skipped_contents=set(), directories=directories
)
contents = set()
skipped_contents = set()
for sha1 in self.undecided:
obj = self._all_contents[sha1]
obj_type = obj.object_type
if obj_type == model.Content.object_type:
contents.add(sha1)
elif obj_type == model.SkippedContent.object_type:
skipped_contents.add(sha1)
else:
raise TypeError(f"Unexpected object type {obj_type}")
return Sample(
contents=contents, skipped_contents=skipped_contents, directories=set()
)
def filter_known_objects(
archive: ArchiveDiscoveryInterface,
update_info_callback: Optional[Callable[[Any, bool], None]] = None,
):
"""Filter ``archive``'s ``contents``, ``skipped_contents`` and ``directories``
to only return those that are unknown to the SWH archive using a discovery
algorithm.
The `update_info_callback` is an optional argument that will get called for
each new piece of information we get. The callback arguments are `(content,
known)`.
- content: the relevant model.Content object,
- known: a boolean, True if the file is known to the archive, False otherwise.
"""
contents = archive.contents
skipped_contents = archive.skipped_contents
directories = archive.directories
contents_count = len(contents)
skipped_contents_count = len(skipped_contents)
directories_count = len(directories)
graph = RandomDirSamplingDiscoveryGraph(
contents,
skipped_contents,
directories,
update_info_callback=update_info_callback,
)
while graph.undecided:
sample = graph.get_sample()
graph.do_query(archive, sample)
contents = [c for c in contents if c.sha1_git in graph.unknown]
skipped_contents = [c for c in skipped_contents if c.sha1_git in graph.unknown]
directories = [c for c in directories if c.id in graph.unknown]
logger.debug(
"Filtered out %d contents, %d skipped contents and %d directories",
contents_count - len(contents),
skipped_contents_count - len(skipped_contents),
directories_count - len(directories),
)
return (contents, skipped_contents, directories)
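# Illustrative end-to-end sketch (not part of the diff), combining the
# discovery API with the FakeArchive stub above; the path is hypothetical.
from swh.model.from_disk import Directory, iter_directory

contents, skipped_contents, directories = iter_directory(
    Directory.from_disk(path=b"/tmp/project")
)
archive = FakeArchive(contents, skipped_contents, directories, known=set())
# Against an empty archive, everything comes back as unknown:
assert filter_known_objects(archive) == (contents, skipped_contents, directories)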
@@ -33,11 +33,12 @@
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
NON_FIELD_ERRORS = '__all__'
NON_FIELD_ERRORS = "__all__"
class ValidationError(Exception):
"""An error while validating data."""
def __init__(self, message, code=None, params=None):
"""
The `message` argument can be a single error, a list of errors, or a
@@ -54,16 +55,15 @@ class ValidationError(Exception):
message = message[0]
if isinstance(message, ValidationError):
if hasattr(message, 'error_dict'):
if hasattr(message, "error_dict"):
message = message.error_dict
# PY2 has a `message` property which is always there so we can't
# duck-type on it. It was introduced in Python 2.5 and already
# deprecated in Python 2.6.
elif not hasattr(message, 'message'):
elif not hasattr(message, "message"):
message = message.error_list
else:
message, code, params = (message.message, message.code,
message.params)
message, code, params = (message.message, message.code, message.params)
if isinstance(message, dict):
self.error_dict = {}
@@ -78,9 +78,8 @@ class ValidationError(Exception):
# Normalize plain strings to instances of ValidationError.
if not isinstance(message, ValidationError):
message = ValidationError(message)
if hasattr(message, 'error_dict'):
self.error_list.extend(sum(message.error_dict.values(),
[]))
if hasattr(message, "error_dict"):
self.error_list.extend(sum(message.error_dict.values(), []))
else:
self.error_list.extend(message.error_list)
@@ -94,18 +93,18 @@ class ValidationError(Exception):
def message_dict(self):
# Trigger an AttributeError if this ValidationError
# doesn't have an error_dict.
getattr(self, 'error_dict')
getattr(self, "error_dict")
return dict(self)
@property
def messages(self):
if hasattr(self, 'error_dict'):
if hasattr(self, "error_dict"):
return sum(dict(self).values(), [])
return list(self)
def update_error_dict(self, error_dict):
if hasattr(self, 'error_dict'):
if hasattr(self, "error_dict"):
for field, error_list in self.error_dict.items():
error_dict.setdefault(field, []).extend(error_list)
else:
@@ -113,7 +112,7 @@ class ValidationError(Exception):
return error_dict
def __iter__(self):
if hasattr(self, 'error_dict'):
if hasattr(self, "error_dict"):
for field, errors in self.error_dict.items():
yield field, list(ValidationError(errors))
else:
@@ -124,9 +123,13 @@ class ValidationError(Exception):
yield message
def __str__(self):
if hasattr(self, 'error_dict'):
if hasattr(self, "error_dict"):
return repr(dict(self))
return repr(list(self))
def __repr__(self):
return 'ValidationError(%s)' % self
return "ValidationError(%s)" % self
class InvalidDirectoryPath(Exception):
pass
@@ -6,8 +6,13 @@
# We do our imports here but we don't use them, so flake8 complains
# flake8: noqa
from .simple import (validate_type, validate_int, validate_str, validate_bytes,
validate_datetime, validate_enum)
from .hashes import (validate_sha1, validate_sha1_git, validate_sha256)
from .compound import (validate_against_schema, validate_all_keys,
validate_any_key)
from .compound import validate_against_schema, validate_all_keys, validate_any_key
from .hashes import validate_sha1, validate_sha1_git, validate_sha256
from .simple import (
validate_bytes,
validate_datetime,
validate_enum,
validate_int,
validate_str,
validate_type,
)
@@ -6,7 +6,7 @@
from collections import defaultdict
import itertools
from ..exceptions import ValidationError, NON_FIELD_ERRORS
from ..exceptions import NON_FIELD_ERRORS, ValidationError
def validate_against_schema(model, schema, value):
@@ -26,19 +26,19 @@ def validate_against_schema(model, schema, value):
if not isinstance(value, dict):
raise ValidationError(
'Unexpected type %(type)s for %(model)s, expected dict',
"Unexpected type %(type)s for %(model)s, expected dict",
params={
'model': model,
'type': value.__class__.__name__,
"model": model,
"type": value.__class__.__name__,
},
code='model-unexpected-type',
code="model-unexpected-type",
)
errors = defaultdict(list)
for key, (mandatory, validators) in itertools.chain(
((k, v) for k, v in schema.items() if k != NON_FIELD_ERRORS),
[(NON_FIELD_ERRORS, (False, schema.get(NON_FIELD_ERRORS, [])))]
[(NON_FIELD_ERRORS, (False, schema.get(NON_FIELD_ERRORS, [])))],
):
if not validators:
continue
@@ -54,9 +54,9 @@ def validate_against_schema(model, schema, value):
if mandatory:
errors[key].append(
ValidationError(
'Field %(field)s is mandatory',
params={'field': key},
code='model-field-mandatory',
"Field %(field)s is mandatory",
params={"field": key},
code="model-field-mandatory",
)
)
@@ -74,19 +74,21 @@ def validate_against_schema(model, schema, value):
else:
if not valid:
errdata = {
'validator': validator.__name__,
"validator": validator.__name__,
}
if key == NON_FIELD_ERRORS:
errmsg = 'Validation of model %(model)s failed in ' \
'%(validator)s'
errdata['model'] = model
errcode = 'model-validation-failed'
errmsg = (
"Validation of model %(model)s failed in " "%(validator)s"
)
errdata["model"] = model
errcode = "model-validation-failed"
else:
errmsg = 'Validation of field %(field)s failed in ' \
'%(validator)s'
errdata['field'] = key
errcode = 'field-validation-failed'
errmsg = (
"Validation of field %(field)s failed in " "%(validator)s"
)
errdata["field"] = key
errcode = "field-validation-failed"
errors[key].append(
ValidationError(errmsg, params=errdata, code=errcode)
@@ -102,11 +104,11 @@ def validate_all_keys(value, keys):
"""Validate that all the given keys are present in value"""
missing_keys = set(keys) - set(value)
if missing_keys:
missing_fields = ', '.join(sorted(missing_keys))
missing_fields = ", ".join(sorted(missing_keys))
raise ValidationError(
'Missing mandatory fields %(missing_fields)s',
params={'missing_fields': missing_fields},
code='missing-mandatory-field'
"Missing mandatory fields %(missing_fields)s",
params={"missing_fields": missing_fields},
code="missing-mandatory-field",
)
return True
@@ -116,11 +118,11 @@ def validate_any_key(value, keys):
"""Validate that any of the given keys is present in value"""
present_keys = set(keys) & set(value)
if not present_keys:
missing_fields = ', '.join(sorted(keys))
missing_fields = ", ".join(sorted(keys))
raise ValidationError(
'Must contain one of the alternative fields %(missing_fields)s',
params={'missing_fields': missing_fields},
code='missing-alternative-field',
"Must contain one of the alternative fields %(missing_fields)s",
params={"missing_fields": missing_fields},
code="missing-alternative-field",
)
return True
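# Illustrative check (not part of the diff); the "origin" schema below is
# hypothetical. Each schema value is a (mandatory, [validators]) pair.
from swh.model.fields import validate_against_schema, validate_str

schema = {"url": (True, [validate_str])}
assert validate_against_schema("origin", schema, {"url": "https://example.org"})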
@@ -4,6 +4,7 @@
# See top-level LICENSE file for more information
import string
from ..exceptions import ValidationError
@@ -22,22 +23,22 @@ def validate_hash(value, hash_type):
"""
hash_lengths = {
'sha1': 20,
'sha1_git': 20,
'sha256': 32,
"sha1": 20,
"sha1_git": 20,
"sha256": 32,
}
hex_digits = set(string.hexdigits)
if hash_type not in hash_lengths:
raise ValidationError(
'Unexpected hash type %(hash_type)s, expected one of'
' %(hash_types)s',
"Unexpected hash type %(hash_type)s, expected one of" " %(hash_types)s",
params={
'hash_type': hash_type,
'hash_types': ', '.join(sorted(hash_lengths)),
"hash_type": hash_type,
"hash_types": ", ".join(sorted(hash_lengths)),
},
code='unexpected-hash-type')
code="unexpected-hash-type",
)
if isinstance(value, str):
errors = []
@@ -48,10 +49,10 @@ def validate_hash(value, hash_type):
"Unexpected characters `%(unexpected_chars)s' for hash "
"type %(hash_type)s",
params={
'unexpected_chars': ', '.join(sorted(extra_chars)),
'hash_type': hash_type,
"unexpected_chars": ", ".join(sorted(extra_chars)),
"hash_type": hash_type,
},
code='unexpected-hash-contents',
code="unexpected-hash-contents",
)
)
@@ -60,14 +61,14 @@ def validate_hash(value, hash_type):
if length != expected_length:
errors.append(
ValidationError(
'Unexpected length %(length)d for hash type '
'%(hash_type)s, expected %(expected_length)d',
"Unexpected length %(length)d for hash type "
"%(hash_type)s, expected %(expected_length)d",
params={
'length': length,
'expected_length': expected_length,
'hash_type': hash_type,
"length": length,
"expected_length": expected_length,
"hash_type": hash_type,
},
code='unexpected-hash-length',
code="unexpected-hash-length",
)
)
@@ -81,37 +82,37 @@ def validate_hash(value, hash_type):
expected_length = hash_lengths[hash_type]
if length != expected_length:
raise ValidationError(
'Unexpected length %(length)d for hash type '
'%(hash_type)s, expected %(expected_length)d',
"Unexpected length %(length)d for hash type "
"%(hash_type)s, expected %(expected_length)d",
params={
'length': length,
'expected_length': expected_length,
'hash_type': hash_type,
"length": length,
"expected_length": expected_length,
"hash_type": hash_type,
},
code='unexpected-hash-length',
code="unexpected-hash-length",
)
return True
raise ValidationError(
'Unexpected type %(type)s for hash, expected str or bytes',
"Unexpected type %(type)s for hash, expected str or bytes",
params={
'type': value.__class__.__name__,
"type": value.__class__.__name__,
},
code='unexpected-hash-value-type',
code="unexpected-hash-value-type",
)
def validate_sha1(sha1):
"""Validate that sha1 is a valid sha1 hash"""
return validate_hash(sha1, 'sha1')
return validate_hash(sha1, "sha1")
def validate_sha1_git(sha1_git):
"""Validate that sha1_git is a valid sha1_git hash"""
return validate_hash(sha1_git, 'sha1_git')
return validate_hash(sha1_git, "sha1_git")
def validate_sha256(sha256):
"""Validate that sha256 is a valid sha256 hash"""
return validate_hash(sha256, 'sha256')
return validate_hash(sha256, "sha256")
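# Illustrative checks (not part of the diff) of the hash validators:
from swh.model.exceptions import ValidationError
from swh.model.fields import validate_sha1

assert validate_sha1("da39a3ee5e6b4b0d3255bfef95601890afd80709")  # 40 hex chars
assert validate_sha1(b"\x00" * 20)  # or 20 raw bytes
try:
    validate_sha1("deadbeef")  # wrong length
except ValidationError as e:
    print(e.messages)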
@@ -13,16 +13,16 @@ def validate_type(value, type):
"""Validate that value is an integer"""
if not isinstance(value, type):
if isinstance(type, tuple):
typestr = 'one of %s' % ', '.join(typ.__name__ for typ in type)
typestr = "one of %s" % ", ".join(typ.__name__ for typ in type)
else:
typestr = type.__name__
raise ValidationError(
'Unexpected type %(type)s, expected %(expected_type)s',
"Unexpected type %(type)s, expected %(expected_type)s",
params={
'type': value.__class__.__name__,
'expected_type': typestr,
"type": value.__class__.__name__,
"expected_type": typestr,
},
code='unexpected-type'
code="unexpected-type",
)
return True
@@ -54,10 +54,12 @@ def validate_datetime(value):
errors.append(e)
if isinstance(value, datetime.datetime) and value.tzinfo is None:
errors.append(ValidationError(
'Datetimes must be timezone-aware in swh',
code='datetime-without-tzinfo',
))
errors.append(
ValidationError(
"Datetimes must be timezone-aware in swh",
code="datetime-without-tzinfo",
)
)
if errors:
raise ValidationError(errors)
@@ -69,12 +71,12 @@ def validate_enum(value, expected_values):
"""Validate that value is contained in expected_values"""
if value not in expected_values:
raise ValidationError(
'Unexpected value %(value)s, expected one of %(expected_values)s',
"Unexpected value %(value)s, expected one of %(expected_values)s",
params={
'value': value,
'expected_values': ', '.join(sorted(expected_values)),
"value": value,
"expected_values": ", ".join(sorted(expected_values)),
},
code='unexpected-value',
code="unexpected-value",
)
return True
# Copyright (C) 2017-2018 The Software Heritage developers
# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Conversion from filesystem tree to SWH objects.
This module allows reading a tree of directories and files from a local
filesystem, and converting them to in-memory data structures, which can then
be exported to SWH data model objects, as defined in :mod:`swh.model.model`.
"""
import enum
import fnmatch
import glob
import os
import re
import stat
from typing import (
Any,
Callable,
Dict,
Iterable,
Iterator,
List,
Optional,
Pattern,
Tuple,
Union,
cast,
)
import warnings
from typing import List
import attr
from deprecated import deprecated
from typing_extensions import Final
from .hashutil import MultiHash, HASH_BLOCK_SIZE
from . import model
from .exceptions import InvalidDirectoryPath
from .git_objects import directory_entry_sort_key
from .hashutil import MultiHash, hash_to_hex
from .merkle import MerkleLeaf, MerkleNode
from .identifiers import (
directory_identifier,
identifier_to_bytes as id_to_bytes,
identifier_to_str as id_to_str,
)
from .swhids import CoreSWHID
from .swhids import ObjectType as SWHIDType
class FromDiskType(model._StringCompatibleEnum):
"""Possible object types for "from disk" object."""
CONTENT = "content"
DIRECTORY = "directory"
def __eq__(self, other):
# stay compatible with legacy string comparison (for now)
if isinstance(other, str):
# note: we should issue deprecation warning at some point
return self.value == other
return super().__eq__(other)
def __str__(self):
# preserve interpolation property (for now)
return self.value
def __hash__(self):
# make sure we don't confuse dictionary key matching (for now)
return hash(str(self.value))
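# Illustrative checks (not part of the diff) of the compatibility shims above:
assert FromDiskType.CONTENT == "content"              # legacy string comparison
assert str(FromDiskType.DIRECTORY) == "directory"     # string interpolation
assert hash(FromDiskType.CONTENT) == hash("content")  # dict-key compatibility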
# A handful of other modules test for DiskBackedContent.object_type in
# conjunction with Content.object_type; give them a hand to let them handle
# compatibility in a smoother way.
#
# Remove this compatibility trick once these users have been migrated.
@deprecated(version="v6.13.0", reason="Use model.Content.object_type instead")
def DiskBackedContent(*args, **kwargs):
return model.Content(*args, **kwargs)
@attr.s(frozen=True, slots=True)
class DiskBackedData:
path = attr.ib(type=bytes)
def __call__(self) -> bytes:
with open(self.path, "rb") as fd:
return fd.read()
class DentryPerms(enum.IntEnum):
"""Admissible permissions for directory entries."""
content = 0o100644
"""Content"""
executable_content = 0o100755
@@ -68,8 +136,9 @@ class Content(MerkleLeaf):
computation.
"""
__slots__ = [] # type: List[str]
type = 'content'
__slots__: List[str] = []
object_type: Final = FromDiskType.CONTENT
@classmethod
def from_bytes(cls, *, mode, data):
@@ -80,19 +149,22 @@ class Content(MerkleLeaf):
data (bytes): raw contents of the file
"""
ret = MultiHash.from_data(data).digest()
ret['length'] = len(data)
ret['perms'] = mode_to_perms(mode)
ret['data'] = data
ret["length"] = len(data)
ret["perms"] = mode_to_perms(mode)
ret["data"] = data
ret["status"] = "visible"
return cls(ret)
@classmethod
def from_symlink(cls, *, path, mode):
"""Convert a symbolic link to a Software Heritage content entry"""
return cls.from_bytes(mode=mode, data=os.readlink(path))
content = cls.from_bytes(mode=mode, data=os.readlink(path))
content.data["path"] = path
return content
@classmethod
def from_file(cls, *, path, data=False, save_path=False):
def from_file(cls, *, path, max_content_length=None):
"""Compute the Software Heritage content entry corresponding to an
on-disk file.
@@ -101,56 +173,83 @@ class Content(MerkleLeaf):
- using the content as a directory entry in a directory
Args:
path (bytes): path to the file for which we're computing the
content entry
data (bool): add the file data to the entry
save_path (bool): add the file path to the entry
max_content_length (Optional[int]): if given, all contents larger
than this will be skipped.
"""
file_stat = os.lstat(path)
mode = file_stat.st_mode
length = file_stat.st_size
too_large = max_content_length is not None and length > max_content_length
if stat.S_ISLNK(mode):
# Symbolic link: return a file whose contents are the link target
if too_large:
# Unlike large contents, we can't stream symlinks to
# MultiHash, and we don't want to fit them in memory if
# they exceed max_content_length either.
# Thankfully, this should not happen for reasonable values of
# max_content_length because of OS/filesystem limitations,
# so let's just raise an error.
raise Exception(f"Symlink too large ({length} bytes)")
return cls.from_symlink(path=path, mode=mode)
elif not stat.S_ISREG(mode):
# not a regular file: return the empty file instead
return cls.from_bytes(mode=mode, data=b'')
return cls.from_bytes(mode=mode, data=b"")
length = file_stat.st_size
if not data:
ret = MultiHash.from_path(path).digest()
if too_large:
skip_reason = "Content too large"
else:
skip_reason = None
hashes = MultiHash.from_path(path).digest()
if skip_reason:
ret = {
**hashes,
"status": "absent",
"reason": skip_reason,
}
else:
h = MultiHash(length=length)
chunks = []
with open(path, 'rb') as fobj:
while True:
chunk = fobj.read(HASH_BLOCK_SIZE)
if not chunk:
break
h.update(chunk)
chunks.append(chunk)
ret = h.digest()
ret['data'] = b''.join(chunks)
if save_path:
ret['path'] = path
ret['perms'] = mode_to_perms(mode)
ret['length'] = length
ret = {
**hashes,
"status": "visible",
}
ret["path"] = path
ret["perms"] = mode_to_perms(mode)
ret["length"] = length
obj = cls(ret)
return obj
def swhid(self) -> CoreSWHID:
"""Return node identifier as a SWHID"""
return CoreSWHID(object_type=SWHIDType.CONTENT, object_id=self.hash)
def __repr__(self):
return 'Content(id=%s)' % id_to_str(self.hash)
return "Content(id=%s)" % hash_to_hex(self.hash)
def compute_hash(self):
return self.data['sha1_git']
def accept_all_directories(dirname, entries):
return self.data["sha1_git"]
def to_model(self) -> model.BaseContent:
"""Builds a `model.BaseContent` object based on this leaf."""
data = self.get_data().copy()
data.pop("perms", None)
path = data.pop("path", None)
if data["status"] == "absent":
return model.SkippedContent.from_dict(data)
elif "data" not in data:
data["get_data"] = DiskBackedData(path=path)
return model.Content.from_dict(data)
def accept_all_directories(
dirpath: bytes, dirname: bytes, entries: Optional[Iterable[Any]]
) -> bool:
"""Default filter for :func:`Directory.from_disk` accepting all
directories
@@ -158,10 +257,23 @@ def accept_all_directories(dirname, entries):
dirname (bytes): directory name
entries (list): directory entries
"""
warnings.warn(
"`accept_all_directories` is deprecated, use `accept_all_paths`",
DeprecationWarning,
)
return True
def accept_all_paths(
path: bytes, name: bytes, entries: Optional[Iterable[Any]]
) -> bool:
"""Default filter for :func:`Directory.from_disk` accepting all paths"""
return True
def ignore_empty_directories(dirname, entries):
def ignore_empty_directories(
dirpath: bytes, dirname: bytes, entries: Optional[Iterable[Any]]
) -> bool:
"""Filter for :func:`directory_to_objects` ignoring empty directories
Args:
@@ -170,6 +282,9 @@ def ignore_empty_directories(dirname, entries):
Returns:
True if the directory is not empty, false if the directory is empty
"""
if entries is None:
# Files are not ignored
return True
return bool(entries)
@@ -187,8 +302,16 @@ def ignore_named_directories(names, *, case_sensitive=True):
if not case_sensitive:
names = [name.lower() for name in names]
def named_filter(dirname, entries,
names=names, case_sensitive=case_sensitive):
def named_filter(
dirpath: str,
dirname: str,
entries: Iterable[Any],
names: Iterable[Any] = names,
case_sensitive: bool = case_sensitive,
):
if entries is None:
# Files are not ignored
return True
if case_sensitive:
return dirname not in names
else:
@@ -197,6 +320,102 @@ def ignore_named_directories(names, *, case_sensitive=True):
return named_filter
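# Illustrative usage (not part of the diff): ignore VCS metadata directories
# regardless of case (assuming the lowercased comparison in the elided branch).
vcs_filter = ignore_named_directories([b".git", b".hg"], case_sensitive=False)
assert vcs_filter(b"/repo", b"src", [b"main.py"])      # regular directory kept
assert not vcs_filter(b"/repo", b".GIT", [b"config"])  # ignored, case-insensitively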
# TODO: `extract_regex_objs` has been copied and adapted from `swh.scanner`.
# In the future `swh.scanner` should use the `swh.model` version and remove its own.
def extract_regex_objs(
root_path: bytes, patterns: Iterable[bytes]
) -> Iterator[Pattern[bytes]]:
"""Generates a regex object for each pattern given in input and checks if
the path is a subdirectory or relative to the root path.
Args:
root_path (bytes): path to the root directory
patterns (list of byte): shell patterns to match
Yields:
an SRE_Pattern object
"""
absolute_root_path = os.path.abspath(root_path)
for pattern in patterns:
if os.path.isabs(pattern):
pattern = os.path.relpath(pattern, root_path)
# python 3.10 has a `root_dir` argument for glob, but earlier versions do
# not, so we adjust the pattern instead
test_pattern = os.path.join(absolute_root_path, pattern)
for path in glob.glob(test_pattern):
if os.path.isabs(path) and not path.startswith(absolute_root_path):
error_msg = (
b'The path "' + path + b'" is not a subdirectory or relative '
b'to the root directory path: "' + root_path + b'"'
)
raise InvalidDirectoryPath(error_msg)
regex = fnmatch.translate((pattern.decode()))
yield re.compile(regex.encode())
def ignore_directories_patterns(root_path: bytes, patterns: Iterable[bytes]):
"""Filter for :func:`directory_to_objects` to ignore directories
matching certain patterns.
Args:
root_path (bytes): path of the root directory
patterns (list of bytes): patterns to ignore
Returns:
a directory filter for :func:`directory_to_objects`
"""
sre_patterns = set(extract_regex_objs(root_path, patterns))
def pattern_filter(
dirpath: bytes,
dirname: bytes,
entries: Iterable[Any],
patterns: Iterable[Any] = sre_patterns,
root_path: bytes = os.path.abspath(root_path),
):
full_path = os.path.abspath(os.path.join(dirpath, dirname))
relative_path = os.path.relpath(full_path, root_path)
return not any([pattern.match(relative_path) for pattern in patterns])
return pattern_filter
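# Illustrative usage (not part of the diff): build a from-disk tree while
# skipping directories that match shell patterns; the root path is hypothetical.
from swh.model.from_disk import Directory, ignore_directories_patterns

root = b"/tmp/project"
tree = Directory.from_disk(
    path=root,
    path_filter=ignore_directories_patterns(root, [b"*.git", b"*node_modules"]),
)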
def iter_directory(
directory: "Directory",
) -> Tuple[List[model.Content], List[model.SkippedContent], List[model.Directory]]:
"""Return the directory listing from a disk-memory directory instance.
Raises:
TypeError in case an unexpected object type is listed.
Returns:
Tuple of respectively iterable of content, skipped content and directories.
"""
contents: List[model.Content] = []
skipped_contents: List[model.SkippedContent] = []
directories: List[model.Directory] = []
for i_obj in directory.iter_tree():
if isinstance(i_obj, Directory):
directories.append(i_obj.to_model())
elif isinstance(i_obj, Content):
obj = i_obj.to_model()
if isinstance(obj, model.SkippedContent):
skipped_contents.append(obj)
else:
# FIXME: read the data from disk later (when the
# storage buffer is flushed).
#
c_obj = cast(model.Content, obj)
contents.append(c_obj.with_data())
else:
raise TypeError(f"Unexpected object type from disk: {obj}")
return contents, skipped_contents, directories
class Directory(MerkleNode):
"""Representation of a Software Heritage directory as a node in a Merkle Tree.
@@ -216,136 +435,259 @@ class Directory(MerkleNode):
the same method. This enables the efficient collection of updated nodes,
for instance when the client is applying diffs.
"""
__slots__ = ['__entries']
type = 'directory'
__slots__ = ["__entries", "__model_object"]
object_type: Final = FromDiskType.DIRECTORY
@classmethod
def from_disk(cls, *, path, data=False, save_path=False,
dir_filter=accept_all_directories):
def from_disk(
cls,
*,
path: bytes,
path_filter: Callable[
[bytes, bytes, Optional[List[bytes]]], bool
] = accept_all_paths,
max_content_length: Optional[int] = None,
progress_callback: Optional[Callable[[int], None]] = None,
) -> "Directory":
"""Compute the Software Heritage objects for a given directory tree
Args:
path (bytes): the directory to traverse
data (bool): whether to add the data to the content objects
save_path (bool): whether to add the path to the content objects
dir_filter (function): a filter to ignore some directories by
name or contents. Takes two arguments: dirname and entries, and
returns True if the directory should be added, False if the
directory should be ignored.
path_filter (function): a filter to ignore some paths.
Takes three arguments: `path`, `name` and `entries`.
`entries` is `None` for files, and a (possibly empty) list of names
for directories.
Returns True if the path should be added, False if the
path should be ignored.
max_content_length (Optional[int]): if given, all contents larger
than this will be skipped.
progress_callback (Optional function): if given, it is called with the
number of computed entries for each non-empty directory traversed.
"""
# The top path might have been specified with a trailing slash, which
# would confuse various code down the line.
#
# The bare '/' path must be left as is, however.
if 1 < len(path) and path[-1:] == b"/":
path = path[0:1] + path[1:].rstrip(b"/")
assert len(path) <= 1 or path[-1:] != b"/"
top_path = path
dirs = {}
top_path_prefix_size = len(top_path) + 1
dirs: Dict[bytes, Directory] = {}
dirs[top_path] = cls({"name": os.path.basename(top_path), "path": top_path})
filtered = []
to_visit = [path]
while to_visit:
root = to_visit.pop()
path, name = os.path.split(root)
with os.scandir(root) as it:
entries_list = list(it)
if root != top_path and not path_filter(
path, name, [entry.path for entry in entries_list]
):
# we should not traverse the current directory, so stop right now,
# but also mark it as removed (for later cleanup)
filtered.append(root)
continue
for root, dentries, fentries in os.walk(top_path, topdown=False):
entries = {}
# Join fentries and dentries in the same processing, as symbolic
# links to directories appear in dentries...
for name in fentries + dentries:
path = os.path.join(root, name)
if not os.path.isdir(path) or os.path.islink(path):
content = Content.from_file(path=path, data=data,
save_path=save_path)
entries[name] = content
for entry in entries_list:
if not entry.is_dir(follow_symlinks=False):
if not path_filter(root, entry.name, None):
continue
content = Content.from_file(
path=entry.path, max_content_length=max_content_length
)
entries[entry.name] = content
else:
if dir_filter(name, dirs[path].entries):
entries[name] = dirs[path]
dirs[root] = cls({'name': os.path.basename(root)})
entries[entry.name] = cls({"name": entry.name, "path": entry.path})
dirs[entry.path] = entries[entry.name]
to_visit.append(entry.path)
dirs[root].update(entries)
return dirs[top_path]
if progress_callback is not None:
if len(entries) > 0:
progress_callback(len(entries))
top_dir = dirs[top_path]
for path in reversed(filtered):
path = path[top_path_prefix_size:]
del top_dir[path]
# a bit sad but now we have to traverse the gathered tree structure to
# filter it again (e.g. for the ignore_empty_directory filter to work
# recursively)
todo: List[Tuple[bytes, Directory]] = [(b"", top_dir)]
traversal = []
while todo:
cpath, cdir = todo.pop(0)
traversal.append(cpath)
for dirname, subdir in cdir.items():
if subdir.object_type == FromDiskType.DIRECTORY:
spath = cpath + b"/" + dirname
todo.append((spath, subdir))
for dirpath in reversed(traversal):
node = top_dir[dirpath]
assert node.object_type == FromDiskType.DIRECTORY
path, name = os.path.split(dirpath)
if dirpath and not path_filter(path, name, list(node.keys())):
# should be filtered
del top_dir[dirpath]
top_dir.update_hash(force=True)
return top_dir
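# Illustrative usage (not part of the diff) of the new keyword arguments; the
# path, the size cap and the callback are hypothetical.
def on_progress(n_entries: int) -> None:
    print(f"hashed {n_entries} directory entries")

tree = Directory.from_disk(
    path=b"/tmp/project",
    max_content_length=10 * 1024 * 1024,  # record larger contents as skipped
    progress_callback=on_progress,
)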
def __init__(self, data=None):
super().__init__(data=data)
self.__entries = None
self.__model_object = None
# note: this override could probably be done by parametrizing the
# MerkleNode type, but that is a much bigger rework than the series
# introducing this change.
def iter_tree(self, dedup=True) -> Iterator[Union["Directory", "Content"]]:
"""Yields all children nodes, recursively. Common nodes are deduplicated
by default (deduplication can be turned off by setting the given argument
'dedup' to False).
"""
tree = super().iter_tree(dedup=dedup)
yield from cast(Iterator[Union["Directory", "Content"]], tree)
def invalidate_hash(self):
self.__entries = None
self.__model_object = None
super().invalidate_hash()
@staticmethod
def child_to_directory_entry(name, child):
if isinstance(child, Directory):
if child.object_type == FromDiskType.DIRECTORY:
return {
'type': 'dir',
'perms': DentryPerms.directory,
'target': child.hash,
'name': name,
"type": "dir",
"perms": DentryPerms.directory,
"target": child.hash,
"name": name,
}
elif isinstance(child, Content):
elif child.object_type == FromDiskType.CONTENT:
return {
'type': 'file',
'perms': child.data['perms'],
'target': child.hash,
'name': name,
"type": "file",
"perms": child.data["perms"],
"target": child.hash,
"name": name,
}
else:
raise ValueError('unknown child')
raise ValueError(f"unknown child {child}")
def get_data(self, **kwargs):
return {
'id': self.hash,
'entries': self.entries,
"id": self.hash,
"entries": self.entries,
}
@property
def entries(self):
"""Child nodes, sorted by name in the same way
:func:`swh.model.git_objects.directory_git_object` does."""
if self.__entries is None:
self.__entries = [
self.child_to_directory_entry(name, child)
for name, child in self.items()
]
self.__entries = sorted(
(
self.child_to_directory_entry(name, child)
for name, child in self.items()
),
key=directory_entry_sort_key,
)
return self.__entries
def swhid(self) -> CoreSWHID:
"""Return node identifier as a SWHID"""
return CoreSWHID(object_type=SWHIDType.DIRECTORY, object_id=self.hash)
def compute_hash(self):
return id_to_bytes(directory_identifier({'entries': self.entries}))
return self.to_model().id
def to_model(self) -> model.Directory:
"""Builds a `model.Directory` object based on this node;
ignoring its children."""
if self.__model_object is None:
DirectoryEntry = model.DirectoryEntry
entries = []
for name, child in self.items():
if child.object_type == FromDiskType.DIRECTORY:
e = DirectoryEntry(
type="dir",
perms=DentryPerms.directory,
target=child.hash,
name=name,
)
elif child.object_type == FromDiskType.CONTENT:
e = DirectoryEntry(
type="file",
perms=child.data["perms"],
target=child.hash,
name=name,
)
else:
raise ValueError(f"unknown child {child}")
entries.append(e)
entries.sort(key=directory_entry_sort_key)
self.__model_object = model.Directory(entries=tuple(entries))
return self.__model_object
def __getitem__(self, key):
if not isinstance(key, bytes):
raise ValueError('Can only get a bytes from Directory')
raise ValueError("Can only get a bytes from Directory")
# Convenience shortcut
if key == b'':
if key == b"":
return self
if b'/' not in key:
if b"/" not in key:
return super().__getitem__(key)
else:
key1, key2 = key.split(b'/', 1)
key1, key2 = key.split(b"/", 1)
return self.__getitem__(key1)[key2]
def __setitem__(self, key, value):
if not isinstance(key, bytes):
raise ValueError('Can only set a bytes Directory entry')
raise ValueError("Can only set a bytes Directory entry")
if not isinstance(value, (Content, Directory)):
raise ValueError('Can only set a Directory entry to a Content or '
'Directory')
raise ValueError(
"Can only set a Directory entry to a Content or " "Directory"
)
if key == b'':
raise ValueError('Directory entry must have a name')
if b'\x00' in key:
raise ValueError('Directory entry name must not contain nul bytes')
if key == b"":
raise ValueError("Directory entry must have a name")
if b"\x00" in key:
raise ValueError("Directory entry name must not contain nul bytes")
if b'/' not in key:
if b"/" not in key:
return super().__setitem__(key, value)
else:
key1, key2 = key.rsplit(b'/', 1)
key1, key2 = key.rsplit(b"/", 1)
self[key1].__setitem__(key2, value)
def __delitem__(self, key):
if not isinstance(key, bytes):
raise ValueError('Can only delete a bytes Directory entry')
raise ValueError("Can only delete a bytes Directory entry")
if b'/' not in key:
if b"/" not in key:
super().__delitem__(key)
else:
key1, key2 = key.rsplit(b'/', 1)
key1, key2 = key.rsplit(b"/", 1)
del self[key1][key2]
def __contains__(self, key):
if b"/" not in key:
return super().__contains__(key)
else:
key1, key2 = key.split(b"/", 1)
return super().__contains__(key1) and self[key1].__contains__(key2)
def __repr__(self):
return 'Directory(id=%s, entries=[%s])' % (
id_to_str(self.hash),
', '.join(str(entry) for entry in self),
return "Directory(id=%s, entries=[%s])" % (
hash_to_hex(self.hash),
", ".join(str(entry) for entry in self),
)
# Copyright (C) 2015-2019 The Software Heritage developers
# Copyright (C) 2015-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import binascii
import datetime
import hashlib
"""
Converts SWH model objects to git(-like) objects
from functools import lru_cache
from typing import Any, Dict, NamedTuple
Most of the functions in this module take as argument an object from
:mod:`swh.model.model`, and format it like a git object.
from .exceptions import ValidationError
from .fields.hashes import validate_sha1
from .hashutil import hash_git_data, hash_to_hex, MultiHash
They are the inverse functions of those in :mod:`swh.loader.git.converters`,
but with extensions, as SWH's model is a superset of Git's:
* extensions of existing types (e.g. revision/commit and release/tag dates
can be expressed with precision up to milliseconds, to support formatting
Mercurial objects)
* new types, for SWH's specific needs (:class:`swh.model.model.RawExtrinsicMetadata`
and :class:`swh.model.model.ExtID`)
* support for somewhat corrupted git objects that we need to reproduce
ORIGIN = 'origin'
SNAPSHOT = 'snapshot'
REVISION = 'revision'
RELEASE = 'release'
DIRECTORY = 'directory'
CONTENT = 'content'
This is used for two purposes:
PID_NAMESPACE = 'swh'
PID_VERSION = 1
PID_TYPES = ['ori', 'snp', 'rel', 'rev', 'dir', 'cnt']
PID_SEP = ':'
PID_CTXT_SEP = ';'
* Format manifests that can be hashed to produce :ref:`intrinsic identifiers
<persistent-identifiers>`
* Write git objects to reproduce git repositories that were ingested in the archive.
"""
@lru_cache()
def identifier_to_bytes(identifier):
"""Convert a text identifier to bytes.
from __future__ import annotations
import datetime
from functools import lru_cache
from typing import Dict, Iterable, List, Optional, Tuple, Union, cast
import warnings
from . import model
from .collections import ImmutableDict
from .hashutil import git_object_header, hash_to_bytehex
Args:
identifier: an identifier, either a 40-char hexadecimal string or a
bytes object of length 20
Returns:
The length 20 bytestring corresponding to the given identifier
Raises:
ValueError: if the identifier is of an unexpected type or length.
def content_git_object(content: model.Content) -> bytes:
"""Formats a content as a git blob.
A content's identifier is the blob sha1 à la git of its data.
"""
content = cast(model.Content, content)
if isinstance(identifier, bytes):
if len(identifier) != 20:
raise ValueError(
'Wrong length for bytes identifier %s, expected 20' %
len(identifier))
return identifier
if content.data is None:
raise model.MissingData("Content data is None, cannot format.")
if isinstance(identifier, str):
if len(identifier) != 40:
raise ValueError(
'Wrong length for str identifier %s, expected 40' %
len(identifier))
return bytes.fromhex(identifier)
return git_object_header("blob", len(content.data)) + content.data
raise ValueError('Wrong type for identifier %s, expected bytes or str' %
identifier.__class__.__name__)
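As a quick check of the blob format (sample data made up; `Content.from_data` is the model helper that computes the hashes):

```
from swh.model.model import Content

content = Content.from_data(b"hello\n")
assert content_git_object(content) == b"blob 6\x00hello\n"
# the sha1 of this manifest is content.sha1_git, the same ID `git hash-object`
# prints for a file with this data
```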
def directory_entry_sort_key(entry: model.DirectoryEntry):
"""The sorting key for tree entries"""
if isinstance(entry, dict):
type_ = entry["type"]
name = entry["name"]
else:
type_ = entry.type
name = entry.name
@lru_cache()
def identifier_to_str(identifier):
"""Convert an identifier to an hexadecimal string.
if type_ == "dir":
return name + b"/"
else:
return name
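For example (entries are made up), a directory named `foo` compares as `foo/`, so it sorts after a sibling file `foo.bar`, matching git's tree ordering:

```
entries = [
    {"type": "dir", "name": b"foo", "perms": 0o040000, "target": b"\x00" * 20},
    {"type": "file", "name": b"foo.bar", "perms": 0o100644, "target": b"\x00" * 20},
]
names = [e["name"] for e in sorted(entries, key=directory_entry_sort_key)]
assert names == [b"foo.bar", b"foo"]  # b"foo.bar" < b"foo/" since 0x2e < 0x2f
```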
Args:
identifier: an identifier, either a 40-char hexadecimal string or a
bytes object of length 20
Returns:
The length 40 string corresponding to the given identifier, hex encoded
@lru_cache()
def _perms_to_bytes(perms):
"""Convert the perms value to its canonical bytes representation"""
oc = oct(perms)[2:]
return oc.encode("ascii")
Raises:
ValueError: if the identifier is of an unexpected type or length.
"""
if isinstance(identifier, str):
if len(identifier) != 40:
raise ValueError(
'Wrong length for str identifier %s, expected 40' %
len(identifier))
return identifier
def escape_newlines(snippet):
"""Escape the newlines present in snippet according to git rules.
if isinstance(identifier, bytes):
if len(identifier) != 20:
raise ValueError(
'Wrong length for bytes identifier %s, expected 20' %
len(identifier))
return binascii.hexlify(identifier).decode()
New lines in git manifests are escaped by indenting the next line by one
space.
raise ValueError('Wrong type for identifier %s, expected bytes or str' %
identifier.__class__.__name__)
"""
if b"\n" in snippet:
return b"\n ".join(snippet.split(b"\n"))
else:
return snippet
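A couple of illustrative cases:

```
assert escape_newlines(b"summary\n\ndetails") == b"summary\n \n details"
assert escape_newlines(b"no newline") == b"no newline"
```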
def content_identifier(content):
"""Return the intrinsic identifier for a content.
A content's identifier is the sha1, sha1_git and sha256 checksums of its
data.
def format_date(date: model.Timestamp) -> bytes:
"""Convert a date object into an UTC timestamp encoded as ascii bytes.
Args:
content: a content conforming to the Software Heritage schema
Git stores timestamps as an integer number of seconds since the UNIX epoch.
Returns:
A dictionary with all the hashes for the data
However, Software Heritage stores timestamps as an integer number of
microseconds (postgres type "timestamp with time zone").
Raises:
KeyError: if the content doesn't have a data member.
Therefore, we print timestamps with no microseconds as integers, and
timestamps with microseconds as floating point values. We elide the
trailing zeroes from microsecond values, to "future-proof" our
representation if we ever need more precision in timestamps.
"""
if isinstance(date, dict):
# For backward compatibility
date = model.Timestamp.from_dict(date)
return MultiHash.from_data(content['data']).digest()
if not date.microseconds:
return str(date.seconds).encode()
else:
float_value = "%d.%06d" % (date.seconds, date.microseconds)
return float_value.rstrip("0").encode()
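For instance (timestamps made up):

```
from swh.model.model import Timestamp

assert format_date(Timestamp(seconds=1234567890, microseconds=0)) == b"1234567890"
# trailing zeroes of the fractional part are elided:
assert (
    format_date(Timestamp(seconds=1234567890, microseconds=250000))
    == b"1234567890.25"
)
```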
def _sort_key(entry):
"""The sorting key for tree entries"""
if entry['type'] == 'dir':
return entry['name'] + b'/'
else:
return entry['name']
def normalize_timestamp(time_representation):
"""Normalize a time representation for processing by Software Heritage
This function supports a numeric timestamp (representing a number of
seconds since the UNIX epoch, 1970-01-01 at 00:00 UTC), a
:obj:`datetime.datetime` object (with timezone information), or a
normalized Software Heritage time representation (idempotency).
@lru_cache()
def _perms_to_bytes(perms):
"""Convert the perms value to its bytes representation"""
oc = oct(perms)[2:]
return oc.encode('ascii')
Args:
time_representation: the representation of a timestamp
Returns:
dict: a normalized dictionary with three keys:
def escape_newlines(snippet):
"""Escape the newlines present in snippet according to git rules.
- timestamp: a dict with two optional keys:
New lines in git manifests are escaped by indenting the next line by one
space.
- seconds: the integral number of seconds since the UNIX epoch
- microseconds: the integral number of microseconds
- offset: the timezone offset as a number of minutes relative to
UTC
- negative_utc: a boolean representing whether the offset is -0000
when offset = 0.
"""
if b'\n' in snippet:
return b'\n '.join(snippet.split(b'\n'))
if time_representation is None:
return None
else:
return snippet
return model.TimestampWithTimezone.from_dict(time_representation).to_dict()
def directory_identifier(directory):
"""Return the intrinsic identifier for a directory.
def directory_git_object(directory: Union[Dict, model.Directory]) -> bytes:
"""Formats a directory as a git tree.
A directory's identifier is the tree sha1 à la git of a directory listing,
using the following algorithm, which is equivalent to the git algorithm for
@@ -179,229 +179,123 @@ def directory_identifier(directory):
(Note that there is no separator between entries)
"""
if isinstance(directory, dict):
# For backward compatibility
warnings.warn(
"directory_git_object's argument should be a swh.model.model.Directory "
"object.",
DeprecationWarning,
stacklevel=2,
)
directory = model.Directory.from_dict(directory)
directory = cast(model.Directory, directory)
components = []
for entry in sorted(directory['entries'], key=_sort_key):
components.extend([
_perms_to_bytes(entry['perms']),
b'\x20',
entry['name'],
b'\x00',
identifier_to_bytes(entry['target']),
])
return identifier_to_str(hash_git_data(b''.join(components), 'tree'))
def format_date(date):
"""Convert a date object into an UTC timestamp encoded as ascii bytes.
Git stores timestamps as an integer number of seconds since the UNIX epoch.
However, Software Heritage stores timestamps as an integer number of
microseconds (postgres type "timestamp with time zone").
Therefore, we print timestamps with no microseconds as integers, and
timestamps with microseconds as floating point values. We elide the
trailing zeroes from microsecond values, to "future-proof" our
representation if we ever need more precision in timestamps.
"""
if not isinstance(date, dict):
raise ValueError('format_date only supports dicts, %r received' % date)
seconds = date.get('seconds', 0)
microseconds = date.get('microseconds', 0)
if not microseconds:
return str(seconds).encode()
else:
float_value = ('%d.%06d' % (seconds, microseconds))
return float_value.rstrip('0').encode()
for entry in sorted(directory.entries, key=directory_entry_sort_key):
components.extend(
[
_perms_to_bytes(entry.perms),
b"\x20",
entry.name,
b"\x00",
entry.target,
]
)
@lru_cache()
def format_offset(offset, negative_utc=None):
"""Convert an integer number of minutes into an offset representation.
return format_git_object_from_parts("tree", components)
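A minimal sketch of the resulting tree manifest (entry name and target are placeholders); the directory's intrinsic identifier is the sha1 of this byte string:

```
import hashlib

from swh.model.model import Directory, DirectoryEntry

d = Directory(
    entries=(
        DirectoryEntry(name=b"README", type="file", perms=0o100644, target=b"\x01" * 20),
    )
)
manifest = directory_git_object(d)
assert manifest == b"tree 34\x00" b"100644 README\x00" + b"\x01" * 20
assert hashlib.sha1(manifest).digest() == d.id
```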
The offset representation is [+-]hhmm where:
- hh is the number of hours;
- mm is the number of minutes.
def format_git_object_from_headers(
git_type: str,
headers: Iterable[Tuple[bytes, bytes]],
message: Optional[bytes] = None,
) -> bytes:
"""Format a git_object comprised of a git header and a manifest,
which is itself a sequence of `headers`, and an optional `message`.
A null offset is represented as +0000.
"""
if offset < 0 or offset == 0 and negative_utc:
sign = '-'
else:
sign = '+'
The git_object format, compatible with the git format for tag and commit
objects, is as follows:
hours = abs(offset) // 60
minutes = abs(offset) % 60
- for each `key`, `value` in `headers`, emit:
t = '%s%02d%02d' % (sign, hours, minutes)
return t.encode()
- the `key`, literally
- an ascii space (``\\x20``)
- the `value`, with newlines escaped using :func:`escape_newlines`,
- an ascii newline (``\\x0a``)
- if the `message` is not None, emit:
def normalize_timestamp(time_representation):
"""Normalize a time representation for processing by Software Heritage
This function supports a numeric timestamp (representing a number of
seconds since the UNIX epoch, 1970-01-01 at 00:00 UTC), a
:obj:`datetime.datetime` object (with timezone information), or a
normalized Software Heritage time representation (idempotency).
- an ascii newline (``\\x0a``)
- the `message`, literally
Args:
time_representation: the representation of a timestamp
headers: a sequence of key/value headers stored in the manifest;
message: an optional message used to trail the manifest.
Returns:
dict: a normalized dictionary with three keys:
- timestamp: a dict with two optional keys:
- seconds: the integral number of seconds since the UNIX epoch
- microseconds: the integral number of microseconds
- offset: the timezone offset as a number of minutes relative to
UTC
- negative_utc: a boolean representing whether the offset is -0000
when offset = 0.
the formatted git_object as bytes
"""
entries: List[bytes] = []
if time_representation is None:
return None
for key, value in headers:
entries.extend((key, b" ", escape_newlines(value), b"\n"))
negative_utc = False
if message is not None:
entries.extend((b"\n", message))
if isinstance(time_representation, dict):
ts = time_representation['timestamp']
if isinstance(ts, dict):
seconds = ts.get('seconds', 0)
microseconds = ts.get('microseconds', 0)
elif isinstance(ts, int):
seconds = ts
microseconds = 0
else:
raise ValueError(
'normalize_timestamp received non-integer timestamp member:'
' %r' % ts)
offset = time_representation['offset']
if 'negative_utc' in time_representation:
negative_utc = time_representation['negative_utc']
elif isinstance(time_representation, datetime.datetime):
seconds = int(time_representation.timestamp())
microseconds = time_representation.microsecond
utcoffset = time_representation.utcoffset()
if utcoffset is None:
raise ValueError(
'normalize_timestamp received datetime without timezone: %s' %
time_representation)
# utcoffset is an integer number of minutes
seconds_offset = utcoffset.total_seconds()
offset = int(seconds_offset) // 60
elif isinstance(time_representation, int):
seconds = time_representation
microseconds = 0
offset = 0
else:
raise ValueError(
'normalize_timestamp received non-integer timestamp:'
' %r' % time_representation)
return format_git_object_from_parts(git_type, entries)
return {
'timestamp': {
'seconds': seconds,
'microseconds': microseconds,
},
'offset': offset,
'negative_utc': negative_utc,
}
def format_git_object_from_parts(git_type: str, parts: Iterable[bytes]) -> bytes:
"""Similar to :func:`format_git_object_from_headers`, but for manifests made of
a flat list of entries, instead of key-value + message, ie. trees and snapshots."""
concatenated_parts = b"".join(parts)
def format_author(author):
"""Format the specification of an author.
header = git_object_header(git_type, len(concatenated_parts))
return header + concatenated_parts
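Putting the two together, a sketch with made-up header values; the blank line separating headers from the message comes from the `message` branch above:

```
git_object = format_git_object_from_headers(
    "tag",
    [(b"object", b"0" * 40), (b"type", b"commit"), (b"tag", b"v1.0")],
    message=b"release v1.0\n",
)
assert git_object == (
    b"tag 83\x00"  # git header: type, payload length, NUL
    b"object " + b"0" * 40 + b"\n"
    b"type commit\n"
    b"tag v1.0\n"
    b"\n"
    b"release v1.0\n"
)
```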
An author is either a byte string (passed unchanged), or a dict with three
keys, fullname, name and email.
If the fullname exists, return it; if it doesn't, we construct a fullname
using the following heuristics: if the name value is None, we return the
email in angle brackets, else, we return the name, a space, and the email
in angle brackets.
def format_author_data(
author: model.Person, date_offset: Optional[model.TimestampWithTimezone]
) -> bytes:
"""Format authorship data according to git standards.
"""
if isinstance(author, bytes) or author is None:
return author
Git authorship data has two components:
if 'fullname' in author:
return author['fullname']
- an author specification, usually a name and email, but in practice an
arbitrary bytestring
- optionally, a timestamp with a UTC offset specification
ret = []
if author['name'] is not None:
ret.append(author['name'])
if author['email'] is not None:
ret.append(b''.join([b'<', author['email'], b'>']))
The authorship data is formatted thus::
return b' '.join(ret)
def format_author_line(header, author, date_offset):
"""Format a an author line according to git standards.
An author line has three components:
- a header, describing the type of author (author, committer, tagger)
- a name and email, which is an arbitrary bytestring
- optionally, a timestamp with UTC offset specification
The author line is formatted thus::
`header` `name and email`[ `timestamp` `utc_offset`]
`name and email`[ `timestamp` `utc_offset`]
The timestamp is encoded as a (decimal) number of seconds since the UNIX
epoch (1970-01-01 at 00:00 UTC). As an extension to the git format, we
support fractional timestamps, using a dot as the separator for the decimal
part.
The utc offset is a number of minutes encoded as '[+-]HHMM'. Note some
The utc offset is a number of minutes encoded as '[+-]HHMM'. Note that some
tools can pass a negative offset corresponding to the UTC timezone
('-0000'), which is valid and is encoded as such.
For convenience, this function returns the whole line with its trailing
newline.
Args:
header: the header of the author line (one of 'author', 'committer',
'tagger')
author: an author specification (dict with two bytes values: name and
email, or byte value)
date_offset: a normalized date/time representation as returned by
:func:`normalize_timestamp`.
Returns:
the newline-terminated byte string containing the author line
the byte string containing the authorship data
"""
ret = [header.encode(), b' ', escape_newlines(format_author(author))]
date_offset = normalize_timestamp(date_offset)
ret = [author.fullname]
if date_offset is not None:
date_f = format_date(date_offset['timestamp'])
offset_f = format_offset(date_offset['offset'],
date_offset['negative_utc'])
date_f = format_date(date_offset.timestamp)
ret.extend([b' ', date_f, b' ', offset_f])
ret.extend([b" ", date_f, b" ", date_offset.offset_bytes])
ret.append(b'\n')
return b''.join(ret)
return b"".join(ret)
def revision_identifier(revision):
"""Return the intrinsic identifier for a revision.
def revision_git_object(revision: Union[Dict, model.Revision]) -> bytes:
"""Formats a revision as a git tree.
The fields used for the revision identifier computation are:
@@ -411,7 +305,7 @@ def revision_identifier(revision):
- author_date
- committer
- committer_date
- metadata -> extra_headers
- extra_headers or metadata -> extra_headers
- message
A revision's identifier is the 'git'-checksum of a commit manifest
@@ -432,7 +326,7 @@ def revision_identifier(revision):
The directory identifier is the ascii representation of its hexadecimal
encoding.
Author and committer are formatted with the :func:`format_author` function.
Author and committer are formatted using the :attr:`Person.fullname` attribute only.
Dates are formatted with the :func:`format_offset` function.
Extra headers are an ordered list of [key, value] pairs. Keys are strings
@@ -450,79 +344,82 @@ def revision_identifier(revision):
type.
"""
components = [
b'tree ', identifier_to_str(revision['directory']).encode(), b'\n',
]
for parent in revision['parents']:
if parent:
components.extend([
b'parent ', identifier_to_str(parent).encode(), b'\n',
])
if isinstance(revision, dict):
# For backward compatibility
warnings.warn(
"revision_git_object's argument should be a swh.model.model.Revision "
"object.",
DeprecationWarning,
stacklevel=2,
)
revision = model.Revision.from_dict(revision)
revision = cast(model.Revision, revision)
components.extend([
format_author_line('author', revision['author'], revision['date']),
format_author_line('committer', revision['committer'],
revision['committer_date']),
])
headers = [(b"tree", hash_to_bytehex(revision.directory))]
for parent in revision.parents:
if parent:
headers.append((b"parent", hash_to_bytehex(parent)))
if revision.author is not None:
headers.append((b"author", format_author_data(revision.author, revision.date)))
if revision.committer is not None:
headers.append(
(
b"committer",
format_author_data(revision.committer, revision.committer_date),
)
)
# Handle extra headers
metadata = revision.get('metadata')
if not metadata:
metadata = {}
metadata = revision.metadata or ImmutableDict()
extra_headers = revision.extra_headers or ()
if not extra_headers and "extra_headers" in metadata:
extra_headers = metadata["extra_headers"]
for key, value in metadata.get('extra_headers', []):
headers.extend(extra_headers)
# Integer values: decimal representation
if isinstance(value, int):
value = str(value).encode('utf-8')
return format_git_object_from_headers("commit", headers, revision.message)
# Unicode string values: utf-8 encoding
if isinstance(value, str):
value = value.encode('utf-8')
# encode the key to utf-8
components.extend([key.encode('utf-8'), b' ',
escape_newlines(value), b'\n'])
if revision['message'] is not None:
components.extend([b'\n', revision['message']])
commit_raw = b''.join(components)
return identifier_to_str(hash_git_data(commit_raw, 'commit'))
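A sketch of the resulting commit manifest for a minimal parentless revision (all values are placeholders):

```
from swh.model.model import (
    Person,
    Revision,
    RevisionType,
    TimestampWithTimezone,
)

author = Person.from_fullname(b"Ada <ada@example.org>")
date = TimestampWithTimezone.from_dict(
    {"timestamp": {"seconds": 1234567890, "microseconds": 0},
     "offset": 0, "negative_utc": False}
)
rev = Revision(
    directory=b"\x01" * 20,
    parents=(),
    author=author,
    committer=author,
    date=date,
    committer_date=date,
    type=RevisionType.GIT,
    message=b"initial commit\n",
    synthetic=False,
)
# strip the "commit <len>\x00" header to see the bare manifest:
assert revision_git_object(rev).split(b"\x00", 1)[1] == (
    b"tree " + b"01" * 20 + b"\n"
    b"author Ada <ada@example.org> 1234567890 +0000\n"
    b"committer Ada <ada@example.org> 1234567890 +0000\n"
    b"\n"
    b"initial commit\n"
)
```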
def target_type_to_git(target_type):
def target_type_to_git(target_type: model.ReleaseTargetType) -> bytes:
"""Convert a software heritage target type to a git object type"""
return {
'content': b'blob',
'directory': b'tree',
'revision': b'commit',
'release': b'tag',
'snapshot': b'refs'
model.ReleaseTargetType.CONTENT: b"blob",
model.ReleaseTargetType.DIRECTORY: b"tree",
model.ReleaseTargetType.REVISION: b"commit",
model.ReleaseTargetType.RELEASE: b"tag",
model.ReleaseTargetType.SNAPSHOT: b"refs",
}[target_type]
def release_identifier(release):
"""Return the intrinsic identifier for a release."""
components = [
b'object ', identifier_to_str(release['target']).encode(), b'\n',
b'type ', target_type_to_git(release['target_type']), b'\n',
b'tag ', release['name'], b'\n',
]
if 'author' in release and release['author']:
components.append(
format_author_line('tagger', release['author'], release['date'])
def release_git_object(release: Union[Dict, model.Release]) -> bytes:
if isinstance(release, dict):
# For backward compatibility
warnings.warn(
"release_git_object's argument should be a swh.model.model.Directory "
"object.",
DeprecationWarning,
stacklevel=2,
)
release = model.Release.from_dict(release)
release = cast(model.Release, release)
headers = [
(b"object", hash_to_bytehex(release.target)),
(b"type", target_type_to_git(release.target_type)),
(b"tag", release.name),
]
if release['message'] is not None:
components.extend([b'\n', release['message']])
if release.author is not None:
headers.append((b"tagger", format_author_data(release.author, release.date)))
return identifier_to_str(hash_git_data(b''.join(components), 'tag'))
return format_git_object_from_headers("tag", headers, release.message)
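Similarly, a sketch for a bare tag object (placeholder target, no tagger):

```
from swh.model.model import Release, ReleaseTargetType

rel = Release(
    name=b"v1.0",
    target=b"\x02" * 20,
    target_type=ReleaseTargetType.REVISION,
    message=b"first release\n",
    author=None,
    date=None,
    synthetic=False,
)
assert release_git_object(rel).split(b"\x00", 1)[1] == (
    b"object " + b"02" * 20 + b"\n"
    b"type commit\n"
    b"tag v1.0\n"
    b"\n"
    b"first release\n"
)
```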
def snapshot_identifier(snapshot, *, ignore_unresolved=False):
"""Return the intrinsic identifier for a snapshot.
def snapshot_git_object(
snapshot: Union[Dict, model.Snapshot], *, ignore_unresolved: bool = False
) -> bytes:
"""Formats a snapshot as a git-like object.
Snapshots are a set of named branches, which are pointers to objects at any
level of the Software Heritage DAG.
@@ -567,242 +464,209 @@ def snapshot_identifier(snapshot, *, ignore_unresolved=False):
length but are length-encoded to avoid ambiguity.
Args:
snapshot (dict): the snapshot of which to compute the identifier. A
single entry is needed, ``'branches'``, which is itself a :class:`dict`
mapping each branch to its target
ignore_unresolved (bool): if `True`, ignore unresolved branch aliases.
Returns:
str: the intrinsic identifier for `snapshot`
ignore_unresolved: if False (the default), raises an exception when
alias branches point to non-existing branches
"""
if isinstance(snapshot, dict):
# For backward compatibility
warnings.warn(
"snapshot_git_object's argument should be a swh.model.model.Snapshot "
"object.",
DeprecationWarning,
stacklevel=2,
)
snapshot = model.Snapshot.from_dict(snapshot)
snapshot = cast(model.Snapshot, snapshot)
unresolved = []
lines = []
for name, target in sorted(snapshot['branches'].items()):
for name, target in sorted(snapshot.branches.items()):
if not target:
target_type = b'dangling'
target_id = b''
elif target['target_type'] == 'alias':
target_type = b'alias'
target_id = target['target']
if target_id not in snapshot['branches'] or target_id == name:
target_type = b"dangling"
target_id = b""
elif target.target_type == model.SnapshotTargetType.ALIAS:
target_type = b"alias"
target_id = target.target
if target_id not in snapshot.branches or target_id == name:
unresolved.append((name, target_id))
else:
target_type = target['target_type'].encode()
target_id = identifier_to_bytes(target['target'])
lines.extend([
target_type, b'\x20', name, b'\x00',
('%d:' % len(target_id)).encode(), target_id,
])
target_type = target.target_type.value.encode()
target_id = target.target
lines.extend(
[
target_type,
b"\x20",
name,
b"\x00",
("%d:" % len(target_id)).encode(),
target_id,
]
)
if unresolved and not ignore_unresolved:
raise ValueError('Branch aliases unresolved: %s' %
', '.join('%s -> %s' % x for x in unresolved),
unresolved)
raise ValueError(
"Branch aliases unresolved: %s"
% ", ".join("%r -> %r" % x for x in unresolved),
unresolved,
)
return identifier_to_str(hash_git_data(b''.join(lines), 'snapshot'))
return format_git_object_from_parts("snapshot", lines)
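A sketch with one alias and one revision branch (placeholder target); note the length-prefixed targets and the name-sorted order:

```
from swh.model.model import Snapshot, SnapshotBranch, SnapshotTargetType

snp = Snapshot(
    branches={
        b"HEAD": SnapshotBranch(
            target=b"refs/heads/main", target_type=SnapshotTargetType.ALIAS
        ),
        b"refs/heads/main": SnapshotBranch(
            target=b"\x03" * 20, target_type=SnapshotTargetType.REVISION
        ),
    }
)
assert snapshot_git_object(snp).split(b"\x00", 1)[1] == (
    b"alias HEAD\x00" b"15:refs/heads/main"
    b"revision refs/heads/main\x00" b"20:" + b"\x03" * 20
)
```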
def origin_identifier(origin):
"""Return the intrinsic identifier for an origin.
def raw_extrinsic_metadata_git_object(
metadata: Union[Dict, model.RawExtrinsicMetadata],
) -> bytes:
"""Formats RawExtrinsicMetadata as a git-like object.
An origin's identifier is the sha1 checksum of the entire origin URL
A raw_extrinsic_metadata identifier is a salted sha1 (using the git
hashing algorithm with the ``raw_extrinsic_metadata`` object type) of
a manifest following the format::
"""
return hashlib.sha1(origin['url'].encode('ascii')).hexdigest()
_object_type_map = {
ORIGIN: {
'short_name': 'ori',
'key_id': 'id'
},
SNAPSHOT: {
'short_name': 'snp',
'key_id': 'id'
},
RELEASE: {
'short_name': 'rel',
'key_id': 'id'
},
REVISION: {
'short_name': 'rev',
'key_id': 'id'
},
DIRECTORY: {
'short_name': 'dir',
'key_id': 'id'
},
CONTENT: {
'short_name': 'cnt',
'key_id': 'sha1_git'
}
}
_PersistentId = NamedTuple(
'PersistentId', [
('namespace', str),
('scheme_version', int),
('object_type', str),
('object_id', str),
('metadata', Dict[str, Any]),
])
class PersistentId(_PersistentId):
"""
Named tuple holding the relevant info associated to a Software Heritage
persistent identifier.
target $ExtendedSwhid
discovery_date $Timestamp
authority $StrWithoutSpaces $IRI
fetcher $Str $Version
format $StrWithoutSpaces
origin $IRI <- optional
visit $IntInDecimal <- optional
snapshot $CoreSwhid <- optional
release $CoreSwhid <- optional
revision $CoreSwhid <- optional
path $Bytes <- optional
directory $CoreSwhid <- optional
Args:
namespace (str): the namespace of the identifier, defaults to 'swh'
scheme_version (int): the scheme version of the identifier,
defaults to 1
object_type (str): the type of object the identifier points to,
either 'content', 'directory', 'release', 'revision' or 'snapshot'
object_id (dict/bytes/str): object's dict representation or
object identifier
metadata (dict): optional dict filled with metadata related to
pointed object
Raises:
swh.model.exceptions.ValidationError: In case of invalid object type
or id
Once created, it contains the following attributes:
Attributes:
namespace (str): the namespace of the identifier
scheme_version (int): the scheme version of the identifier
object_type (str): the type of object the identifier points to
object_id (str): hexadecimal representation of the object hash
metadata (dict): metadata related to the pointed object
To get the raw persistent identifier string from an instance of
this named tuple, use the :func:`str` function::
pid = PersistentId(
object_type='content',
object_id='8ff44f081d43176474b267de5451f2c2e88089d0'
)
pid_str = str(pid)
# 'swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0'
"""
__slots__ = ()
def __new__(cls, namespace=PID_NAMESPACE, scheme_version=PID_VERSION,
object_type='', object_id='', metadata={}):
o = _object_type_map.get(object_type)
if not o:
raise ValidationError('Wrong input: Supported types are %s' % (
list(_object_type_map.keys())))
if namespace != PID_NAMESPACE:
raise ValidationError(
"Wrong format: only supported namespace is '%s'"
% PID_NAMESPACE)
if scheme_version != PID_VERSION:
raise ValidationError(
'Wrong format: only supported version is %d' % PID_VERSION)
# internal swh representation resolution
if isinstance(object_id, dict):
object_id = object_id[o['key_id']]
validate_sha1(object_id) # can raise if invalid hash
object_id = hash_to_hex(object_id)
return super(cls, PersistentId).__new__(
cls, namespace, scheme_version, object_type, object_id, metadata)
def __str__(self):
o = _object_type_map.get(self.object_type)
pid = PID_SEP.join([self.namespace, str(self.scheme_version),
o['short_name'], self.object_id])
if self.metadata:
for k, v in self.metadata.items():
pid += '%s%s=%s' % (PID_CTXT_SEP, k, v)
return pid
def persistent_identifier(object_type, object_id, scheme_version=1,
metadata={}):
"""Compute persistent identifier (stable over time) as per
documentation.
Documentation:
https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html # noqa
$MetadataBytes
Args:
object_type (str): object's type, either 'content', 'directory',
'release', 'revision' or 'snapshot'
object_id (dict/bytes/str): object's dict representation or object
identifier
scheme_version (int): persistent identifier scheme version,
defaults to 1
metadata (dict): metadata related to the pointed object
Raises:
swh.model.exceptions.ValidationError: In case of invalid object type
or id
$IRI must be RFC 3987 IRIs (so they may contain newlines, which are escaped
as described below)
Returns:
str: the persistent identifier
$StrWithoutSpaces and $Version are ASCII strings, and may not contain spaces.
$Str is a UTF-8 string.
$CoreSwhid are core SWHIDs, as defined in :ref:`persistent-identifiers`.
$ExtendedSwhid is a core SWHID, with extra types allowed ('ori' for
origins and 'emd' for raw extrinsic metadata)
$Timestamp is a decimal representation of the rounded-down integer number of
seconds since the UNIX epoch (1970-01-01 00:00:00 UTC),
with no leading '0' (unless the timestamp value is zero) and no timezone.
It may be negative by prefixing it with a '-', which must not be followed
by a '0'.
Newlines in $Bytes, $Str, and $IRI are escaped as with other git fields,
i.e. by adding a space after them.
"""
pid = PersistentId(scheme_version=scheme_version, object_type=object_type,
object_id=object_id, metadata=metadata)
return str(pid)
if isinstance(metadata, dict):
# For backward compatibility
warnings.warn(
"raw_extrinsic_metadata_git_object's argument should be a "
"swh.model.model.RawExtrinsicMetadata object.",
DeprecationWarning,
stacklevel=2,
)
metadata = model.RawExtrinsicMetadata.from_dict(metadata)
metadata = cast(model.RawExtrinsicMetadata, metadata)
# equivalent to using math.floor(dt.timestamp()) to round down,
# as int(dt.timestamp()) rounds toward zero,
# which would map two distinct seconds onto the 0 timestamp.
#
# This should never be an issue in practice as Software Heritage didn't
# start collecting metadata before 2015.
timestamp = (
metadata.discovery_date.astimezone(datetime.timezone.utc)
.replace(microsecond=0)
.timestamp()
)
assert timestamp.is_integer()
headers = [
(b"target", str(metadata.target).encode()),
(b"discovery_date", str(int(timestamp)).encode("ascii")),
(
b"authority",
f"{metadata.authority.type.value} {metadata.authority.url}".encode(),
),
(
b"fetcher",
f"{metadata.fetcher.name} {metadata.fetcher.version}".encode(),
),
(b"format", metadata.format.encode()),
]
for key in (
"origin",
"visit",
"snapshot",
"release",
"revision",
"path",
"directory",
):
if getattr(metadata, key, None) is not None:
value: bytes
if key == "path":
value = getattr(metadata, key)
else:
value = str(getattr(metadata, key)).encode()
def parse_persistent_identifier(persistent_id):
"""Parse swh's :ref:`persistent-identifiers` scheme.
headers.append((key.encode("ascii"), value))
Args:
persistent_id (str): A persistent identifier
return format_git_object_from_headers(
"raw_extrinsic_metadata", headers, metadata.metadata
)
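A sketch assembling the manifest above from placeholder values:

```
import datetime

from swh.model.model import (
    MetadataAuthority,
    MetadataAuthorityType,
    MetadataFetcher,
    RawExtrinsicMetadata,
)
from swh.model.swhids import ExtendedSWHID

emd = RawExtrinsicMetadata(
    target=ExtendedSWHID.from_string("swh:1:cnt:" + "00" * 20),
    discovery_date=datetime.datetime(2021, 1, 1, tzinfo=datetime.timezone.utc),
    authority=MetadataAuthority(
        type=MetadataAuthorityType.FORGE, url="https://example.org/"
    ),
    fetcher=MetadataFetcher(name="example-loader", version="1.0.0"),
    format="json",
    metadata=b'{"origin": "https://example.org/repo.git"}',
)
assert raw_extrinsic_metadata_git_object(emd).split(b"\x00", 1)[1] == (
    b"target swh:1:cnt:" + b"00" * 20 + b"\n"
    b"discovery_date 1609459200\n"
    b"authority forge https://example.org/\n"
    b"fetcher example-loader 1.0.0\n"
    b"format json\n"
    b"\n"
    b'{"origin": "https://example.org/repo.git"}'
)
```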
Raises:
swh.model.exceptions.ValidationError: in case of:
* missing mandatory values (4)
* invalid namespace supplied
* invalid version supplied
* invalid type supplied
* missing hash
* invalid hash identifier supplied
def extid_git_object(extid: model.ExtID) -> bytes:
"""Formats an extid as a gi-like object.
Returns:
PersistentId: a named tuple holding the parsing result
An ExtID identifier is a salted sha1 (using the git hashing algorithm with
the ``extid`` object type) of a manifest following the format:
```
extid_type $StrWithoutSpaces
[extid_version $Str]
extid $Bytes
target $CoreSwhid
[payload_type $StrWithoutSpaces]
[payload $ContentIdentifier]
```
$StrWithoutSpaces is an ASCII string, and may not contain spaces.
Newlines in $Bytes are escaped as with other git fields, i.e. by adding a
space after them.
The extid_version line is only generated if the version is non-zero.
The payload_type and payload lines are only generated if they are not
:const:`None`. $ContentIdentifier is the object ID of a content object.
"""
# <pid>;<contextual-information>
persistent_id_parts = persistent_id.split(PID_CTXT_SEP)
pid_data = persistent_id_parts.pop(0).split(':')
if len(pid_data) != 4:
raise ValidationError(
'Wrong format: There should be 4 mandatory values')
# Checking for parsing errors
_ns, _version, _type, _id = pid_data
pid_data[1] = int(pid_data[1])
for otype, data in _object_type_map.items():
if _type == data['short_name']:
pid_data[2] = otype
break
if not _id:
raise ValidationError(
'Wrong format: Identifier should be present')
persistent_id_metadata = {}
for part in persistent_id_parts:
try:
key, val = part.split('=')
persistent_id_metadata[key] = val
except Exception:
msg = 'Contextual data is badly formatted, form key=val expected'
raise ValidationError(msg)
pid_data.append(persistent_id_metadata)
return PersistentId(*pid_data)
headers = [
(b"extid_type", extid.extid_type.encode("ascii")),
]
extid_version = extid.extid_version
if extid_version != 0:
headers.append((b"extid_version", str(extid_version).encode("ascii")))
headers.extend(
[
(b"extid", extid.extid),
(b"target", str(extid.target).encode("ascii")),
]
)
payload_type = extid.payload_type
if payload_type is not None:
headers.append((b"payload_type", payload_type.encode("ascii")))
payload = extid.payload
if payload is not None:
headers.append((b"payload", payload))
return format_git_object_from_headers("extid", headers)
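A sketch with the default version and no payload, so only the three mandatory headers appear:

```
from swh.model.model import ExtID
from swh.model.swhids import CoreSWHID

extid = ExtID(
    extid_type="git",
    extid=b"\x04" * 20,
    target=CoreSWHID.from_string("swh:1:rev:" + "05" * 20),
)
assert extid_git_object(extid).split(b"\x00", 1)[1] == (
    b"extid_type git\n"
    b"extid " + b"\x04" * 20 + b"\n"
    b"target swh:1:rev:" + b"05" * 20 + b"\n"
)
```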
@@ -54,15 +54,16 @@ Basic usage examples:
import binascii
import functools
import hashlib
import os
from io import BytesIO
from typing import Callable, Dict
import os
from typing import Callable, Dict, Optional, Union
ALGORITHMS = set(['sha1', 'sha256', 'sha1_git', 'blake2s256', 'blake2b512'])
ALGORITHMS = set(
["sha1", "sha256", "sha1_git", "blake2s256", "blake2b512", "md5", "sha512"]
)
"""Hashing algorithms supported by this module"""
DEFAULT_ALGORITHMS = set(['sha1', 'sha256', 'sha1_git', 'blake2s256'])
DEFAULT_ALGORITHMS = set(["sha1", "sha256", "sha1_git", "blake2s256"])
"""Algorithms computed by default when calling the functions from this module.
Subset of :const:`ALGORITHMS`.
@@ -71,7 +72,7 @@ Subset of :const:`ALGORITHMS`.
HASH_BLOCK_SIZE = 32768
"""Block size for streaming hash computations made in this module"""
_blake2_hash_cache = {} # type: Dict[str, Callable]
_blake2_hash_cache: Dict[str, Callable] = {}
class MultiHash:
@@ -87,12 +88,13 @@ class MultiHash:
computed and returned.
"""
def __init__(self, hash_names=DEFAULT_ALGORITHMS, length=None):
self.state = {}
self.track_length = False
for name in hash_names:
if name == 'length':
self.state['length'] = 0
if name == "length":
self.state["length"] = 0
self.track_length = True
else:
self.state[name] = _new_hash(name, length)
@@ -116,7 +118,7 @@ class MultiHash:
@classmethod
def from_path(cls, path, hash_names=DEFAULT_ALGORITHMS):
length = os.path.getsize(path)
with open(path, 'rb') as f:
with open(path, "rb") as f:
ret = cls.from_file(f, hash_names=hash_names, length=length)
return ret
@@ -128,48 +130,45 @@ class MultiHash:
def update(self, chunk):
for name, h in self.state.items():
if name == 'length':
if name == "length":
continue
h.update(chunk)
if self.track_length:
self.state['length'] += len(chunk)
self.state["length"] += len(chunk)
def digest(self):
return {
name: h.digest() if name != 'length' else h
name: h.digest() if name != "length" else h
for name, h in self.state.items()
}
def hexdigest(self):
return {
name: h.hexdigest() if name != 'length' else h
name: h.hexdigest() if name != "length" else h
for name, h in self.state.items()
}
def bytehexdigest(self):
return {
name: hash_to_bytehex(h.digest()) if name != 'length' else h
name: hash_to_bytehex(h.digest()) if name != "length" else h
for name, h in self.state.items()
}
def copy(self):
copied_state = {
name: h.copy() if name != 'length' else h
for name, h in self.state.items()
name: h.copy() if name != "length" else h for name, h in self.state.items()
}
return self.from_state(copied_state, self.track_length)
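For instance, combining a real hash with the pseudo-hash ``length`` (sample data made up; `from_data` is the constructor referenced elsewhere in this diff):

```
h = MultiHash.from_data(b"hello\n", hash_names={"sha1_git", "length"})
digests = h.hexdigest()
assert digests["length"] == 6  # "length" is returned as a plain integer
assert digests["sha1_git"] == "ce013625030ba8dba906f756967f9e9ca394464a"
```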
def _new_blake2_hash(algo):
"""Return a function that initializes a blake2 hash.
"""
"""Return a function that initializes a blake2 hash."""
if algo in _blake2_hash_cache:
return _blake2_hash_cache[algo]()
lalgo = algo.lower()
if not lalgo.startswith('blake2'):
raise ValueError('Algorithm %s is not a blake2 hash' % algo)
if not lalgo.startswith("blake2"):
raise ValueError("Algorithm %s is not a blake2 hash" % algo)
blake_family = lalgo[:7]
@@ -178,27 +177,14 @@ def _new_blake2_hash(algo):
try:
digest_size, remainder = divmod(int(lalgo[7:]), 8)
except ValueError:
raise ValueError(
'Unknown digest size for algo %s' % algo
) from None
raise ValueError("Unknown digest size for algo %s" % algo) from None
if remainder:
raise ValueError(
'Digest size for algorithm %s must be a multiple of 8' % algo
"Digest size for algorithm %s must be a multiple of 8" % algo
)
if lalgo in hashlib.algorithms_available:
# Handle the case where OpenSSL ships the given algorithm
# (e.g. Python 3.5 on Debian 9 stretch)
_blake2_hash_cache[algo] = lambda: hashlib.new(lalgo)
else:
# Try using the built-in implementation for Python 3.6+
if blake_family in hashlib.algorithms_available:
blake2 = getattr(hashlib, blake_family)
else:
import pyblake2
blake2 = getattr(pyblake2, blake_family)
_blake2_hash_cache[algo] = lambda: blake2(digest_size=digest_size)
blake2 = getattr(hashlib, blake_family)
_blake2_hash_cache[algo] = lambda: blake2(digest_size=digest_size)
return _blake2_hash_cache[algo]()
@@ -208,18 +194,16 @@ def _new_hashlib_hash(algo):
Handle the swh-specific names for the blake2-related algorithms
"""
if algo.startswith('blake2'):
if algo.startswith("blake2"):
return _new_blake2_hash(algo)
else:
return hashlib.new(algo)
def _new_git_hash(base_algo, git_type, length):
"""Initialize a digest object (as returned by python's hashlib) for the
requested algorithm, and feed it with the header for a git object of the
given type and length.
def git_object_header(git_type: str, length: int) -> bytes:
"""Returns the header for a git object of the given type and length.
The header for hashing a git object consists of:
The header of a git object consists of:
- The type of the object (encoded in ASCII)
- One ASCII space (\x20)
- The length of the object (decimal encoded in ASCII)
@@ -234,15 +218,26 @@ def _new_git_hash(base_algo, git_type, length):
Returns:
the git object header, as bytes
"""
git_object_types = {
"blob",
"tree",
"commit",
"tag",
"snapshot",
"raw_extrinsic_metadata",
"extid",
}
h = _new_hashlib_hash(base_algo)
git_header = '%s %d\0' % (git_type, length)
h.update(git_header.encode('ascii'))
if git_type not in git_object_types:
raise ValueError(
"Unexpected git object type %s, expected one of %s"
% (git_type, ", ".join(sorted(git_object_types)))
)
return h
return ("%s %d\0" % (git_type, length)).encode("ascii")
def _new_hash(algo, length=None):
def _new_hash(algo: str, length: Optional[int] = None):
"""Initialize a digest object (as returned by python's hashlib) for
the requested algorithm. See the constant ALGORITHMS for the list
of supported algorithms. If a git-specific hashing algorithm is
@@ -264,19 +259,22 @@ def _new_hash(algo, length=None):
"""
if algo not in ALGORITHMS:
raise ValueError(
'Unexpected hashing algorithm %s, expected one of %s' %
(algo, ', '.join(sorted(ALGORITHMS))))
"Unexpected hashing algorithm %s, expected one of %s"
% (algo, ", ".join(sorted(ALGORITHMS)))
)
if algo.endswith('_git'):
if algo.endswith("_git"):
if length is None:
raise ValueError('Missing length for git hashing algorithm')
raise ValueError("Missing length for git hashing algorithm")
base_algo = algo[:-4]
return _new_git_hash(base_algo, 'blob', length)
h = _new_hashlib_hash(base_algo)
h.update(git_object_header("blob", length))
return h
return _new_hashlib_hash(algo)
def hash_git_data(data, git_type, base_algo='sha1'):
def hash_git_data(data, git_type, base_algo="sha1"):
"""Hash the given data as a git object of type git_type.
Args:
@@ -289,21 +287,15 @@ def hash_git_data(data, git_type, base_algo='sha1'):
Raises:
ValueError if the git_type is unexpected.
"""
git_object_types = {'blob', 'tree', 'commit', 'tag', 'snapshot'}
if git_type not in git_object_types:
raise ValueError('Unexpected git object type %s, expected one of %s' %
(git_type, ', '.join(sorted(git_object_types))))
h = _new_git_hash(base_algo, git_type, len(data))
h = _new_hashlib_hash(base_algo)
h.update(git_object_header(git_type, len(data)))
h.update(data)
return h.digest()
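For example, this reproduces the well-known git blob ID of ``hello\n``:

```
assert hash_git_data(b"hello\n", "blob") == bytes.fromhex(
    "ce013625030ba8dba906f756967f9e9ca394464a"
)  # same as: echo hello | git hash-object --stdin
```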
@functools.lru_cache()
def hash_to_hex(hash):
def hash_to_hex(hash: Union[str, bytes]) -> str:
"""Converts a hash (in hex or bytes form) to its hexadecimal ascii form
Args:
@@ -315,11 +307,11 @@ def hash_to_hex(hash):
"""
if isinstance(hash, str):
return hash
return binascii.hexlify(hash).decode('ascii')
return binascii.hexlify(hash).decode("ascii")
@functools.lru_cache()
def hash_to_bytehex(hash):
def hash_to_bytehex(hash: bytes) -> bytes:
"""Converts a hash to its hexadecimal bytes representation
Args:
@@ -332,7 +324,7 @@ def hash_to_bytehex(hash):
@functools.lru_cache()
def hash_to_bytes(hash):
def hash_to_bytes(hash: Union[str, bytes]) -> bytes:
"""Converts a hash (in hex or bytes form) to its raw bytes form
Args:
@@ -348,7 +340,7 @@ def hash_to_bytes(hash):
@functools.lru_cache()
def bytehex_to_hash(hex):
def bytehex_to_hash(hex: bytes) -> bytes:
"""Converts a hexadecimal bytes representation of a hash to that hash
Args:
# Copyright (C) 2019 The Software Heritage developers
# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import attr
import datetime
import functools
import string
from typing import Any, Callable, List, Sequence, Set, Tuple, Union
from deprecated import deprecated
from hypothesis import assume
from hypothesis.extra.dateutil import timezones
from hypothesis.strategies import (
binary, builds, characters, composite, dictionaries, from_regex,
integers, just, lists, none, one_of, sampled_from, text, tuples,
SearchStrategy,
binary,
booleans,
builds,
characters,
composite,
datetimes,
dictionaries,
from_regex,
integers,
just,
lists,
none,
one_of,
sampled_from,
sets,
text,
tuples,
)
from .from_disk import DentryPerms
from .model import (
Person, Timestamp, TimestampWithTimezone, Origin, OriginVisit,
Snapshot, SnapshotBranch, TargetType, Release, Revision,
Directory, DirectoryEntry, Content, SkippedContent
BaseContent,
BaseModel,
Content,
Directory,
DirectoryEntry,
MetadataAuthority,
MetadataFetcher,
ModelObjectType,
Origin,
OriginVisit,
OriginVisitStatus,
Person,
RawExtrinsicMetadata,
Release,
ReleaseTargetType,
Revision,
RevisionType,
SkippedContent,
Snapshot,
SnapshotBranch,
SnapshotTargetType,
Timestamp,
TimestampWithTimezone,
)
from .identifiers import snapshot_identifier, identifier_to_bytes
from .swhids import ExtendedObjectType, ExtendedSWHID
pgsql_alphabet = characters(
blacklist_categories=('Cs', ),
blacklist_characters=['\u0000']) # postgresql does not like these
blacklist_categories=["Cs"],
blacklist_characters=["\u0000"],
) # postgresql does not like these
def optional(strategy):
@@ -42,208 +82,532 @@ def sha1():
return binary(min_size=20, max_size=20)
def binaries_without_bytes(blacklist: Sequence[int]):
"""Like hypothesis.strategies.binary, but takes a sequence of bytes that
should not be included."""
return lists(sampled_from([i for i in range(256) if i not in blacklist])).map(bytes)
@composite
def extended_swhids(draw):
object_type = draw(sampled_from(ExtendedObjectType))
object_id = draw(sha1_git())
return ExtendedSWHID(object_type=object_type, object_id=object_id)
def aware_datetimes():
# datetimes in Software Heritage are not used for software artifacts
# (which may be much older than 2000), but only for objects like scheduler
# task runs, and origin visits, which were created by Software Heritage,
# so at least in 2015.
# We're forbidding old datetimes, because until 1956, many timezones had seconds
# in their "UTC offsets" (see
# <https://en.wikipedia.org/wiki/Time_zone#Worldwide_time_zones>), which is not
# encodable in ISO8601; and we need our datetimes to be ISO8601-encodable in the
# RPC protocol
min_value = datetime.datetime(2000, 1, 1, 0, 0, 0)
return datetimes(min_value=min_value, timezones=timezones())
@composite
def urls(draw):
protocol = draw(sampled_from(['git', 'http', 'https', 'deb']))
domain = draw(from_regex(r'\A([a-z]([a-z0-9-]*)\.){1,3}[a-z0-9]+\Z'))
def iris(draw):
protocol = draw(sampled_from(["git", "http", "https", "deb"]))
domain = draw(from_regex(r"\A([a-z]([a-z0-9é🏛️-]*)\.){1,3}([a-z0-9é])+\Z"))
return '%s://%s' % (protocol, domain)
return "%s://%s" % (protocol, domain)
def persons():
return builds(Person)
@composite
def persons_d(draw):
fullname = draw(binary())
email = draw(optional(binary()))
name = draw(optional(binary()))
assume(not (len(fullname) == 32 and email is None and name is None))
return dict(fullname=fullname, name=name, email=email)
def persons(**kwargs):
return persons_d(**kwargs).map(Person.from_dict)
def timestamps_d(**kwargs):
defaults = dict(
seconds=integers(Timestamp.MIN_SECONDS, Timestamp.MAX_SECONDS),
microseconds=integers(Timestamp.MIN_MICROSECONDS, Timestamp.MAX_MICROSECONDS),
)
return builds(dict, **{**defaults, **kwargs})
def timestamps():
max_seconds = datetime.datetime.max.replace(
tzinfo=datetime.timezone.utc).timestamp()
min_seconds = datetime.datetime.min.replace(
tzinfo=datetime.timezone.utc).timestamp()
return builds(
Timestamp,
seconds=integers(min_seconds, max_seconds),
microseconds=integers(0, 1000000))
return timestamps_d().map(Timestamp.from_dict)
def timestamps_with_timezone():
return builds(
TimestampWithTimezone,
timestamp=timestamps(),
offset=integers(min_value=-14*60, max_value=14*60))
@composite
def timestamps_with_timezone_d(
draw,
*,
timestamp=timestamps_d(),
offset=integers(min_value=-14 * 60, max_value=14 * 60),
negative_utc=booleans(),
):
timestamp = draw(timestamp)
offset = draw(offset)
negative_utc = draw(negative_utc)
assume(not (negative_utc and offset))
return dict(timestamp=timestamp, offset=offset, negative_utc=negative_utc)
timestamps_with_timezone = timestamps_with_timezone_d().map(
TimestampWithTimezone.from_dict
)
def origins():
return builds(
Origin,
type=sampled_from(['git', 'hg', 'svn', 'pypi', 'deb']),
url=urls())
def origins_d(*, url=iris().filter(lambda iri: len(iri.encode()) < 2048)):
return builds(dict, url=url)
def origin_visits():
return builds(
OriginVisit,
visit=integers(0, 1000),
origin=urls(),
status=sampled_from(['ongoing', 'full', 'partial']),
def origins(**kwargs):
return origins_d(**kwargs).map(Origin.from_dict)
def origin_visits_d(**kwargs):
defaults = dict(
visit=integers(1, 1000),
origin=iris(),
date=aware_datetimes(),
type=pgsql_text(),
snapshot=optional(sha1_git()))
)
return builds(dict, **{**defaults, **kwargs})
@composite
def releases(draw):
(date, author) = draw(one_of(
tuples(none(), none()),
tuples(timestamps_with_timezone(), persons())))
rel = draw(builds(
Release,
author=none(),
date=none(),
target=sha1_git()))
return attr.evolve(
rel,
date=date,
author=author)
def revision_metadata():
def origin_visits(**kwargs):
return origin_visits_d(**kwargs).map(OriginVisit.from_dict)
def metadata_dicts():
return dictionaries(pgsql_text(), pgsql_text())
def revisions():
return builds(
Revision,
date=timestamps_with_timezone(),
committer_date=timestamps_with_timezone(),
parents=lists(sha1_git()),
def origin_visit_statuses_d(**kwargs):
defaults = dict(
visit=integers(1, 1000),
origin=iris(),
type=optional(sampled_from(["git", "svn", "pypi", "debian"])),
status=sampled_from(
["created", "ongoing", "full", "partial", "not_found", "failed"]
),
date=aware_datetimes(),
snapshot=optional(sha1_git()),
metadata=optional(metadata_dicts()),
)
return builds(dict, **{**defaults, **kwargs})
def origin_visit_statuses(**kwargs):
return origin_visit_statuses_d(**kwargs).map(OriginVisitStatus.from_dict)
@composite
def releases_d(draw, **kwargs):
defaults = dict(
target_type=sampled_from([x.value for x in ReleaseTargetType]),
name=binary(),
message=optional(binary()),
synthetic=booleans(),
target=sha1_git(),
metadata=optional(revision_metadata()),
raw_manifest=optional(binary()),
)
d = draw(
one_of(
# None author/date:
builds(dict, author=none(), date=none(), **{**defaults, **kwargs}),
# non-None author/date:
builds(
dict,
date=timestamps_with_timezone_d(),
author=persons_d(),
**{**defaults, **kwargs},
),
# it is also possible for date to be None but not author, but let's not
# overwhelm hypothesis with this edge case
)
)
if d["raw_manifest"] is None:
del d["raw_manifest"]
return d
def releases(**kwargs):
return releases_d(**kwargs).map(Release.from_dict)
revision_metadata = metadata_dicts
def extra_headers():
return lists(
tuples(binary(min_size=0, max_size=50), binary(min_size=0, max_size=500))
).map(tuple)
@composite
def revisions_d(draw, **kwargs):
defaults = dict(
message=optional(binary()),
synthetic=booleans(),
parents=tuples(sha1_git()),
directory=sha1_git(),
metadata=one_of(none(), revision_metadata()))
type=sampled_from([x.value for x in RevisionType]),
metadata=optional(revision_metadata()),
extra_headers=extra_headers(),
raw_manifest=optional(binary()),
)
d = draw(
one_of(
# None author/committer/date/committer_date
builds(
dict,
author=none(),
committer=none(),
date=none(),
committer_date=none(),
**{**defaults, **kwargs},
),
# non-None author/committer/date/committer_date
builds(
dict,
author=persons_d(),
committer=persons_d(),
date=timestamps_with_timezone_d(),
committer_date=timestamps_with_timezone_d(),
**{**defaults, **kwargs},
),
# There are many other combinations, but let's not overwhelm hypothesis
# with these edge cases
)
)
# TODO: metadata['extra_headers'] can have binary keys and values
if d["raw_manifest"] is None:
del d["raw_manifest"]
return d
def directory_entries():
return builds(
DirectoryEntry,
def revisions(**kwargs):
return revisions_d(**kwargs).map(Revision.from_dict)
def directory_entries_d(**kwargs):
defaults = dict(
name=binaries_without_bytes(b"/"),
target=sha1_git(),
perms=sampled_from([perm.value for perm in DentryPerms]))
)
return one_of(
builds(
dict,
type=just("file"),
perms=one_of(
integers(min_value=0o100000, max_value=0o100777), # regular file
integers(min_value=0o120000, max_value=0o120777), # symlink
),
**{**defaults, **kwargs},
),
builds(
dict,
type=just("dir"),
perms=integers(
min_value=DentryPerms.directory,
max_value=DentryPerms.directory + 0o777,
),
**{**defaults, **kwargs},
),
builds(
dict,
type=just("rev"),
perms=integers(
min_value=DentryPerms.revision,
max_value=DentryPerms.revision + 0o777,
),
**{**defaults, **kwargs},
),
)
def directories():
return builds(
Directory,
entries=lists(directory_entries()))
def directory_entries(**kwargs):
return directory_entries_d(**kwargs).map(DirectoryEntry)
@composite
def directories_d(draw, raw_manifest=optional(binary())):
d = draw(builds(dict, entries=tuples(directory_entries_d())))
d["raw_manifest"] = draw(raw_manifest)
if d["raw_manifest"] is None:
del d["raw_manifest"]
return d
def directories(**kwargs):
return directories_d(**kwargs).map(Directory.from_dict)
def contents_d():
return one_of(present_contents_d(), skipped_contents_d())
def contents():
return one_of(present_contents(), skipped_contents())
@composite
def present_contents(draw):
return draw(builds(
Content,
length=integers(min_value=0, max_value=2**63-1),
sha1=sha1(),
sha1_git=sha1_git(),
sha256=binary(min_size=32, max_size=32),
blake2s256=binary(min_size=32, max_size=32),
status=one_of(just('visible'), just('hidden')),
data=binary(),
))
def present_contents_d(**kwargs):
defaults = dict(
data=binary(max_size=4096),
ctime=optional(aware_datetimes()),
status=one_of(just("visible"), just("hidden")),
)
return builds(dict, **{**defaults, **kwargs})
def present_contents(**kwargs):
return present_contents_d(**kwargs).map(lambda d: Content.from_data(**d))
@composite
def skipped_contents(draw):
return draw(builds(
SkippedContent,
length=integers(min_value=-1, max_value=2**63-1),
sha1=optional(sha1()),
sha1_git=optional(sha1_git()),
sha256=optional(binary(min_size=32, max_size=32)),
blake2s256=optional(binary(min_size=32, max_size=32)),
status=just('absent'),
reason=pgsql_text(),
))
def skipped_contents_d(
draw, reason=pgsql_text(), status=just("absent"), ctime=optional(aware_datetimes())
):
result = BaseContent._hash_data(draw(binary(max_size=4096)))
result.pop("data")
nullify_attrs = draw(
sets(sampled_from(["sha1", "sha1_git", "sha256", "blake2s256"]))
)
for k in nullify_attrs:
result[k] = None
result["reason"] = draw(reason)
result["status"] = draw(status)
result["ctime"] = draw(ctime)
return result
def skipped_contents(**kwargs):
return skipped_contents_d(**kwargs).map(SkippedContent.from_dict)
def branch_names():
return binary(min_size=1)
def branch_targets_object():
def snapshot_targets_object_d():
return builds(
SnapshotBranch,
dict,
target=sha1_git(),
target_type=sampled_from([
TargetType.CONTENT, TargetType.DIRECTORY, TargetType.REVISION,
TargetType.RELEASE, TargetType.SNAPSHOT]))
target_type=sampled_from(
[x.value for x in SnapshotTargetType if x.value not in ("alias",)]
),
)
branch_targets_object_d = deprecated(
version="v6.13.0", reason="use snapshot_targets_object_d"
)(snapshot_targets_object_d)
def branch_targets_alias():
def snapshot_targets_alias_d():
return builds(
SnapshotBranch,
target_type=just(TargetType.ALIAS))
dict, target=sha1_git(), target_type=just("alias")
)  # i.e. SnapshotTargetType.ALIAS.value
branch_targets_alias_d = deprecated(
version="v6.13.0", reason="use snapshot_targets_alias_d"
)(snapshot_targets_alias_d)
def branch_targets(*, only_objects=False):
def snapshot_targets_d(*, only_objects=False):
if only_objects:
return branch_targets_object()
return snapshot_targets_object_d()
else:
return one_of(branch_targets_alias(), branch_targets_object())
return one_of(snapshot_targets_alias_d(), snapshot_targets_object_d())
branch_targets_d = deprecated(version="v6.13.0", reason="use snapshot_targets_d")(
snapshot_targets_d
)
def snapshot_targets(*, only_objects=False):
return builds(
SnapshotBranch.from_dict, snapshot_targets_d(only_objects=only_objects)
)
@composite
def snapshots(draw, *, min_size=0, max_size=100, only_objects=False):
branches = draw(dictionaries(
keys=branch_names(),
values=one_of(
none(),
branch_targets(only_objects=only_objects)
),
min_size=min_size,
max_size=max_size,
))
def snapshots_d(draw, *, min_size=0, max_size=100, only_objects=False):
branches = draw(
dictionaries(
keys=branch_names(),
values=optional(snapshot_targets_d(only_objects=only_objects)),
min_size=min_size,
max_size=max_size,
)
)
if not only_objects:
# Make sure aliases point to actual branches
unresolved_aliases = {
target.target
for target in branches.values()
if (target
and target.target_type == 'alias'
and target.target not in branches)
}
for alias in unresolved_aliases:
branches[alias] = draw(branch_targets(only_objects=True))
branch: target["target"]
for branch, target in branches.items()
if (
target
and target["target_type"] == "alias"
and target["target"] not in branches
)
}
for alias_name, alias_target in unresolved_aliases.items():
# Override alias branch with one pointing to a real object
# if max_size constraint is reached
alias = alias_target if len(branches) < max_size else alias_name
branches[alias] = draw(snapshot_targets_d(only_objects=True))
# Ensure no cycles between aliases
while True:
try:
id_ = snapshot_identifier({
'branches': {
name: branch.to_dict() if branch else None
for (name, branch) in branches.items()}})
snapshot = Snapshot.from_dict(
{
"branches": {
name: branch or None for (name, branch) in branches.items()
}
}
)
except ValueError as e:
for (source, target) in e.args[1]:
branches[source] = draw(branch_targets(only_objects=True))
for source, target in e.args[1]:
branches[source] = draw(snapshot_targets_d(only_objects=True))
else:
break
return Snapshot(
id=identifier_to_bytes(id_),
branches=branches)
return snapshot.to_dict()
def objects():
return one_of(
origins().map(lambda x: ('origin', x)),
origin_visits().map(lambda x: ('origin_visit', x)),
snapshots().map(lambda x: ('snapshot', x)),
releases().map(lambda x: ('release', x)),
revisions().map(lambda x: ('revision', x)),
directories().map(lambda x: ('directory', x)),
contents().map(lambda x: ('content', x)),
def snapshots(*, min_size=0, max_size=100, only_objects=False):
return snapshots_d(
min_size=min_size, max_size=max_size, only_objects=only_objects
).map(Snapshot.from_dict)
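A hypothetical property-based test using this strategy (module path `swh.model.hypothesis_strategies` assumed); the alias resolution above guarantees `Snapshot.from_dict` never raises on unresolved aliases:

```
from hypothesis import given

from swh.model.hypothesis_strategies import snapshots
from swh.model.model import Snapshot


@given(snapshots(min_size=1, only_objects=True))
def test_snapshot_dict_roundtrip(snapshot):
    # every generated snapshot survives a dict round-trip unchanged
    assert Snapshot.from_dict(snapshot.to_dict()) == snapshot
```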
def metadata_authorities(url=iris()):
return builds(MetadataAuthority, url=url, metadata=just(None))
def metadata_fetchers(**kwargs):
defaults = dict(
name=text(min_size=1, alphabet=string.printable),
version=text(
min_size=1,
alphabet=string.ascii_letters + string.digits + string.punctuation,
),
)
return builds(
MetadataFetcher,
metadata=just(None),
**{**defaults, **kwargs},
)
def object_dicts():
return objects().map(lambda x: (x[0], x[1].to_dict()))
def raw_extrinsic_metadata(**kwargs):
defaults = dict(
target=extended_swhids(),
discovery_date=aware_datetimes(),
authority=metadata_authorities(),
fetcher=metadata_fetchers(),
format=text(min_size=1, alphabet=string.printable),
)
return builds(RawExtrinsicMetadata, **{**defaults, **kwargs})
def raw_extrinsic_metadata_d(**kwargs):
return raw_extrinsic_metadata(**kwargs).map(RawExtrinsicMetadata.to_dict)
def _tuplify(object_type: ModelObjectType, obj: BaseModel):
return (object_type, obj)
def objects(
    # remove the Union once deprecated usages have been migrated
    blacklist_types: Union[Set[ModelObjectType], Any] = {
ModelObjectType.ORIGIN_VISIT_STATUS,
},
split_content: bool = False,
):
"""generates a random couple (type, obj)
which obj is an instance of the Model class corresponding to obj_type.
`blacklist_types` is a list of obj_type to exclude from the strategy.
If `split_content` is True, generates Content and SkippedContent under different
obj_type, resp. "content" and "skipped_content".
"""
strategies: List[
Tuple[ModelObjectType, Callable[[], SearchStrategy[BaseModel]]]
] = [
(ModelObjectType.ORIGIN, origins),
(ModelObjectType.ORIGIN_VISIT, origin_visits),
(ModelObjectType.ORIGIN_VISIT_STATUS, origin_visit_statuses),
(ModelObjectType.SNAPSHOT, snapshots),
(ModelObjectType.RELEASE, releases),
(ModelObjectType.REVISION, revisions),
(ModelObjectType.DIRECTORY, directories),
(ModelObjectType.RAW_EXTRINSIC_METADATA, raw_extrinsic_metadata),
]
if split_content:
strategies.append((ModelObjectType.CONTENT, present_contents))
strategies.append((ModelObjectType.SKIPPED_CONTENT, skipped_contents))
else:
strategies.append((ModelObjectType.CONTENT, contents))
candidates = [
obj_gen().map(functools.partial(_tuplify, obj_type))
for (obj_type, obj_gen) in strategies
if obj_type not in blacklist_types
]
return one_of(*candidates)
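# A sketch of the intended use in a property-based test (the roundtrip
# assertion is an assumption about the model invariants, not part of this
# module):
#
#     from hypothesis import given
#
#     @given(objects(split_content=True))
#     def test_to_dict_roundtrip(type_and_object):
#         (obj_type, obj) = type_and_object
#         assert obj == type(obj).from_dict(obj.to_dict())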
def object_dicts(
blacklist_types=(ModelObjectType.ORIGIN_VISIT_STATUS,), split_content=False
):
"""generates a random couple (type, dict)
which dict is suitable for <ModelForType>.from_dict() factory methods.
`blacklist_types` is a list of obj_type to exclude from the strategy.
If `split_content` is True, generates Content and SkippedContent under different
obj_type, resp. "content" and "skipped_content".
"""
strategies = [
(ModelObjectType.ORIGIN, origins_d),
(ModelObjectType.ORIGIN_VISIT, origin_visits_d),
(ModelObjectType.ORIGIN_VISIT_STATUS, origin_visit_statuses_d),
(ModelObjectType.SNAPSHOT, snapshots_d),
(ModelObjectType.RELEASE, releases_d),
(ModelObjectType.REVISION, revisions_d),
(ModelObjectType.DIRECTORY, directories_d),
(ModelObjectType.RAW_EXTRINSIC_METADATA, raw_extrinsic_metadata_d),
]
if split_content:
strategies.append((ModelObjectType.CONTENT, present_contents_d))
strategies.append((ModelObjectType.SKIPPED_CONTENT, skipped_contents_d))
else:
strategies.append((ModelObjectType.CONTENT, contents_d))
args = [
obj_gen().map(lambda x, obj_type=obj_type: (obj_type, x))
for (obj_type, obj_gen) in strategies
if obj_type not in blacklist_types
]
return one_of(*args)
# Copyright (C) 2017 The Software Heritage developers
# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Merkle tree data structure"""
import abc
import collections
from typing import List, Optional
def deep_update(left, right):
"""Recursively update the left mapping with deeply nested values from the right
mapping.
This function is useful to merge the results of several calls to
:func:`MerkleNode.collect`.
Arguments:
left: a mapping (modified by the update operation)
right: a mapping
Returns:
the left mapping, updated with nested values from the right mapping
Example:
>>> a = {
... 'key1': {
... 'key2': {
... 'key3': 'value1/2/3',
... },
... },
... }
>>> deep_update(a, {
... 'key1': {
... 'key2': {
... 'key4': 'value1/2/4',
... },
... },
... }) == {
... 'key1': {
... 'key2': {
... 'key3': 'value1/2/3',
... 'key4': 'value1/2/4',
... },
... },
... }
True
>>> deep_update(a, {
... 'key1': {
... 'key2': {
... 'key3': 'newvalue1/2/3',
... },
... },
... }) == {
... 'key1': {
... 'key2': {
... 'key3': 'newvalue1/2/3',
... 'key4': 'value1/2/4',
... },
... },
... }
True
    """
for key, rvalue in right.items():
if isinstance(rvalue, collections.Mapping):
new_lvalue = deep_update(left.get(key, {}), rvalue)
left[key] = new_lvalue
else:
left[key] = rvalue
return left
from __future__ import annotations

import abc
from typing import Any, Dict, Iterator, List, Set
class MerkleNode(dict, metaclass=abc.ABCMeta):
@@ -102,16 +39,18 @@ class MerkleNode(dict, metaclass=abc.ABCMeta):
The collection of updated data from the tree is implemented through the
:func:`collect` function and associated helpers.
Attributes:
data (dict): data associated to the current node
parents (list): known parents of the current node
collected (bool): whether the current node has been collected
"""
__slots__ = ['parents', 'data', '__hash', 'collected']
type = None # type: Optional[str] # TODO: make this an enum
"""Type of the current node (used as a classifier for :func:`collect`)"""
__slots__ = ["parents", "data", "__hash", "collected"]
data: Dict
"""data associated to the current node"""
parents: List
"""known parents of the current node"""
collected: bool
"""whether the current node has been collected"""
def __init__(self, data=None):
super().__init__()
@@ -120,6 +59,16 @@ class MerkleNode(dict, metaclass=abc.ABCMeta):
self.__hash = None
self.collected = False
def __eq__(self, other):
return (
isinstance(other, MerkleNode)
and super().__eq__(other)
and self.data == other.data
)
def __ne__(self, other):
return not self.__eq__(other)
def invalidate_hash(self):
"""Invalidate the cached hash of the current node."""
if not self.__hash:
@@ -130,7 +79,7 @@ class MerkleNode(dict, metaclass=abc.ABCMeta):
for parent in self.parents:
parent.invalidate_hash()
def update_hash(self, *, force=False):
def update_hash(self, *, force=False) -> Any:
"""Recursively compute the hash of the current node.
Args:
@@ -150,20 +99,23 @@ class MerkleNode(dict, metaclass=abc.ABCMeta):
return self.__hash
@property
def hash(self):
def hash(self) -> Any:
"""The hash of the current node, as calculated by
:func:`compute_hash`.
"""
return self.update_hash()
def __hash__(self):
return hash(self.hash)
@abc.abstractmethod
def compute_hash(self):
def compute_hash(self) -> Any:
"""Compute the hash of the current node.
The hash should depend on the data of the node, as well as on hashes
of the children nodes.
"""
raise NotImplementedError('Must implement compute_hash method')
raise NotImplementedError("Must implement compute_hash method")
def __setitem__(self, name, new_child):
"""Add a child, invalidating the current hash"""
@@ -212,47 +164,24 @@ class MerkleNode(dict, metaclass=abc.ABCMeta):
"""
return self.data
def collect_node(self, **kwargs):
"""Collect the data for the current node, for use by :func:`collect`.
Arguments:
kwargs: passed as-is to :func:`get_data`.
Returns:
A :class:`dict` compatible with :func:`collect`.
"""
def collect_node(self) -> Set[MerkleNode]:
"""Collect the current node if it has not been yet, for use by :func:`collect`."""
if not self.collected:
self.collected = True
return {self.type: {self.hash: self.get_data(**kwargs)}}
return {self}
else:
            return {}
            return set()

    def collect(self, **kwargs):
        """Collect the data for all nodes in the subtree rooted at `self`.
The data is deduplicated by type and by hash.
Arguments:
kwargs: passed as-is to :func:`get_data`.
def collect(self) -> Set[MerkleNode]:
"""Collect the added and modified nodes in the subtree rooted at `self`
since the last collect operation.
Returns:
A :class:`dict` with the following structure::
{
'typeA': {
node1.hash: node1.get_data(),
node2.hash: node2.get_data(),
},
'typeB': {
node3.hash: node3.get_data(),
...
},
...
}
A :class:`set` of collected nodes
"""
ret = self.collect_node(**kwargs)
ret = self.collect_node()
for child in self.values():
deep_update(ret, child.collect(**kwargs))
ret.update(child.collect())
return ret
@@ -266,23 +195,39 @@ class MerkleNode(dict, metaclass=abc.ABCMeta):
for child in self.values():
child.reset_collect()
def iter_tree(self, dedup=True) -> Iterator[MerkleNode]:
"""Yields all children nodes, recursively. Common nodes are deduplicated
by default (deduplication can be turned off setting the given argument
'dedup' to False).
"""
yield from self._iter_tree(seen=set(), dedup=dedup)
def _iter_tree(self, seen: Set[bytes], dedup) -> Iterator[MerkleNode]:
if self.hash not in seen:
if dedup:
seen.add(self.hash)
yield self
for child in self.values():
yield from child._iter_tree(seen=seen, dedup=dedup)
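# A minimal concrete subclass, as a sketch (the class name and hashing scheme
# are illustrative, not part of the API; hashlib is from the standard
# library). The hash covers the node's own data and its children's hashes, so
# any change propagates up to the root:
#
#     class _DictNode(MerkleNode):
#         object_type = "dict_node"
#
#         def compute_hash(self):
#             h = hashlib.sha1(repr(sorted(self.data.items())).encode())
#             for name in sorted(self):
#                 h.update(self[name].hash)
#             return h.digest()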
class MerkleLeaf(MerkleNode):
"""A leaf to a Merkle tree.
A Merkle leaf is simply a Merkle node with children disabled.
"""
__slots__ = [] # type: List[str]
__slots__: List[str] = []
def __setitem__(self, name, child):
raise ValueError('%s is a leaf' % self.__class__.__name__)
raise ValueError("%s is a leaf" % self.__class__.__name__)
def __getitem__(self, name):
raise ValueError('%s is a leaf' % self.__class__.__name__)
raise ValueError("%s is a leaf" % self.__class__.__name__)
def __delitem__(self, name):
raise ValueError('%s is a leaf' % self.__class__.__name__)
raise ValueError("%s is a leaf" % self.__class__.__name__)
def update(self, new_children):
"""Children update operation. Disabled for leaves."""
raise ValueError('%s is a leaf' % self.__class__.__name__)
raise ValueError("%s is a leaf" % self.__class__.__name__)
# Copyright (C) 2018-2019 The Software Heritage developers
# Copyright (C) 2018-2024 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import datetime
"""
Implementation of Software Heritage's data model
See :ref:`data-model` for an overview of the data model.
The classes defined in this module are immutable
`attrs objects <https://attrs.org/>`__ and enums.
All classes define a ``from_dict`` class method and a ``to_dict``
method to convert between them and msgpack-serializable objects.
"""
from abc import ABCMeta, abstractmethod
from __future__ import annotations
from abc import ABC, abstractmethod
import collections
import datetime
from enum import Enum
from typing import List, Optional, Dict
import hashlib
from typing import (
Any,
Callable,
Dict,
Iterable,
List,
Optional,
Tuple,
Type,
TypeVar,
Union,
)
import warnings
import attr
from attr._make import _AndValidator
from attr.validators import and_
from attrs_strict import AttributeTypeError
import dateutil.parser
import iso8601
from typing_extensions import Final
from . import git_objects
from .collections import ImmutableDict
from .hashutil import DEFAULT_ALGORITHMS, MultiHash, hash_to_bytehex, hash_to_hex
from .swhids import CoreSWHID
from .swhids import ExtendedObjectType as SwhidExtendedObjectType
from .swhids import ExtendedSWHID
from .swhids import ObjectType as SwhidObjectType
class MissingData(Exception):
"""Raised by `Content.with_data` when it has no way of fetching the
data (but not when fetching the data fails)."""
pass
KeyType = Union[Dict[str, str], Dict[str, bytes], bytes]
"""The type returned by BaseModel.unique_key()."""
from .identifiers import (
normalize_timestamp, directory_identifier, revision_identifier,
release_identifier, snapshot_identifier
)
from .hashutil import DEFAULT_ALGORITHMS, hash_to_bytes
SHA1_SIZE = 20
_OFFSET_CHARS = frozenset(b"+-0123456789")
# TODO: Limit this to 20 bytes
Sha1Git = bytes
class BaseModel:
Sha1 = bytes
def hash_repr(h: bytes) -> str:
if h is None:
return "None"
else:
return f"hash_to_bytes('{hash_to_hex(h)}')"
def parents_repr(parents: Tuple[Sha1Git, ...]):
return repr(tuple(hash_repr(p) for p in parents)).replace('"', "")
def freeze_optional_dict(
d: Union[None, Dict, ImmutableDict],
) -> Optional[ImmutableDict]:
if isinstance(d, dict):
return ImmutableDict(d)
else:
return d
def dictify(value):
"Helper function used by BaseModel.to_dict()"
if isinstance(value, BaseModel):
return value.to_dict()
elif isinstance(value, (CoreSWHID, ExtendedSWHID)):
return str(value)
elif isinstance(value, Enum):
return value.value
elif isinstance(value, (dict, ImmutableDict)):
return {k: dictify(v) for k, v in value.items()}
elif isinstance(value, tuple):
return tuple(dictify(v) for v in value)
else:
return value
def generic_type_validator(instance, attribute, value):
"""validates the type of an attribute value whatever the attribute type"""
raise NotImplementedError("generic type check should have been optimized")
def _true_validator(instance, attribute, value, expected_type=None, origin_value=None):
pass
def _none_validator(instance, attribute, value, expected_type=None, origin_value=None):
if value is not None:
if origin_value is None:
origin_value = value
raise AttributeTypeError(origin_value, attribute)
def _origin_type_validator(
instance, attribute, value, expected_type=None, origin_value=None
):
# This is functionally equivalent to using just this:
# return isinstance(value, type)
# but using type equality before isinstance allows very quick checks
# when the exact class is used (which is the overwhelming majority of cases)
# while still allowing subclasses to be used.
if expected_type is None:
expected_type = attribute.type
if not (type(value) is expected_type or isinstance(value, expected_type)):
if origin_value is None:
origin_value = value
raise AttributeTypeError(origin_value, attribute)
def _tuple_infinite_validator(
instance,
attribute,
value,
expected_type=None,
origin_value=None,
):
type_ = type(value)
if origin_value is None:
origin_value = value
if type_ != tuple and not isinstance(value, tuple):
raise AttributeTypeError(origin_value, attribute)
if expected_type is None:
expected_type = attribute.type
args = expected_type.__args__
# assert len(args) == 2 and args[1] is Ellipsis
expected_value_type = args[0]
validator = optimized_validator(expected_value_type)
for i in value:
validator(
instance,
attribute,
i,
expected_type=expected_value_type,
origin_value=origin_value,
)
def _tuple_bytes_bytes_validator(
instance,
attribute,
value,
expected_type=None,
origin_value=None,
):
type_ = type(value)
if type_ != tuple and not isinstance(value, tuple):
if origin_value is None:
origin_value = value
raise AttributeTypeError(origin_value, attribute)
if len(value) != 2:
if origin_value is None:
origin_value = value
raise AttributeTypeError(origin_value, attribute)
if type(value[0]) is not bytes or type(value[1]) is not bytes:
if origin_value is None:
origin_value = value
raise AttributeTypeError(origin_value, attribute)
def _tuple_finite_validator(
instance,
attribute,
value,
expected_type=None,
origin_value=None,
):
    # It might be useful to optimise the sub-validator tuple; in practice, we only
    # have [bytes, bytes]
type_ = type(value)
if origin_value is None:
origin_value = value
if type_ != tuple and not isinstance(value, tuple):
raise AttributeTypeError(origin_value, attribute)
if expected_type is None:
expected_type = attribute.type
args = expected_type.__args__
# assert len(args) != 2 or args[1] is Ellipsis
if len(args) != len(value):
raise AttributeTypeError(origin_value, attribute)
for item_type, item in zip(args, value):
validator = optimized_validator(item_type)
validator(
instance,
attribute,
item,
expected_type=item_type,
origin_value=origin_value,
)
def _immutable_dict_validator(
instance,
attribute,
value,
expected_type=None,
origin_value=None,
):
value_type = type(value)
if origin_value is None:
origin_value = value
if value_type != ImmutableDict and not isinstance(value, ImmutableDict):
raise AttributeTypeError(origin_value, attribute)
if expected_type is None:
expected_type = attribute.type
(expected_key_type, expected_value_type) = expected_type.__args__
key_validator = optimized_validator(expected_key_type)
value_validator = optimized_validator(expected_value_type)
for item_key, item_value in value.items():
key_validator(
instance,
attribute,
item_key,
expected_type=expected_key_type,
origin_value=origin_value,
)
value_validator(
instance,
attribute,
item_value,
expected_type=expected_value_type,
origin_value=origin_value,
)
def optimized_validator(type_):
if type_ is object or type_ is Any:
return _true_validator
if type_ is None:
return _none_validator
origin = getattr(type_, "__origin__", None)
# Non-generic type, check it directly
if origin is None:
return _origin_type_validator
# Then, if it's a container, check its items.
if origin is tuple:
args = type_.__args__
if len(args) == 2 and args[1] is Ellipsis:
# Infinite tuple
return _tuple_infinite_validator
elif args == (bytes, bytes):
return _tuple_bytes_bytes_validator
else:
return _tuple_finite_validator
elif origin is Union:
args = type_.__args__
all_validators = tuple((optimized_validator(t), t) for t in args)
def union_validator(
instance,
attribute,
value,
expected_type=None,
origin_value=None,
):
if origin_value is None:
origin_value = value
for validator, type_ in all_validators:
try:
validator(
instance,
attribute,
value,
expected_type=type_,
origin_value=origin_value,
)
except AttributeTypeError:
pass
else:
break
else:
raise AttributeTypeError(origin_value, attribute)
return union_validator
elif origin is ImmutableDict:
return _immutable_dict_validator
# No need to check dict or list. because they are converted to ImmutableDict
# and tuple respectively.
raise NotImplementedError(f"Type-checking {type_}")
def optimize_all_validators(cls, old_fields):
"""process validators to turn them into a faster version … eventually"""
new_fields = []
for f in old_fields:
validator = f.validator
if validator is generic_type_validator:
validator = optimized_validator(f.type)
elif isinstance(validator, _AndValidator):
new_and = []
for v in validator._validators:
if v is generic_type_validator:
v = optimized_validator(f.type)
new_and.append(v)
validator = and_(*new_and)
else:
validator = None
if validator is not None:
f = f.evolve(validator=validator)
new_fields.append(f)
if attr.__version__ < "21.3.0":
# https://github.com/python-attrs/attrs/issues/821
from attr._make import _make_attr_tuple_class
attr_names = [f.name for f in new_fields]
AttrsClass = _make_attr_tuple_class(cls.__name__, attr_names)
return AttrsClass(new_fields)
else:
return new_fields
ModelType = TypeVar("ModelType", bound="BaseModel")
HashableModelType = TypeVar("HashableModelType", bound="BaseHashableModel")
class _StringCompatibleEnum(Enum):
def __eq__(self, other):
# stay compatible with legacy string comparison (for now)
if isinstance(other, str):
warnings.warn(
"Use the enum value instead of string",
category=DeprecationWarning,
stacklevel=2,
)
return self.value == other
return super().__eq__(other)
def __str__(self):
# preserve interpolation property (for now)
return self.value
def __hash__(self):
# make sure we don't confuse dictionary key matching (for now)
return hash(str(self.value))
class ModelObjectType(_StringCompatibleEnum):
"""Possible object types of Model object"""
CONTENT = "content"
DIRECTORY = "directory"
DIRECTORY_ENTRY = "directory_entry"
EXTID = "extid"
METADATA_AUTHORITY = "metadata_authority"
METADATA_FETCHER = "metadata_fetcher"
ORIGIN = "origin"
ORIGIN_VISIT = "origin_visit"
ORIGIN_VISIT_STATUS = "origin_visit_status"
PERSON = "person"
RAW_EXTRINSIC_METADATA = "raw_extrinsic_metadata"
RELEASE = "release"
REVISION = "revision"
SKIPPED_CONTENT = "skipped_content"
SNAPSHOT = "snapshot"
SNAPSHOT_BRANCH = "snapshot_branch"
TIMESTAMP = "timestamp"
TIMESTAMP_WITH_TIMEZONE = "timestamp_with_timezone"
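# A doctest-style sketch of the transitional behaviour (the string comparison
# also emits a DeprecationWarning):
#
#     >>> ModelObjectType.CONTENT == "content"
#     True
#     >>> str(ModelObjectType.SNAPSHOT)
#     'snapshot'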
class BaseModel(ABC):
"""Base class for SWH model classes.
Provides serialization/deserialization to/from Python dictionaries,
that are suitable for JSON/msgpack-like formats."""
__slots__ = ()
@property
@abstractmethod
def object_type(self) -> ModelObjectType:
# Some juggling to please mypy
#
# Note: starting from Python 3.11 we can combine @property with
        # @classmethod which is the real intent here.
raise NotImplementedError
def to_dict(self):
"""Wrapper of `attr.asdict` that can be overridden by subclasses
that have special handling of some of the fields."""
def dictify(value):
if isinstance(value, BaseModel):
return value.to_dict()
elif isinstance(value, Enum):
return value.value
elif isinstance(value, dict):
return {k: dictify(v) for k, v in value.items()}
elif isinstance(value, list):
return [dictify(v) for v in value]
else:
return value
ret = attr.asdict(self, recurse=False)
return dictify(ret)
return dictify(attr.asdict(self, recurse=False))
@classmethod
def from_dict(cls, d):
@@ -55,324 +427,1088 @@ class BaseModel:
recursively builds the corresponding objects."""
return cls(**d)
def evolve(self: ModelType, **kwargs) -> ModelType:
"""Alias to call :func:`attr.evolve` on this object, returning a new object."""
return attr.evolve(self, **kwargs) # type: ignore[misc]
def anonymize(self: ModelType) -> Optional[ModelType]:
"""Returns an anonymized version of the object, if needed.
If the object model does not need/support anonymization, returns None.
"""
return None
def unique_key(self) -> KeyType:
"""Returns a unique key for this object, that can be used for
deduplication."""
raise NotImplementedError(f"unique_key for {self}")
def check(self) -> None:
"""Performs internal consistency checks, and raises an error if one fails."""
# without the type-ignore comment below, attr >= 22.1.0 causes mypy to report:
# Argument 1 has incompatible type "BaseModel"; expected "AttrsInstance"
attr.validate(self) # type: ignore[arg-type]
class HashableObject(metaclass=ABCMeta):
def _compute_hash_from_manifest(manifest: bytes) -> Sha1Git:
return hashlib.new("sha1", manifest).digest()
class BaseHashableModel(BaseModel, ABC):
"""Mixin to automatically compute object identifier hash when
the associated model is instantiated."""
@staticmethod
@abstractmethod
def compute_hash(object_dict):
__slots__ = ()
id: Sha1Git
def compute_hash(self) -> bytes:
"""Derived model classes must implement this to compute
the object hash from its dict representation."""
pass
the object hash.
This method is called by the object initialization if the `id`
attribute is set to an empty value.
"""
return self._compute_hash_from_attributes()
@abstractmethod
def _compute_hash_from_attributes(self) -> Sha1Git:
raise NotImplementedError(f"_compute_hash_from_attributes for {self}")
def __attrs_post_init__(self):
if not self.id:
obj_id = hash_to_bytes(self.compute_hash(self.to_dict()))
object.__setattr__(self, 'id', obj_id)
obj_id = self.compute_hash()
object.__setattr__(self, "id", obj_id)
def evolve(self: HashableModelType, **kwargs) -> HashableModelType:
"""Alias to call :func:`attr.evolve` on this object, returning a new object
with its ``id`` recomputed based on the content."""
if "id" in kwargs:
raise TypeError(
f"{self.__class__.__name__}.evolve recomputes the id itself; "
f"use attr.evolve to change the id manually."
)
obj = attr.evolve(self, **kwargs) # type: ignore[misc]
new_hash = obj.compute_hash()
return attr.evolve(obj, id=new_hash) # type: ignore[misc]
def unique_key(self) -> KeyType:
return self.id
def check(self) -> None:
super().check()
if self.id != self.compute_hash():
raise ValueError("'id' does not match recomputed hash.")
HashableObject = BaseHashableModel # deprecated alias
class HashableObjectWithManifest(BaseHashableModel):
"""Derived class of BaseHashableModel, for objects that may need to store
verbatim git objects as ``raw_manifest`` to preserve original hashes."""
@attr.s(frozen=True)
__slots__ = ()
raw_manifest: Optional[bytes] = None
"""Stores the original content of git objects when they cannot be faithfully
represented using only the other attributes.
This should only be used as a last resort, and only set in the Git loader,
for objects too corrupt to fit the data model."""
def to_dict(self):
d = super().to_dict()
if d["raw_manifest"] is None:
del d["raw_manifest"]
return d
def compute_hash(self) -> bytes:
"""Derived model classes must implement this to compute
the object hash.
This method is called by the object initialization if the `id`
attribute is set to an empty value.
"""
if self.raw_manifest is None:
return super().compute_hash() # calls self._compute_hash_from_attributes()
else:
return _compute_hash_from_manifest(self.raw_manifest)
def check(self) -> None:
super().check()
if (
self.raw_manifest is not None
and self.id == self._compute_hash_from_attributes()
):
raise ValueError(
f"{self} has a non-none raw_manifest attribute, but does not need it."
)
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class Person(BaseModel):
"""Represents the author/committer of a revision or release."""
name = attr.ib(type=bytes)
email = attr.ib(type=bytes)
fullname = attr.ib(type=bytes)
object_type: Final = ModelObjectType.PERSON
fullname = attr.ib(type=bytes, validator=generic_type_validator)
name = attr.ib(type=Optional[bytes], validator=generic_type_validator, eq=False)
email = attr.ib(type=Optional[bytes], validator=generic_type_validator, eq=False)
@classmethod
def from_fullname(cls, fullname: bytes):
"""Returns a Person object, by guessing the name and email from the
fullname, in the `name <email>` format.
The fullname is left unchanged."""
if fullname is None:
raise TypeError("fullname is None.")
name: Optional[bytes]
email: Optional[bytes]
try:
open_bracket = fullname.index(b"<")
except ValueError:
name = fullname
email = None
else:
raw_name = fullname[:open_bracket]
raw_email = fullname[open_bracket + 1 :]
if not raw_name:
name = None
else:
name = raw_name.strip()
try:
close_bracket = raw_email.rindex(b">")
except ValueError:
email = raw_email
else:
email = raw_email[:close_bracket]
return Person(
name=name or None,
email=email or None,
fullname=fullname,
)
def anonymize(self) -> Person:
"""Returns an anonymized version of the Person object.
Anonymization is simply a Person which fullname is the hashed, with unset name
or email.
"""
return Person(
fullname=hashlib.sha256(self.fullname).digest(),
name=None,
email=None,
)
@classmethod
def from_dict(cls, d):
"""
If the fullname is missing, construct a fullname
using the following heuristics: if the name value is None, we return the
email in angle brackets, else, we return the name, a space, and the email
in angle brackets.
"""
if "fullname" not in d:
parts = []
if d["name"] is not None:
parts.append(d["name"])
if d["email"] is not None:
parts.append(b"".join([b"<", d["email"], b">"]))
fullname = b" ".join(parts)
d = {**d, "fullname": fullname}
d = {"name": None, "email": None, **d}
return super().from_dict(d)
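# A doctest-style sketch of the fullname heuristics (illustrative values):
#
#     >>> p = Person.from_fullname(b"Jane Doe <jane@example.org>")
#     >>> (p.name, p.email)
#     (b'Jane Doe', b'jane@example.org')
#     >>> Person.from_dict({"name": b"Jane Doe", "email": b"jane@example.org"}).fullname
#     b'Jane Doe <jane@example.org>'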
class TimestampOverflowException(ValueError):
"""Raised when trying to build :class:`Timestamp` from a timestamp too far in
the past or future"""
@attr.s(frozen=True)
pass
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class Timestamp(BaseModel):
"""Represents a naive timestamp from a VCS."""
object_type: Final = ModelObjectType.TIMESTAMP
seconds = attr.ib(type=int)
microseconds = attr.ib(type=int)
# maximum and minimum values allowed by datetime.datetime.fromtimestamp()
MIN_SECONDS = -62135510961 # 0001-01-02T00:00:00
MAX_SECONDS = 253402297199 # 9999-12-31T23:59:59
MIN_MICROSECONDS = 0
MAX_MICROSECONDS = 10**6 - 1
@seconds.validator
def check_seconds(self, attribute, value):
"""Check that seconds fit in a 64-bits signed integer."""
if not (-2**63 <= value < 2**63):
raise ValueError('Seconds must be a signed 64-bits integer.')
"""Check that ``seconds`` can be stored in all supported mediums
(PostgreSQL/Cassandra/ORC; PostgreSQL being the limiting factor)."""
if value.__class__ is not int:
raise AttributeTypeError(value, attribute)
# common good sense; less strict than the checks below
# if not (-(2**63) <= value < 2**63):
# raise TimestampOverflowException("Seconds must be a signed 64-bits integer.")
# values outside this range do not fit in Python's datetime, so we cannot
# write them to postgresql with psycopg2
if not (self.MIN_SECONDS <= value <= self.MAX_SECONDS):
raise TimestampOverflowException(
f"Seconds must be in [{self.MIN_SECONDS}, {self.MAX_SECONDS}]"
)
@microseconds.validator
def check_microseconds(self, attribute, value):
"""Checks that microseconds are positive and < 1000000."""
if not (0 <= value < 10**6):
raise ValueError('Microseconds must be in [0, 1000000[.')
if value.__class__ is not int:
raise AttributeTypeError(value, attribute)
if not (self.MIN_MICROSECONDS <= value <= self.MAX_MICROSECONDS):
            raise ValueError(
                f"Microseconds must be in [{self.MIN_MICROSECONDS}, {self.MAX_MICROSECONDS}]."
            )
@attr.s(frozen=True)
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class TimestampWithTimezone(BaseModel):
"""Represents a TZ-aware timestamp from a VCS."""
timestamp = attr.ib(type=Timestamp)
offset = attr.ib(type=int)
negative_utc = attr.ib(type=bool)
@offset.validator
def check_offset(self, attribute, value):
"""Checks the offset is a 16-bits signed integer (in theory, it
should always be between -14 and +14 hours)."""
if not (-2**15 <= value < 2**15):
# max 14 hours offset in theory, but you never know what
# you'll find in the wild...
raise ValueError('offset too large: %d minutes' % value)
object_type: Final = ModelObjectType.TIMESTAMP_WITH_TIMEZONE
timestamp = attr.ib(type=Timestamp, validator=generic_type_validator)
offset_bytes = attr.ib(type=bytes, validator=generic_type_validator)
"""Raw git representation of the timezone, as an offset from UTC.
It should follow this format: ``+HHMM`` or ``-HHMM`` (including ``+0000`` and
``-0000``).
However, when created from git objects, it must be the exact bytes used in the
original objects, so it may differ from this format when they do.
"""
@classmethod
def from_dict(cls, d):
def from_numeric_offset(
cls, timestamp: Timestamp, offset: int, negative_utc: bool
) -> TimestampWithTimezone:
"""Returns a :class:`TimestampWithTimezone` instance from the old dictionary
format (with ``offset`` and ``negative_utc`` instead of ``offset_bytes``).
"""
negative = offset < 0 or negative_utc
(hours, minutes) = divmod(abs(offset), 60)
offset_bytes = f"{'-' if negative else '+'}{hours:02}{minutes:02}".encode()
tstz = TimestampWithTimezone(timestamp=timestamp, offset_bytes=offset_bytes)
assert tstz.offset_minutes() == offset, (tstz.offset_minutes(), offset)
return tstz
@classmethod
def from_dict(
cls, time_representation: Union[Dict, datetime.datetime, int]
) -> TimestampWithTimezone:
"""Builds a TimestampWithTimezone from any of the formats
accepted by :func:`swh.model.normalize_timestamp`."""
d = normalize_timestamp(d)
return cls(
timestamp=Timestamp.from_dict(d['timestamp']),
offset=d['offset'],
negative_utc=d['negative_utc'])
        # TODO: this accepts way more types than just dicts; find a better
        # name
if isinstance(time_representation, dict):
ts = time_representation["timestamp"]
if isinstance(ts, dict):
seconds = ts.get("seconds", 0)
microseconds = ts.get("microseconds", 0)
elif isinstance(ts, int):
seconds = ts
microseconds = 0
else:
raise ValueError(
f"TimestampWithTimezone.from_dict received non-integer timestamp "
f"member {ts!r}"
)
timestamp = Timestamp(seconds=seconds, microseconds=microseconds)
if "offset_bytes" in time_representation:
return cls(
timestamp=timestamp,
offset_bytes=time_representation["offset_bytes"],
)
else:
# old format
offset = time_representation["offset"]
negative_utc = time_representation.get("negative_utc") or False
return cls.from_numeric_offset(timestamp, offset, negative_utc)
elif isinstance(time_representation, datetime.datetime):
# TODO: warn when using from_dict() on a datetime
utcoffset = time_representation.utcoffset()
time_representation = time_representation.astimezone(datetime.timezone.utc)
microseconds = time_representation.microsecond
if microseconds:
time_representation = time_representation.replace(microsecond=0)
seconds = int(time_representation.timestamp())
if utcoffset is None:
raise ValueError(
f"TimestampWithTimezone.from_dict received datetime without "
f"timezone: {time_representation}"
)
# utcoffset is an integer number of minutes
seconds_offset = utcoffset.total_seconds()
offset = int(seconds_offset) // 60
# TODO: warn if remainder is not zero
return cls.from_numeric_offset(
Timestamp(seconds=seconds, microseconds=microseconds), offset, False
)
elif isinstance(time_representation, int):
# TODO: warn when using from_dict() on an int
seconds = time_representation
timestamp = Timestamp(seconds=time_representation, microseconds=0)
return cls(timestamp=timestamp, offset_bytes=b"+0000")
else:
raise ValueError(
f"TimestampWithTimezone.from_dict received non-integer timestamp: "
f"{time_representation!r}"
)
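    # A doctest-style sketch of the accepted input formats (illustrative
    # values):
    #
    #     >>> TimestampWithTimezone.from_dict(1640995200).offset_bytes
    #     b'+0000'
    #     >>> TimestampWithTimezone.from_dict(
    #     ...     {"timestamp": {"seconds": 1640995200}, "offset": 120,
    #     ...      "negative_utc": False}
    #     ... ).offset_bytes
    #     b'+0200'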
@classmethod
def from_datetime(cls, dt: datetime.datetime) -> TimestampWithTimezone:
return cls.from_dict(dt)
def to_datetime(self) -> datetime.datetime:
"""Convert to a datetime (with a timezone set to the recorded fixed UTC offset)
Beware that this conversion can be lossy: ``-0000`` and 'weird' offsets
cannot be represented. Also note that it may fail due to type overflow.
"""
td = datetime.timedelta(minutes=self.offset_minutes())
try:
tz = datetime.timezone(td)
except ValueError:
# Larger or smaller than 24h, so it's bogus. self.timestamp.seconds is
# a number of seconds since Epoch, so it's safe to ignore the timezone
# and replace it with any other one. We arbitrarily pick UTC.
tz = datetime.timezone.utc
timestamp = datetime.datetime.fromtimestamp(self.timestamp.seconds, tz)
timestamp = timestamp.replace(microsecond=self.timestamp.microseconds)
return timestamp
@classmethod
def from_iso8601(cls, s):
"""Builds a TimestampWithTimezone from an ISO8601-formatted string."""
dt = iso8601.parse_date(s)
tstz = cls.from_datetime(dt)
if dt.tzname() == "-00:00":
assert tstz.offset_bytes == b"+0000"
tstz = attr.evolve(tstz, offset_bytes=b"-0000")
return tstz
@staticmethod
def _parse_offset_bytes(offset_bytes: bytes) -> int:
"""Parses an ``offset_bytes`` value (in Git's ``[+-]HHMM`` format),
and returns the corresponding numeric values (in number of minutes).
Tries to account for some mistakes in the format, to support incorrect
Git implementations.
>>> TimestampWithTimezone._parse_offset_bytes(b"+0000")
0
>>> TimestampWithTimezone._parse_offset_bytes(b"-0000")
0
>>> TimestampWithTimezone._parse_offset_bytes(b"+0200")
120
>>> TimestampWithTimezone._parse_offset_bytes(b"-0200")
-120
>>> TimestampWithTimezone._parse_offset_bytes(b"+200")
120
>>> TimestampWithTimezone._parse_offset_bytes(b"-200")
-120
>>> TimestampWithTimezone._parse_offset_bytes(b"+02")
120
>>> TimestampWithTimezone._parse_offset_bytes(b"-02")
-120
>>> TimestampWithTimezone._parse_offset_bytes(b"+0010")
10
>>> TimestampWithTimezone._parse_offset_bytes(b"-0010")
-10
>>> TimestampWithTimezone._parse_offset_bytes(b"+200000000000000000")
0
>>> TimestampWithTimezone._parse_offset_bytes(b"+0160") # 60 minutes...
0
"""
offset_str = offset_bytes.decode()
assert offset_str[0] in "+-"
sign = int(offset_str[0] + "1")
if len(offset_str) <= 3:
hours = int(offset_str[1:])
minutes = 0
else:
hours = int(offset_str[1:-2])
minutes = int(offset_str[-2:])
        offset = sign * (hours * 60 + minutes)
        if (0 <= minutes <= 59) and (-(2**15) <= offset < 2**15):
            return offset
        else:
            # can't parse it to a reasonable value; give up and pretend it's UTC.
            return 0

    def offset_minutes(self):
        """Returns the offset, as a number of minutes since UTC.
        >>> TimestampWithTimezone(
        ...     Timestamp(seconds=1642765364, microseconds=0), offset_bytes=b"+0000"
        ... ).offset_minutes()
        0
        >>> TimestampWithTimezone(
        ...     Timestamp(seconds=1642765364, microseconds=0), offset_bytes=b"+0200"
        ... ).offset_minutes()
        120
        >>> TimestampWithTimezone(
        ...     Timestamp(seconds=1642765364, microseconds=0), offset_bytes=b"-0200"
        ... ).offset_minutes()
        -120
        >>> TimestampWithTimezone(
        ...     Timestamp(seconds=1642765364, microseconds=0), offset_bytes=b"+0530"
        ... ).offset_minutes()
        330
        """
        return self._parse_offset_bytes(self.offset_bytes)

@attr.s(frozen=True)
class Origin(BaseModel):
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class Origin(BaseHashableModel):
"""Represents a software source: a VCS and an URL."""
url = attr.ib(type=str)
type = attr.ib(type=Optional[str], default=None)
def to_dict(self):
r = super().to_dict()
r.pop('type', None)
return r
object_type: Final = ModelObjectType.ORIGIN
url = attr.ib(type=str, validator=generic_type_validator)
id = attr.ib(type=Sha1Git, validator=generic_type_validator, default=b"")
def unique_key(self) -> KeyType:
return {"url": self.url}
@attr.s(frozen=True)
def _compute_hash_from_attributes(self) -> bytes:
return _compute_hash_from_manifest(self.url.encode("utf-8"))
def swhid(self) -> ExtendedSWHID:
"""Returns a SWHID representing this origin."""
return ExtendedSWHID(
object_type=SwhidExtendedObjectType.ORIGIN,
object_id=self.id,
)
@url.validator
def check_url(self, attribute, value):
if len(value.encode()) >= 2048:
# Rationale for this value:
# 1. Needs to be stored in a postgresql btree, which is limited to
# somewhere around 2700 bytes
# 2. URLs longer than 2048 characters won't work very well in browsers,
# and repository URLs are often meant to at least display something
# when opened in a browser. https://stackoverflow.com/a/417184/539465
# 3. Even though this field is actually an IRI, it is usually in ASCII
# so this should be a good-enough approximation
raise ValueError("Origin URL is too long")
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class OriginVisit(BaseModel):
"""Represents a visit of an origin at a given point in time, by a
"""Represents an origin visit with a given type at a given point in time, by a
SWH loader."""
origin = attr.ib(type=str)
object_type: Final = ModelObjectType.ORIGIN_VISIT
origin = attr.ib(type=str, validator=generic_type_validator)
date = attr.ib(type=datetime.datetime)
status = attr.ib(
type=str,
validator=attr.validators.in_(['ongoing', 'full', 'partial']))
type = attr.ib(type=str)
snapshot = attr.ib(type=Optional[Sha1Git])
metadata = attr.ib(type=Optional[Dict[str, object]],
default=None)
visit = attr.ib(type=Optional[int],
default=None)
type = attr.ib(type=str, validator=generic_type_validator)
"""Should not be set before calling 'origin_visit_add()'."""
visit = attr.ib(type=Optional[int], validator=generic_type_validator, default=None)
@date.validator
def check_date(self, attribute, value):
"""Checks the date has a timezone."""
if value.__class__ is not datetime.datetime:
raise AttributeTypeError(value, attribute)
if value is not None and value.tzinfo is None:
raise ValueError("date must be a timezone-aware datetime.")
def to_dict(self):
"""Serializes the date as a string and omits the visit id if it is
`None`."""
ov = super().to_dict()
if ov['visit'] is None:
del ov['visit']
if ov["visit"] is None:
del ov["visit"]
return ov
@classmethod
def from_dict(cls, d):
"""Parses the date from a string, and accepts missing visit ids."""
d = d.copy()
date = d.pop('date')
return cls(
date=(date
if isinstance(date, datetime.datetime)
else dateutil.parser.parse(date)),
**d)
def unique_key(self) -> KeyType:
return {"origin": self.origin, "date": str(self.date)}
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class OriginVisitStatus(BaseModel):
"""Represents a visit update of an origin at a given point in time."""
object_type: Final = ModelObjectType.ORIGIN_VISIT_STATUS
origin = attr.ib(type=str, validator=generic_type_validator)
visit = attr.ib(type=int, validator=generic_type_validator)
date = attr.ib(type=datetime.datetime)
status = attr.ib(
type=str,
validator=attr.validators.in_(
["created", "ongoing", "full", "partial", "not_found", "failed"]
),
)
snapshot = attr.ib(
type=Optional[Sha1Git], validator=generic_type_validator, repr=hash_repr
)
    # Type is optional to be able to use it before adding it to the database model
type = attr.ib(type=Optional[str], validator=generic_type_validator, default=None)
metadata = attr.ib(
type=Optional[ImmutableDict[str, object]],
validator=generic_type_validator,
converter=freeze_optional_dict,
default=None,
)
@date.validator
def check_date(self, attribute, value):
"""Checks the date has a timezone."""
if value.__class__ is not datetime.datetime:
raise AttributeTypeError(value, attribute)
if value is not None and value.tzinfo is None:
raise ValueError("date must be a timezone-aware datetime.")
def unique_key(self) -> KeyType:
return {"origin": self.origin, "visit": str(self.visit), "date": str(self.date)}
def origin_swhid(self) -> ExtendedSWHID:
return Origin(url=self.origin).swhid()
def snapshot_swhid(self) -> Optional[CoreSWHID]:
if self.snapshot is None:
return None
return CoreSWHID(object_type=SwhidObjectType.SNAPSHOT, object_id=self.snapshot)
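# A doctest-style sketch of the derived SWHIDs (the zero hash is only a
# placeholder):
#
#     >>> from datetime import datetime, timezone
#     >>> ovs = OriginVisitStatus(origin="https://example.org/repo", visit=1,
#     ...     date=datetime(2020, 1, 1, tzinfo=timezone.utc), status="full",
#     ...     snapshot=bytes(20))
#     >>> str(ovs.snapshot_swhid())
#     'swh:1:snp:0000000000000000000000000000000000000000'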
class TargetType(Enum):
class SnapshotTargetType(Enum):
"""The type of content pointed to by a snapshot branch. Usually a
revision or an alias."""
CONTENT = 'content'
DIRECTORY = 'directory'
REVISION = 'revision'
RELEASE = 'release'
SNAPSHOT = 'snapshot'
ALIAS = 'alias'
CONTENT = "content"
DIRECTORY = "directory"
REVISION = "revision"
RELEASE = "release"
SNAPSHOT = "snapshot"
ALIAS = "alias"
def __repr__(self):
return f"SnapshotTargetType.{self.name}"
# Remove this compatibility trick once all users have been migrated.
#
# We cannot use @deprecated as this would modify SnapshotTargetType directly
TargetType = SnapshotTargetType
class ObjectType(Enum):
class ReleaseTargetType(Enum):
"""The type of content pointed to by a release. Usually a revision"""
CONTENT = 'content'
DIRECTORY = 'directory'
REVISION = 'revision'
RELEASE = 'release'
SNAPSHOT = 'snapshot'
CONTENT = "content"
DIRECTORY = "directory"
REVISION = "revision"
RELEASE = "release"
SNAPSHOT = "snapshot"
@attr.s(frozen=True)
def __repr__(self):
return f"ReleaseTargetType.{self.name}"
# Remove this compatibility trick once all users have been migrated.
#
# We cannot use @deprecated as this would modify ReleaseTargetType directly
ObjectType = ReleaseTargetType
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class SnapshotBranch(BaseModel):
"""Represents one of the branches of a snapshot."""
target = attr.ib(type=bytes)
target_type = attr.ib(type=TargetType)
object_type: Final = ModelObjectType.SNAPSHOT_BRANCH
target = attr.ib(type=bytes, repr=hash_repr)
target_type = attr.ib(type=SnapshotTargetType, validator=generic_type_validator)
@target.validator
def check_target(self, attribute, value):
"""Checks the target type is not an alias, checks the target is a
valid sha1_git."""
if self.target_type != TargetType.ALIAS and self.target is not None:
if value.__class__ is not bytes:
raise AttributeTypeError(value, attribute)
if self.target_type != SnapshotTargetType.ALIAS and self.target is not None:
if len(value) != 20:
raise ValueError('Wrong length for bytes identifier: %d' %
len(value))
raise ValueError("Wrong length for bytes identifier: %d" % len(value))
@classmethod
def from_dict(cls, d):
return cls(
target=d['target'],
target_type=TargetType(d['target_type']))
return cls(target=d["target"], target_type=SnapshotTargetType(d["target_type"]))
def swhid(self) -> Optional[CoreSWHID]:
"""Returns a SWHID for the current branch or None if the branch has no
target or is an alias."""
if self.target is None or self.target_type == SnapshotTargetType.ALIAS:
return None
return CoreSWHID(
object_id=self.target, object_type=SwhidObjectType[self.target_type.name]
)
@attr.s(frozen=True)
class Snapshot(BaseModel, HashableObject):
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class Snapshot(BaseHashableModel):
"""Represents the full state of an origin at a given point in time."""
branches = attr.ib(type=Dict[bytes, Optional[SnapshotBranch]])
id = attr.ib(type=Sha1Git, default=b'')
@staticmethod
def compute_hash(object_dict):
return snapshot_identifier(object_dict)
object_type: Final = ModelObjectType.SNAPSHOT
branches = attr.ib(
type=ImmutableDict[bytes, Optional[SnapshotBranch]],
validator=generic_type_validator,
converter=freeze_optional_dict,
)
id = attr.ib(
type=Sha1Git, validator=generic_type_validator, default=b"", repr=hash_repr
)
def _compute_hash_from_attributes(self) -> bytes:
return _compute_hash_from_manifest(
git_objects.snapshot_git_object(self, ignore_unresolved=True)
)
@classmethod
def from_dict(cls, d):
d = d.copy()
return cls(
branches={
name: SnapshotBranch.from_dict(branch) if branch else None
for (name, branch) in d.pop('branches').items()
},
**d)
@attr.s(frozen=True)
class Release(BaseModel, HashableObject):
name = attr.ib(type=bytes)
message = attr.ib(type=bytes)
target = attr.ib(type=Optional[Sha1Git])
target_type = attr.ib(type=ObjectType)
synthetic = attr.ib(type=bool)
author = attr.ib(type=Optional[Person],
default=None)
date = attr.ib(type=Optional[TimestampWithTimezone],
default=None)
metadata = attr.ib(type=Optional[Dict[str, object]],
default=None)
id = attr.ib(type=Sha1Git, default=b'')
@staticmethod
def compute_hash(object_dict):
return release_identifier(object_dict)
branches=ImmutableDict(
(name, SnapshotBranch.from_dict(branch) if branch else None)
for (name, branch) in d.pop("branches").items()
),
**d,
)
def swhid(self) -> CoreSWHID:
"""Returns a SWHID representing this object."""
return CoreSWHID(object_type=SwhidObjectType.SNAPSHOT, object_id=self.id)
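# A doctest-style sketch: even an empty snapshot gets a 20-byte intrinsic id,
# computed from its git manifest:
#
#     >>> snap = Snapshot(branches={})
#     >>> len(snap.id)
#     20
#     >>> snap.swhid().object_type.name
#     'SNAPSHOT'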
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class Release(HashableObjectWithManifest, BaseModel):
object_type: Final = ModelObjectType.RELEASE
name = attr.ib(type=bytes, validator=generic_type_validator)
message = attr.ib(type=Optional[bytes], validator=generic_type_validator)
target = attr.ib(
type=Optional[Sha1Git], validator=generic_type_validator, repr=hash_repr
)
target_type = attr.ib(type=ReleaseTargetType, validator=generic_type_validator)
synthetic = attr.ib(type=bool, validator=generic_type_validator)
author = attr.ib(
type=Optional[Person], validator=generic_type_validator, default=None
)
date = attr.ib(
type=Optional[TimestampWithTimezone],
validator=generic_type_validator,
default=None,
)
metadata = attr.ib(
type=Optional[ImmutableDict[str, object]],
validator=generic_type_validator,
converter=freeze_optional_dict,
default=None,
)
id = attr.ib(
type=Sha1Git, validator=generic_type_validator, default=b"", repr=hash_repr
)
raw_manifest = attr.ib(type=Optional[bytes], default=None)
def _compute_hash_from_attributes(self) -> bytes:
return _compute_hash_from_manifest(git_objects.release_git_object(self))
@author.validator
def check_author(self, attribute, value):
"""If the author is `None`, checks the date is `None` too."""
if self.author is None and self.date is not None:
raise ValueError('release date must be None if author is None.')
raise ValueError("release date must be None if author is None.")
def to_dict(self):
rel = super().to_dict()
if rel['metadata'] is None:
del rel['metadata']
if rel["metadata"] is None:
del rel["metadata"]
return rel
@classmethod
def from_dict(cls, d):
d = d.copy()
if d.get('author'):
d['author'] = Person.from_dict(d['author'])
if d.get('date'):
d['date'] = TimestampWithTimezone.from_dict(d['date'])
return cls(
target_type=ObjectType(d.pop('target_type')),
**d)
if d.get("author"):
d["author"] = Person.from_dict(d["author"])
if d.get("date"):
d["date"] = TimestampWithTimezone.from_dict(d["date"])
return cls(target_type=ReleaseTargetType(d.pop("target_type")), **d)
def swhid(self) -> CoreSWHID:
"""Returns a SWHID representing this object."""
return CoreSWHID(object_type=SwhidObjectType.RELEASE, object_id=self.id)
def target_swhid(self) -> Optional[CoreSWHID]:
"""Returns the SWHID for the target of this release or None if unset."""
if self.target is None:
return None
return CoreSWHID(
object_id=self.target, object_type=SwhidObjectType[self.target_type.name]
)
def anonymize(self) -> Release:
"""Returns an anonymized version of the Release object.
Anonymization consists in replacing the author with an anonymized Person object.
"""
author = self.author and self.author.anonymize()
return attr.evolve(self, author=author)
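# A doctest-style sketch of the anonymization invariant (illustrative values):
#
#     >>> rel = Release(name=b"v1.0", message=b"msg", target=bytes(20),
#     ...               target_type=ReleaseTargetType.REVISION, synthetic=False,
#     ...               author=Person.from_fullname(b"Jane <j@example.org>"))
#     >>> anon = rel.anonymize()
#     >>> anon.author.fullname == hashlib.sha256(b"Jane <j@example.org>").digest()
#     True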
class RevisionType(Enum):
GIT = 'git'
TAR = 'tar'
DSC = 'dsc'
SUBVERSION = 'svn'
MERCURIAL = 'hg'
@attr.s(frozen=True)
class Revision(BaseModel, HashableObject):
message = attr.ib(type=bytes)
author = attr.ib(type=Person)
committer = attr.ib(type=Person)
date = attr.ib(type=Optional[TimestampWithTimezone])
committer_date = attr.ib(type=Optional[TimestampWithTimezone])
type = attr.ib(type=RevisionType)
directory = attr.ib(type=Sha1Git)
synthetic = attr.ib(type=bool)
metadata = attr.ib(type=Optional[Dict[str, object]],
default=None)
parents = attr.ib(type=List[Sha1Git],
default=attr.Factory(list))
id = attr.ib(type=Sha1Git, default=b'')
GIT = "git"
TAR = "tar"
DSC = "dsc"
SUBVERSION = "svn"
MERCURIAL = "hg"
CVS = "cvs"
BAZAAR = "bzr"
def __repr__(self):
return f"RevisionType.{self.name}"
def tuplify_extra_headers(value: Iterable):
return tuple((k, v) for k, v in value)
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class Revision(HashableObjectWithManifest, BaseModel):
object_type: Final = ModelObjectType.REVISION
message = attr.ib(type=Optional[bytes], validator=generic_type_validator)
author = attr.ib(type=Optional[Person], validator=generic_type_validator)
committer = attr.ib(type=Optional[Person], validator=generic_type_validator)
date = attr.ib(
type=Optional[TimestampWithTimezone], validator=generic_type_validator
)
committer_date = attr.ib(
type=Optional[TimestampWithTimezone], validator=generic_type_validator
)
type = attr.ib(type=RevisionType, validator=generic_type_validator)
directory = attr.ib(type=Sha1Git, validator=generic_type_validator, repr=hash_repr)
synthetic = attr.ib(type=bool, validator=generic_type_validator)
metadata = attr.ib(
type=Optional[ImmutableDict[str, object]],
validator=generic_type_validator,
converter=freeze_optional_dict,
default=None,
)
parents = attr.ib(
type=Tuple[Sha1Git, ...],
validator=generic_type_validator,
default=(),
repr=parents_repr,
)
id = attr.ib(
type=Sha1Git, validator=generic_type_validator, default=b"", repr=hash_repr
)
extra_headers = attr.ib(
type=Tuple[Tuple[bytes, bytes], ...],
validator=generic_type_validator,
converter=tuplify_extra_headers,
default=(),
)
raw_manifest = attr.ib(type=Optional[bytes], default=None)
@staticmethod
def compute_hash(object_dict):
return revision_identifier(object_dict)
def __attrs_post_init__(self):
super().__attrs_post_init__()
# ensure metadata is a deep copy of whatever was given, and if needed
# extract extra_headers from there
if self.metadata:
metadata = self.metadata
if not self.extra_headers and "extra_headers" in metadata:
(extra_headers, metadata) = metadata.copy_pop("extra_headers")
object.__setattr__(
self,
"extra_headers",
tuplify_extra_headers(extra_headers),
)
attr.validate(self)
object.__setattr__(self, "metadata", metadata)
def _compute_hash_from_attributes(self) -> bytes:
return _compute_hash_from_manifest(git_objects.revision_git_object(self))
@author.validator
def check_author(self, attribute, value):
"""If the author is `None`, checks the date is `None` too."""
if self.author is None and self.date is not None:
raise ValueError("revision date must be None if author is None.")
@committer.validator
def check_committer(self, attribute, value):
"""If the committer is `None`, checks the committer_date is `None` too."""
if self.committer is None and self.committer_date is not None:
raise ValueError(
"revision committer_date must be None if committer is None."
)
@classmethod
def from_dict(cls, d):
d = d.copy()
date = d.pop('date')
date = d.pop("date")
if date:
date = TimestampWithTimezone.from_dict(date)
committer_date = d.pop('committer_date')
committer_date = d.pop("committer_date")
if committer_date:
committer_date = TimestampWithTimezone.from_dict(
committer_date)
committer_date = TimestampWithTimezone.from_dict(committer_date)
author = d.pop("author")
if author:
author = Person.from_dict(author)
committer = d.pop("committer")
if committer:
committer = Person.from_dict(committer)
return cls(
author=Person.from_dict(d.pop('author')),
committer=Person.from_dict(d.pop('committer')),
author=author,
committer=committer,
date=date,
committer_date=committer_date,
type=RevisionType(d.pop('type')),
**d)
type=RevisionType(d.pop("type")),
parents=tuple(d.pop("parents")), # for BW compat
**d,
)
def swhid(self) -> CoreSWHID:
"""Returns a SWHID representing this object."""
return CoreSWHID(object_type=SwhidObjectType.REVISION, object_id=self.id)
@attr.s(frozen=True)
def directory_swhid(self) -> CoreSWHID:
"""Returns the SWHID for the directory referenced by the revision."""
return CoreSWHID(
object_type=SwhidObjectType.DIRECTORY, object_id=self.directory
)
def parent_swhids(self) -> List[CoreSWHID]:
"""Returns a list of SWHID for the parent revisions."""
return [
CoreSWHID(object_type=SwhidObjectType.REVISION, object_id=parent)
for parent in self.parents
]
def anonymize(self) -> Revision:
"""Returns an anonymized version of the Revision object.
Anonymization consists in replacing the author and committer with an anonymized
Person object.
"""
return attr.evolve(
self,
author=None if self.author is None else self.author.anonymize(),
committer=None if self.committer is None else self.committer.anonymize(),
)
_DIR_ENTRY_TYPES = ["file", "dir", "rev"]
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class DirectoryEntry(BaseModel):
object_type: Final = ModelObjectType.DIRECTORY_ENTRY
name = attr.ib(type=bytes)
type = attr.ib(type=str,
validator=attr.validators.in_(['file', 'dir', 'rev']))
target = attr.ib(type=Sha1Git)
perms = attr.ib(type=int)
type = attr.ib(type=str, validator=attr.validators.in_(_DIR_ENTRY_TYPES))
target = attr.ib(type=Sha1Git, validator=generic_type_validator, repr=hash_repr)
perms = attr.ib(type=int, validator=generic_type_validator, converter=int, repr=oct)
"""Usually one of the values of `swh.model.from_disk.DentryPerms`."""
@attr.s(frozen=True)
class Directory(BaseModel, HashableObject):
entries = attr.ib(type=List[DirectoryEntry])
id = attr.ib(type=Sha1Git, default=b'')
@staticmethod
def compute_hash(object_dict):
return directory_identifier(object_dict)
DIR_ENTRY_TYPE_TO_SWHID_OBJECT_TYPE = {
"file": SwhidObjectType.CONTENT,
"dir": SwhidObjectType.DIRECTORY,
"rev": SwhidObjectType.REVISION,
}
@name.validator
def check_name(self, attribute, value):
if value.__class__ is not bytes:
raise AttributeTypeError(value, attribute)
if b"/" in value:
raise ValueError(f"{value!r} is not a valid directory entry name.")
def swhid(self) -> CoreSWHID:
"""Returns a SWHID for this directory entry"""
return CoreSWHID(
object_type=DirectoryEntry.DIR_ENTRY_TYPE_TO_SWHID_OBJECT_TYPE[self.type],
object_id=self.target,
)
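# A doctest-style sketch of per-entry SWHIDs (the zero hash is only a
# placeholder):
#
#     >>> entry = DirectoryEntry(name=b"src", type="dir", target=bytes(20),
#     ...                        perms=0o040000)
#     >>> str(entry.swhid())
#     'swh:1:dir:0000000000000000000000000000000000000000'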
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class Directory(HashableObjectWithManifest, BaseModel):
object_type: Final = ModelObjectType.DIRECTORY
entries = attr.ib(type=Tuple[DirectoryEntry, ...], validator=generic_type_validator)
id = attr.ib(
type=Sha1Git, validator=generic_type_validator, default=b"", repr=hash_repr
)
raw_manifest = attr.ib(type=Optional[bytes], default=None)
def _compute_hash_from_attributes(self) -> bytes:
return _compute_hash_from_manifest(git_objects.directory_git_object(self))
@entries.validator
def check_entries(self, attribute, value):
seen = set()
for entry in value:
if entry.name in seen:
# Cannot use self.swhid() here, self.id may be None
raise ValueError(
f"swh:1:dir:{hash_to_hex(self.id)} has duplicated entry name: "
f"{entry.name!r}"
)
seen.add(entry.name)
@classmethod
def from_dict(cls, d):
d = d.copy()
return cls(
entries=[DirectoryEntry.from_dict(entry)
for entry in d.pop('entries')],
**d)
entries=tuple(
DirectoryEntry.from_dict(entry) for entry in d.pop("entries")
),
**d,
)
def swhid(self) -> CoreSWHID:
"""Returns a SWHID representing this object."""
return CoreSWHID(object_type=SwhidObjectType.DIRECTORY, object_id=self.id)
@attr.s(frozen=True)
class BaseContent(BaseModel):
def to_dict(self):
content = super().to_dict()
if content['ctime'] is None:
del content['ctime']
return content
@classmethod
def from_possibly_duplicated_entries(
cls,
*,
entries: Tuple[DirectoryEntry, ...],
id: Sha1Git = b"",
raw_manifest: Optional[bytes] = None,
) -> Tuple[bool, "Directory"]:
"""Constructs a ``Directory`` object from a list of entries that may contain
duplicated names.
This is required to represent legacy objects, that were ingested in the
storage database before this check was added.
As it is impossible for a ``Directory`` instances to have more than one entry
with a given names, this function computes a ``raw_manifest`` and renames one of
the entries before constructing the ``Directory``.
Returns:
``(is_corrupt, directory)`` where ``is_corrupt`` is True iff some
entry names were indeed duplicated
"""
# First, try building a Directory object normally without any extra computation,
# which works the overwhelming majority of the time:
try:
return (False, Directory(entries=entries, id=id, raw_manifest=raw_manifest))
except ValueError:
pass
# If it fails:
# 1. compute a raw_manifest if there isn't already one:
if raw_manifest is None:
# invalid_directory behaves like a Directory object, but skips the
# duplicated-entry check; this allows computing its raw_manifest
invalid_directory = type("", (), {})()
invalid_directory.entries = entries
raw_manifest = git_objects.directory_git_object(invalid_directory)
# 2. look for duplicated entries:
entries_by_name: Dict[bytes, Dict[str, List[DirectoryEntry]]] = (
collections.defaultdict(lambda: collections.defaultdict(list))
)
for entry in entries:
entries_by_name[entry.name][entry.type].append(entry)
# 3. strip duplicates
deduplicated_entries = []
for entry_lists in entries_by_name.values():
# We could pick one entry at random to keep the original name; but we try to
# "minimize" the impact, by preserving entries of type "rev" first
# (because renaming them would likely break git submodules entirely
# when this directory is written to disk),
# then entries of type "dir" (because renaming them affects the path
# of every file in the dir, instead of just one "cnt").
dir_entry_types = ("rev", "dir", "file")
assert set(dir_entry_types) == set(_DIR_ENTRY_TYPES)
picked_winner = False # when True, all future entries must be renamed
for type_ in dir_entry_types:
for entry in entry_lists[type_]:
if not picked_winner:
# this is the "most important" entry according to this
# heuristic; it gets to keep its name.
deduplicated_entries.append(entry)
picked_winner = True
else:
# the heuristic already found an entry more important than
# this one; so this one must be renamed to something.
# we pick the beginning of its hash, it should be good enough
# to avoid any conflict.
new_name = (
entry.name + b"_" + hash_to_bytehex(entry.target)[0:10]
)
renamed_entry = attr.evolve(entry, name=new_name)
deduplicated_entries.append(renamed_entry)
# Finally, return the "fixed" directory
dir_ = Directory(
entries=tuple(deduplicated_entries), id=id, raw_manifest=raw_manifest
)
return (True, dir_)
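# A hedged usage sketch of the heuristic above, with placeholder hashes: the
# "dir" entry wins and keeps its name, while the "file" entry is renamed with
# a prefix of its target hash.
#
#     >>> a = DirectoryEntry(name=b"x", type="dir", target=b"\x01" * 20, perms=0o040000)
#     >>> b = DirectoryEntry(name=b"x", type="file", target=b"\x02" * 20, perms=0o100644)
#     >>> is_corrupt, fixed = Directory.from_possibly_duplicated_entries(entries=(a, b))
#     >>> is_corrupt
#     True
#     >>> sorted(e.name for e in fixed.entries)
#     [b'x', b'x_0202020202']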
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class BaseContent(BaseModel, ABC):
status = attr.ib(
type=str, validator=attr.validators.in_(["visible", "hidden", "absent"])
)
@staticmethod
def _hash_data(data: bytes):
"""Hash some data, returning most of the fields of a content object"""
d = MultiHash.from_data(data).digest()
d["data"] = data
d["length"] = len(data)
return d
@classmethod
def from_dict(cls, d, use_subclass=True):
if use_subclass:
# Chooses a subclass to instantiate instead.
if d["status"] == "absent":
return SkippedContent.from_dict(d)
else:
return Content.from_dict(d)
def get_hash(self, hash_name):
if hash_name not in DEFAULT_ALGORITHMS:
raise ValueError("{} is not a valid hash name.".format(hash_name))
return getattr(self, hash_name)
def hashes(self) -> Dict[str, bytes]:
"""Returns a dictionary {hash_name: hash_value}"""
return {algo: getattr(self, algo) for algo in DEFAULT_ALGORITHMS}
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class Content(BaseContent):
object_type: Final = ModelObjectType.CONTENT
sha1 = attr.ib(type=bytes, validator=generic_type_validator, repr=hash_repr)
sha1_git = attr.ib(type=Sha1Git, validator=generic_type_validator, repr=hash_repr)
sha256 = attr.ib(type=bytes, validator=generic_type_validator, repr=hash_repr)
blake2s256 = attr.ib(type=bytes, validator=generic_type_validator, repr=hash_repr)
length = attr.ib(type=int)
status = attr.ib(
type=str,
validator=attr.validators.in_(["visible", "hidden"]),
default="visible",
)
data = attr.ib(type=Optional[bytes], validator=generic_type_validator, default=None)
get_data = attr.ib(
type=Optional[Callable[[], bytes]],
default=None,
cmp=False,
)
ctime = attr.ib(
type=Optional[datetime.datetime],
default=None,
eq=False,
)
@length.validator
def check_length(self, attribute, value):
"""Checks the length is positive."""
if value.__class__ is not int:
raise AttributeTypeError(value, attribute)
if value < 0:
raise ValueError("Length must be positive.")
@ctime.validator
def check_ctime(self, attribute, value):
"""Checks the ctime has a timezone."""
if value is not None:
if value.__class__ is not datetime.datetime:
raise AttributeTypeError(value, attribute)
if value.tzinfo is None:
raise ValueError("ctime must be a timezone-aware datetime.")
def to_dict(self):
content = super(Content, self.with_data(raise_if_missing=False)).to_dict()
for k in ("get_data", "data", "ctime"):
if content[k] is None:
del content[k]
return content
@classmethod
def from_data(cls, data, status="visible", ctime=None) -> Content:
"""Generate a Content from a given `data` byte string.
This populates the Content with the hashes and length for the data
passed as argument, as well as the data itself.
"""
d = cls._hash_data(data)
d["status"] = status
d["ctime"] = ctime
return cls(**d)
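# Illustrative doctest for the constructor above: all DEFAULT_ALGORITHMS
# digests and the length are derived from the payload.
#
#     >>> c = Content.from_data(b"hello")
#     >>> c.length
#     5
#     >>> c.sha1.hex()
#     'aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d'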
@classmethod
def from_dict(cls, d):
if isinstance(d.get("ctime"), str):
d = d.copy()
d["ctime"] = dateutil.parser.parse(d["ctime"])
return super().from_dict(d, use_subclass=False)
def with_data(self, raise_if_missing: bool = True) -> Content:
"""Loads the ``data`` attribute if ``get_data`` is not :const:`None`.
This call is almost a no-op, but subclasses may overload this method
to lazy-load data (e.g. from disk or objstorage).
Args:
raise_if_missing: if :const:`True` (default), raise :class:`MissingData`
exception if no data is attached to content object
"""
if self.data is not None:
return self
new_data = None
if self.get_data is not None:
new_data = self.get_data()
if new_data is None and raise_if_missing:
raise MissingData("Content data and get_data are both None.")
return attr.evolve(self, data=new_data, get_data=None)
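# A minimal sketch of the lazy-loading contract above: a Content whose payload
# was dropped but which kept a ``get_data`` callback materializes its data on
# demand.
#
#     >>> lazy = attr.evolve(Content.from_data(b"hi"), data=None, get_data=lambda: b"hi")
#     >>> lazy.data is None
#     True
#     >>> lazy.with_data().data
#     b'hi'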
def unique_key(self) -> KeyType:
return self.sha1 # TODO: use a dict of hashes
def swhid(self) -> CoreSWHID:
"""Returns a SWHID representing this object."""
return CoreSWHID(object_type=SwhidObjectType.CONTENT, object_id=self.sha1_git)
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class SkippedContent(BaseContent):
object_type: Final = ModelObjectType.SKIPPED_CONTENT
sha1 = attr.ib(
type=Optional[bytes], validator=generic_type_validator, repr=hash_repr
)
sha1_git = attr.ib(
type=Optional[Sha1Git], validator=generic_type_validator, repr=hash_repr
)
sha256 = attr.ib(
type=Optional[bytes], validator=generic_type_validator, repr=hash_repr
)
blake2s256 = attr.ib(
type=Optional[bytes], validator=generic_type_validator, repr=hash_repr
)
length = attr.ib(type=Optional[int])
status = attr.ib(type=str, validator=attr.validators.in_(["absent"]))
reason = attr.ib(type=Optional[str], default=None)
origin = attr.ib(type=Optional[str], validator=generic_type_validator, default=None)
ctime = attr.ib(
type=Optional[datetime.datetime],
validator=generic_type_validator,
default=None,
eq=False,
)
@reason.validator
def check_reason(self, attribute, value):
"""Checks the reason is full if status != absent."""
assert self.reason == value
if value is None:
raise ValueError("Must provide a reason if content is absent.")
elif value.__class__ is not str:
raise AttributeTypeError(value, attribute)
@length.validator
def check_length(self, attribute, value):
"""Checks the length is positive or -1."""
if value.__class__ is not int:
raise AttributeTypeError(value, attribute)
elif value < -1:
raise ValueError("Length must be positive or -1.")
@ctime.validator
def check_ctime(self, attribute, value):
"""Checks the ctime has a timezone."""
if value is not None:
if value.__class__ is not datetime.datetime:
raise AttributeTypeError(value, attribute)
elif value.tzinfo is None:
raise ValueError("ctime must be a timezone-aware datetime.")
def to_dict(self):
content = super().to_dict()
if content["origin"] is None:
del content["origin"]
if content["ctime"] is None:
del content["ctime"]
return content
@classmethod
def from_data(
cls, data: bytes, reason: str, ctime: Optional[datetime.datetime] = None
) -> SkippedContent:
"""Generate a SkippedContent from a given `data` byte string.
This populates the SkippedContent with the hashes and length for the
data passed as argument.
You can use `attr.evolve` on such a generated content to nullify some
of its attributes, e.g. for tests.
"""
d = cls._hash_data(data)
del d["data"]
d["status"] = "absent"
d["reason"] = reason
d["ctime"] = ctime
return cls(**d)
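# Quick illustration of the helper above; the payload and reason are
# placeholders:
#
#     >>> skipped = SkippedContent.from_data(b"huge blob", reason="too big")
#     >>> (skipped.status, skipped.length, skipped.reason)
#     ('absent', 9, 'too big')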
@classmethod
def from_dict(cls, d):
d2 = d.copy()
if d2.pop("data", None) is not None:
raise ValueError('SkippedContent has no "data" attribute %r' % d)
return super().from_dict(d2, use_subclass=False)
def unique_key(self) -> KeyType:
return self.hashes()
def swhid(self) -> Optional[CoreSWHID]:
"""Returns a SWHID representing this object or None if unset."""
if self.sha1_git is None:
return None
return CoreSWHID(object_type=SwhidObjectType.CONTENT, object_id=self.sha1_git)
class MetadataAuthorityType(Enum):
DEPOSIT_CLIENT = "deposit_client"
FORGE = "forge"
REGISTRY = "registry"
def __repr__(self):
return f"MetadataAuthorityType.{self.name}"
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class MetadataAuthority(BaseModel):
"""Represents an entity that provides metadata about an origin or
software artifact."""
object_type: Final = ModelObjectType.METADATA_AUTHORITY
type = attr.ib(type=MetadataAuthorityType, validator=generic_type_validator)
url = attr.ib(type=str, validator=generic_type_validator)
metadata = attr.ib(
type=Optional[ImmutableDict[str, Any]],
default=None,
validator=generic_type_validator,
converter=freeze_optional_dict,
)
def to_dict(self):
d = super().to_dict()
if d["metadata"] is None:
del d["metadata"]
return d
@classmethod
def from_dict(cls, d):
d = {
**d,
"type": MetadataAuthorityType(d["type"]),
}
return super().from_dict(d)
def unique_key(self) -> KeyType:
return {"type": self.type.value, "url": self.url}
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class MetadataFetcher(BaseModel):
"""Represents a software component used to fetch metadata from a metadata
authority, and ingest them into the Software Heritage archive."""
object_type: Final = ModelObjectType.METADATA_FETCHER
name = attr.ib(type=str, validator=generic_type_validator)
version = attr.ib(type=str, validator=generic_type_validator)
metadata = attr.ib(
type=Optional[ImmutableDict[str, Any]],
default=None,
validator=generic_type_validator,
converter=freeze_optional_dict,
)
def to_dict(self):
d = super().to_dict()
if d["metadata"] is None:
del d["metadata"]
return d
def unique_key(self) -> KeyType:
return {"name": self.name, "version": self.version}
def normalize_discovery_date(value: Any) -> datetime.datetime:
if not isinstance(value, datetime.datetime):
raise TypeError("discovery_date must be a timezone-aware datetime.")
if value.tzinfo is None:
raise ValueError("discovery_date must be a timezone-aware datetime.")
# Normalize timezone to utc, and truncate microseconds to 0
return value.astimezone(datetime.timezone.utc).replace(microsecond=0)
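# Example of the normalization above: the timezone is converted to UTC and
# microseconds are truncated.
#
#     >>> tz = datetime.timezone(datetime.timedelta(hours=2))
#     >>> normalize_discovery_date(
#     ...     datetime.datetime(2021, 5, 3, 14, 0, 0, 999999, tzinfo=tz)
#     ... )
#     datetime.datetime(2021, 5, 3, 12, 0, tzinfo=datetime.timezone.utc)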
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class RawExtrinsicMetadata(BaseHashableModel):
object_type: Final = ModelObjectType.RAW_EXTRINSIC_METADATA
# target object
target = attr.ib(type=ExtendedSWHID, validator=generic_type_validator)
# source
discovery_date = attr.ib(type=datetime.datetime, converter=normalize_discovery_date)
authority = attr.ib(type=MetadataAuthority, validator=generic_type_validator)
fetcher = attr.ib(type=MetadataFetcher, validator=generic_type_validator)
# the metadata itself
format = attr.ib(type=str, validator=generic_type_validator)
metadata = attr.ib(type=bytes, validator=generic_type_validator)
# context
origin = attr.ib(type=Optional[str], default=None, validator=generic_type_validator)
visit = attr.ib(type=Optional[int], default=None)
snapshot = attr.ib(type=Optional[CoreSWHID], default=None)
release = attr.ib(type=Optional[CoreSWHID], default=None)
revision = attr.ib(type=Optional[CoreSWHID], default=None)
path = attr.ib(type=Optional[bytes], default=None)
directory = attr.ib(type=Optional[CoreSWHID], default=None)
id = attr.ib(
type=Sha1Git, validator=generic_type_validator, default=b"", repr=hash_repr
)
def _compute_hash_from_attributes(self) -> bytes:
return _compute_hash_from_manifest(
git_objects.raw_extrinsic_metadata_git_object(self)
)
@origin.validator
def check_origin(self, attribute, value):
if value is None:
return
if value.__class__ is not str:
raise AttributeTypeError(value, attribute)
obj_type = self.target.object_type
if not (
obj_type is SwhidExtendedObjectType.SNAPSHOT
or obj_type is SwhidExtendedObjectType.RELEASE
or obj_type is SwhidExtendedObjectType.REVISION
or obj_type is SwhidExtendedObjectType.DIRECTORY
or obj_type is SwhidExtendedObjectType.CONTENT
):
raise ValueError(
f"Unexpected 'origin' context for "
f"{self.target.object_type.name.lower()} object: {value}"
)
if value.startswith("swh:"):
# Technically this is valid; but:
# 1. SWHIDs are URIs, not URLs
# 2. if a SWHID gets here, it's very likely to be a mistake
# (and we can remove this check if it turns out there is a
# legitimate use for it).
raise ValueError(f"SWHID used as context origin URL: {value}")
@visit.validator
def check_visit(self, attribute, value):
if value is None:
return
if value.__class__ is not int:
raise AttributeTypeError(value, attribute)
obj_type = self.target.object_type
if not (
obj_type is SwhidExtendedObjectType.SNAPSHOT
or obj_type is SwhidExtendedObjectType.RELEASE
or obj_type is SwhidExtendedObjectType.REVISION
or obj_type is SwhidExtendedObjectType.DIRECTORY
or obj_type is SwhidExtendedObjectType.CONTENT
):
raise ValueError(
f"Unexpected 'visit' context for "
f"{self.target.object_type.name.lower()} object: {value}"
)
if self.origin is None:
raise ValueError("'origin' context must be set if 'visit' is.")
if value <= 0:
raise ValueError("Nonpositive visit id")
@snapshot.validator
def check_snapshot(self, attribute, value):
if value is None:
return
if value.__class__ is not CoreSWHID:
raise AttributeTypeError(value, attribute)
obj_type = self.target.object_type
if not (
obj_type is SwhidExtendedObjectType.RELEASE
or obj_type is SwhidExtendedObjectType.REVISION
or obj_type is SwhidExtendedObjectType.DIRECTORY
or obj_type is SwhidExtendedObjectType.CONTENT
):
raise ValueError(
f"Unexpected 'snapshot' context for "
f"{self.target.object_type.name.lower()} object: {value}"
)
if value.object_type != SwhidObjectType.SNAPSHOT:
raise ValueError(
f"Expected SWHID type 'snapshot', "
f"got '{value.object_type.name.lower()}' in {value}"
)
@release.validator
def check_release(self, attribute, value):
if value is None:
return
if value.__class__ is not CoreSWHID:
raise AttributeTypeError(value, attribute)
obj_type = self.target.object_type
if not (
obj_type is SwhidExtendedObjectType.REVISION
or obj_type is SwhidExtendedObjectType.DIRECTORY
or obj_type is SwhidExtendedObjectType.CONTENT
):
raise ValueError(
f"Unexpected 'release' context for "
f"{self.target.object_type.name.lower()} object: {value}"
)
if value.object_type != SwhidObjectType.RELEASE:
raise ValueError(
f"Expected SWHID type 'release', "
f"got '{value.object_type.name.lower()}' in {value}"
)
@revision.validator
def check_revision(self, attribute, value):
if value is None:
return
if value.__class__ is not CoreSWHID:
raise AttributeTypeError(value, attribute)
obj_type = self.target.object_type
if not (
obj_type is SwhidExtendedObjectType.DIRECTORY
or obj_type is SwhidExtendedObjectType.CONTENT
):
raise ValueError(
f"Unexpected 'revision' context for "
f"{self.target.object_type.name.lower()} object: {value}"
)
if value.object_type != SwhidObjectType.REVISION:
raise ValueError(
f"Expected SWHID type 'revision', "
f"got '{value.object_type.name.lower()}' in {value}"
)
@path.validator
def check_path(self, attribute, value):
if value is None:
return
if value.__class__ is not bytes:
raise AttributeTypeError(value, attribute)
obj_type = self.target.object_type
if not (
obj_type is SwhidExtendedObjectType.DIRECTORY
or obj_type is SwhidExtendedObjectType.CONTENT
):
raise ValueError(
f"Unexpected 'path' context for "
f"{self.target.object_type.name.lower()} object: {value}"
)
@directory.validator
def check_directory(self, attribute, value):
if value is None:
return
if value.__class__ is not CoreSWHID:
raise AttributeTypeError(value, attribute)
if self.target.object_type is not SwhidExtendedObjectType.CONTENT:
raise ValueError(
f"Unexpected 'directory' context for "
f"{self.target.object_type.name.lower()} object: {value}"
)
if value.object_type != SwhidObjectType.DIRECTORY:
raise ValueError(
f"Expected SWHID type 'directory', "
f"got '{value.object_type.name.lower()}' in {value}"
)
def to_dict(self):
d = super().to_dict()
context_keys = (
"origin",
"visit",
"snapshot",
"release",
"revision",
"directory",
"path",
)
for context_key in context_keys:
if d[context_key] is None:
del d[context_key]
return d
@classmethod
def from_dict(cls, d):
if "type" in d:
# Convert from old schema
type_ = d.pop("type")
if type_ == "origin":
d["target"] = str(Origin(d["target"]).swhid())
d = {
**d,
"target": ExtendedSWHID.from_string(d["target"]),
"authority": MetadataAuthority.from_dict(d["authority"]),
"fetcher": MetadataFetcher.from_dict(d["fetcher"]),
}
swhid_keys = ("snapshot", "release", "revision", "directory")
for swhid_key in swhid_keys:
if d.get(swhid_key):
d[swhid_key] = CoreSWHID.from_string(d[swhid_key])
return super().from_dict(d)
def swhid(self) -> ExtendedSWHID:
"""Returns a SWHID representing this RawExtrinsicMetadata object."""
return ExtendedSWHID(
object_type=SwhidExtendedObjectType.RAW_EXTRINSIC_METADATA,
object_id=self.id,
)
@attr.s(frozen=True, slots=True, field_transformer=optimize_all_validators)
class ExtID(BaseHashableModel):
object_type: Final = ModelObjectType.EXTID
extid_type = attr.ib(type=str, validator=generic_type_validator)
extid = attr.ib(type=bytes, validator=generic_type_validator)
target = attr.ib(type=CoreSWHID, validator=generic_type_validator)
extid_version = attr.ib(type=int, validator=generic_type_validator, default=0)
payload_type = attr.ib(
type=Optional[str], validator=generic_type_validator, default=None
)
payload = attr.ib(
type=Optional[Sha1Git],
validator=generic_type_validator,
default=None,
repr=hash_repr,
)
id = attr.ib(
type=Sha1Git, validator=generic_type_validator, default=b"", repr=hash_repr
)
@payload_type.validator
def check_payload_type(self, attribute, value):
if value is not None and self.payload is None:
raise ValueError("'payload' must be set if 'payload_type' is.")
@payload.validator
def check_payload(self, attribute, value):
if value is not None and self.payload_type is None:
raise ValueError("'payload_type' must be set if 'payload' is.")
@classmethod
def from_dict(cls, d):
return cls(
extid=d["extid"],
extid_type=d["extid_type"],
target=CoreSWHID.from_string(d["target"]),
extid_version=d.get("extid_version", 0),
payload_type=d.get("payload_type"),
payload=d.get("payload"),
)
def _compute_hash_from_attributes(self) -> bytes:
return _compute_hash_from_manifest(git_objects.extid_git_object(self))
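# Sketch of the pairing constraint enforced by the validators above; the
# extid_type and payload_type values are illustrative:
#
#     >>> ExtID(
#     ...     extid_type="checksum-sha512",
#     ...     extid=b"\x00" * 20,
#     ...     target=CoreSWHID.from_string("swh:1:dir:" + "00" * 20),
#     ...     payload_type="disarchive",
#     ... )
#     Traceback (most recent call last):
#       ...
#     ValueError: 'payload' must be set if 'payload_type' is.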
# Note: we need the type ignore stanza here because mypy cannot figure out that
# all subclasses of BaseModel do have an object_type attribute, even if
# BaseModel itself does not (because these are Final)
SWH_MODEL_OBJECT_TYPES: Dict[str, Type[BaseModel]] = {
cls.object_type: cls # type: ignore
for cls in (
Person,
Timestamp,
TimestampWithTimezone,
Origin,
OriginVisit,
OriginVisitStatus,
Snapshot,
SnapshotBranch,
Release,
Revision,
Directory,
DirectoryEntry,
Content,
SkippedContent,
MetadataAuthority,
MetadataFetcher,
RawExtrinsicMetadata,
ExtID,
)
}
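# Illustrative lookup against the mapping above:
#
#     >>> SWH_MODEL_OBJECT_TYPES[Origin.object_type] is Origin
#     True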
# Copyright (C) 2015-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""
Classes to represent :ref:`SWH persistent IDentifiers <persistent-identifiers>`.
:class:`CoreSWHID` represents a SWHID with no qualifier, and :class:`QualifiedSWHID`
represents a SWHID that may have qualifiers.
:class:`ExtendedSWHID` extends the definition of SWHID to other object types,
and is used internally in Software Heritage; it does not support qualifiers.
"""
from __future__ import annotations
import enum
import re
from typing import Any, Dict, Generic, Optional, Tuple, Type, TypeVar, Union
import urllib.parse
import attr
from attrs_strict import type_validator
from .exceptions import ValidationError
from .hashutil import hash_to_bytes, hash_to_hex
class ObjectType(enum.Enum):
"""Possible object types of a QualifiedSWHID or CoreSWHID.
The value of each variant is what is used in the SWHID's string representation."""
SNAPSHOT = "snp"
REVISION = "rev"
RELEASE = "rel"
DIRECTORY = "dir"
CONTENT = "cnt"
class ExtendedObjectType(enum.Enum):
"""Possible object types of an ExtendedSWHID.
The variants are a superset of :class:`ObjectType`'s"""
SNAPSHOT = "snp"
REVISION = "rev"
RELEASE = "rel"
DIRECTORY = "dir"
CONTENT = "cnt"
ORIGIN = "ori"
RAW_EXTRINSIC_METADATA = "emd"
SWHID_NAMESPACE = "swh"
SWHID_VERSION = 1
SWHID_TYPES = ["snp", "rel", "rev", "dir", "cnt"]
EXTENDED_SWHID_TYPES = SWHID_TYPES + ["ori", "emd"]
SWHID_SEP = ":"
SWHID_CTXT_SEP = ";"
SWHID_QUALIFIERS = {"origin", "anchor", "visit", "path", "lines"}
SWHID_RE_RAW = (
f"(?P<namespace>{SWHID_NAMESPACE})"
f"{SWHID_SEP}(?P<scheme_version>{SWHID_VERSION})"
f"{SWHID_SEP}(?P<object_type>{'|'.join(EXTENDED_SWHID_TYPES)})"
f"{SWHID_SEP}(?P<object_id>[0-9a-f]{{40}})"
f"({SWHID_CTXT_SEP}(?P<qualifiers>\\S+))?"
)
SWHID_RE = re.compile(SWHID_RE_RAW)
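# Quick sanity examples for the pattern above: only scheme version 1 and the
# known object types are accepted.
#
#     >>> bool(SWHID_RE.fullmatch("swh:1:cnt:" + "0" * 40))
#     True
#     >>> bool(SWHID_RE.fullmatch("swh:2:cnt:" + "0" * 40))
#     False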
# type of the "object_type" attribute of the SWHID class; either
# ObjectType or ExtendedObjectType
_TObjectType = TypeVar("_TObjectType", ObjectType, ExtendedObjectType)
# the SWHID class itself (this is used so that X.from_string() can return X
# for all X subclass of _BaseSWHID)
_TSWHID = TypeVar("_TSWHID", bound="_BaseSWHID")
@attr.s(frozen=True, kw_only=True, repr=False)
class _BaseSWHID(Generic[_TObjectType]):
"""Common base class for CoreSWHID, QualifiedSWHID, and ExtendedSWHID.
This is an "abstract" class and should not be instantiated directly;
it only exists to deduplicate code between these three SWHID classes."""
namespace = attr.ib(type=str, default=SWHID_NAMESPACE)
"""the namespace of the identifier, defaults to ``swh``"""
scheme_version = attr.ib(type=int, default=SWHID_VERSION)
"""the scheme version of the identifier, defaults to 1"""
# overridden by subclasses
object_type: _TObjectType
"""the type of object the identifier points to"""
object_id = attr.ib(type=bytes, validator=type_validator())
"""object's identifier"""
@namespace.validator
def check_namespace(self, attribute, value):
if value != SWHID_NAMESPACE:
raise ValidationError(
"Invalid SWHID: invalid namespace: %(namespace)s",
params={"namespace": value},
)
@scheme_version.validator
def check_scheme_version(self, attribute, value):
if value != SWHID_VERSION:
raise ValidationError(
"Invalid SWHID: invalid version: %(version)s", params={"version": value}
)
@object_id.validator
def check_object_id(self, attribute, value):
if len(value) != 20:
raise ValidationError(
"Invalid SWHID: invalid checksum: %(object_id)s",
params={"object_id": hash_to_hex(value)},
)
def __str__(self) -> str:
return self._format_core_swhid()
def _format_core_swhid(self) -> str:
return SWHID_SEP.join(
[
self.namespace,
str(self.scheme_version),
self.object_type.value,
hash_to_hex(self.object_id),
]
)
def __repr__(self) -> str:
return f"{self.__class__.__name__}.from_string('{self}')"
@classmethod
def from_string(cls: Type[_TSWHID], s: str) -> _TSWHID:
parts = _parse_swhid(s)
if parts.pop("qualifiers"):
raise ValidationError(f"{cls.__name__} does not support qualifiers.")
try:
return cls(**parts)
except ValueError as e:
raise ValidationError(
"ValueError: %(args)s", params={"args": e.args}
) from None
@attr.s(frozen=True, kw_only=True, repr=False)
class CoreSWHID(_BaseSWHID[ObjectType]):
"""
Dataclass holding the relevant info associated with a SoftWare Heritage
persistent IDentifier (SWHID).
Unlike `QualifiedSWHID`, it is restricted to core SWHIDs, i.e. SWHIDs
with no qualifiers.
Raises:
swh.model.exceptions.ValidationError: In case of invalid object type or id
To get the raw SWHID string from an instance of this class,
use the :func:`str` function:
>>> swhid = CoreSWHID(
... object_type=ObjectType.CONTENT,
... object_id=bytes.fromhex('8ff44f081d43176474b267de5451f2c2e88089d0'),
... )
>>> str(swhid)
'swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0'
And vice-versa with :meth:`CoreSWHID.from_string`:
>>> swhid == CoreSWHID.from_string(
... "swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0"
... )
True
"""
object_type = attr.ib(
type=ObjectType, validator=type_validator(), converter=ObjectType
)
"""the type of object the identifier points to"""
def to_extended(self) -> ExtendedSWHID:
"""Converts this CoreSWHID into an ExtendedSWHID.
As ExtendedSWHID is a superset of CoreSWHID, this is lossless."""
return ExtendedSWHID(
namespace=self.namespace,
scheme_version=self.scheme_version,
object_type=ExtendedObjectType(self.object_type.value),
object_id=self.object_id,
)
def to_qualified(self) -> QualifiedSWHID:
"""Converts this CoreSWHID into a QualifiedSWHID.
As QualifiedSWHID is a superset of CoreSWHID, this is lossless."""
return QualifiedSWHID(
namespace=self.namespace,
scheme_version=self.scheme_version,
object_type=self.object_type,
object_id=self.object_id,
)
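# Round-trip sketch for the two conversions above (placeholder hash):
#
#     >>> CoreSWHID.from_string("swh:1:dir:" + "11" * 20).to_extended()
#     ExtendedSWHID.from_string('swh:1:dir:1111111111111111111111111111111111111111')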
def _parse_core_swhid(swhid: Union[str, CoreSWHID, None]) -> Optional[CoreSWHID]:
if swhid is None or isinstance(swhid, CoreSWHID):
return swhid
else:
return CoreSWHID.from_string(swhid)
def _parse_lines_qualifier(
lines: Union[str, Tuple[int, Optional[int]], None],
) -> Optional[Tuple[int, Optional[int]]]:
try:
if lines is None or isinstance(lines, tuple):
return lines
elif "-" in lines:
(from_, to) = lines.split("-", 2)
return (int(from_), int(to))
else:
return (int(lines), None)
except ValueError:
raise ValidationError(
"Invalid format for the lines qualifier: %(lines)s", params={"lines": lines}
)
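# Behavior sketch for the converter above:
#
#     >>> _parse_lines_qualifier("5-10")
#     (5, 10)
#     >>> _parse_lines_qualifier("7")
#     (7, None)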
def _parse_path_qualifier(path: Union[str, bytes, None]) -> Optional[bytes]:
if path is None or isinstance(path, bytes):
return path
else:
return urllib.parse.unquote_to_bytes(path)
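# The path qualifier is percent-decoded to bytes:
#
#     >>> _parse_path_qualifier("foo/bar%20baz")
#     b'foo/bar baz'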
@attr.s(frozen=True, kw_only=True, repr=False)
class QualifiedSWHID(_BaseSWHID[ObjectType]):
"""
Dataclass holding the relevant info associated with a SoftWare Heritage
persistent IDentifier (SWHID).
Raises:
swh.model.exceptions.ValidationError: In case of invalid object type or id
To get the raw SWHID string from an instance of this class,
use the :func:`str` function:
>>> swhid = QualifiedSWHID(
... object_type=ObjectType.CONTENT,
... object_id=bytes.fromhex('8ff44f081d43176474b267de5451f2c2e88089d0'),
... lines=(5, 10),
... )
>>> str(swhid)
'swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0;lines=5-10'
And vice-versa with :meth:`QualifiedSWHID.from_string`:
>>> swhid == QualifiedSWHID.from_string(
... "swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0;lines=5-10"
... )
True
"""
object_type = attr.ib(
type=ObjectType, validator=type_validator(), converter=ObjectType
)
"""the type of object the identifier points to"""
# qualifiers:
origin = attr.ib(type=Optional[str], default=None, validator=type_validator())
"""the software origin where an object has been found or observed in the wild,
as an URI"""
visit = attr.ib(type=Optional[CoreSWHID], default=None, converter=_parse_core_swhid)
"""the core identifier of a snapshot corresponding to a specific visit
of a repository containing the designated object"""
anchor = attr.ib(
type=Optional[CoreSWHID],
default=None,
validator=type_validator(),
converter=_parse_core_swhid,
)
"""a designated node in the Merkle DAG relative to which a path to the object
is specified, as the core identifier of a directory, a revision, a release,
or a snapshot"""
path = attr.ib(
type=Optional[bytes],
default=None,
validator=type_validator(),
converter=_parse_path_qualifier,
)
"""the absolute file path, from the root directory associated to the anchor node,
to the object; when the anchor denotes a directory or a revision, and almost always
when it’s a release, the root directory is uniquely determined;
when the anchor denotes a snapshot, the root directory is the one pointed to by HEAD
(possibly indirectly), and undefined if such a reference is missing"""
Lines = Tuple[int, Optional[int]]
lines = attr.ib(
type=Optional[Lines],
default=None,
validator=type_validator(),
converter=_parse_lines_qualifier,
)
"""lines: line number(s) of interest, usually within a content object"""
@visit.validator
def check_visit(self, attribute, value):
if value and value.object_type != ObjectType.SNAPSHOT:
raise ValidationError(
"The 'visit' qualifier must be a 'snp' SWHID, not '%(type)s'",
params={"type": value.object_type.value},
)
@anchor.validator
def check_anchor(self, attribute, value):
if value and value.object_type not in (
ObjectType.DIRECTORY,
ObjectType.REVISION,
ObjectType.RELEASE,
ObjectType.SNAPSHOT,
):
raise ValidationError(
"The 'visit' qualifier must be a 'dir', 'rev', 'rel', or 'snp' SWHID, "
"not '%s(type)s'",
params={"type": value.object_type.value},
)
def to_dict(self) -> Dict[str, Optional[str | bytes | CoreSWHID | Lines]]:
"""Returns a dictionary version of this QSWHID for json serialization"""
return {
"swhid": self._format_core_swhid(),
"origin": self.origin,
"visit": self.visit,
"anchor": self.anchor,
"path": self.path,
"lines": self.lines,
}
def qualifiers(self) -> Dict[str, str]:
"""Returns URL-escaped qualifiers of this SWHID, for use in serialization"""
origin = self.origin
if origin:
unescaped_origin = origin
origin = origin.replace("%", "%25")
origin = origin.replace(";", "%3B")
assert (
urllib.parse.unquote(origin) == unescaped_origin
), "Escaping ';' in the origin qualifier corrupted the origin URL."
d: Dict[str, Optional[str]] = {
"origin": origin,
"visit": str(self.visit) if self.visit else None,
"anchor": str(self.anchor) if self.anchor else None,
"path": (
urllib.parse.quote_from_bytes(self.path)
if self.path is not None
else None
),
"lines": (
"-".join(str(line) for line in self.lines if line is not None)
if self.lines
else None
),
}
return {k: v for (k, v) in d.items() if v is not None}
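# Escaping sketch for the method above (placeholder object id): ';' in the
# origin URL is percent-encoded so it cannot be confused with the qualifier
# separator.
#
#     >>> QualifiedSWHID(
#     ...     object_type=ObjectType.CONTENT,
#     ...     object_id=b"\x00" * 20,
#     ...     origin="https://example.org/a;b",
#     ... ).qualifiers()
#     {'origin': 'https://example.org/a%3Bb'}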
def __str__(self) -> str:
swhid = self._format_core_swhid()
qualifiers = self.qualifiers()
if qualifiers:
for k, v in qualifiers.items():
swhid += "%s%s=%s" % (SWHID_CTXT_SEP, k, v)
return swhid
def __repr__(self) -> str:
return super().__repr__()
@classmethod
def from_string(cls, s: str) -> QualifiedSWHID:
parts = _parse_swhid(s)
qualifiers = parts.pop("qualifiers")
invalid_qualifiers = set(qualifiers) - SWHID_QUALIFIERS
if invalid_qualifiers:
raise ValidationError(
"Invalid qualifier(s): %(qualifiers)s",
params={"qualifiers": ", ".join(invalid_qualifiers)},
)
if "origin" in qualifiers:
qualifiers["origin"] = urllib.parse.unquote(qualifiers["origin"])
try:
return QualifiedSWHID(**parts, **qualifiers)
except ValueError as e:
raise ValidationError(
"ValueError: %(args)s", params={"args": e.args}
) from None
@attr.s(frozen=True, kw_only=True, repr=False)
class ExtendedSWHID(_BaseSWHID[ExtendedObjectType]):
"""
Dataclass holding the relevant info associated with a SoftWare Heritage
persistent IDentifier (SWHID).
It extends `CoreSWHID` by allowing non-standard object types, and should
only be used internally by Software Heritage.
Raises:
swh.model.exceptions.ValidationError: In case of invalid object type or id
To get the raw SWHID string from an instance of this class,
use the :func:`str` function:
>>> swhid = ExtendedSWHID(
... object_type=ExtendedObjectType.CONTENT,
... object_id=bytes.fromhex('8ff44f081d43176474b267de5451f2c2e88089d0'),
... )
>>> str(swhid)
'swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0'
And vice-versa with :meth:`ExtendedSWHID.from_string`:
>>> swhid == ExtendedSWHID.from_string(
... "swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0"
... )
True
"""
object_type = attr.ib(
type=ExtendedObjectType,
validator=type_validator(),
converter=ExtendedObjectType,
)
"""the type of object the identifier points to"""
def _parse_swhid(swhid: str) -> Dict[str, Any]:
"""Parse a Software Heritage identifier (SWHID) from string (see:
:ref:`persistent-identifiers`.)
This is for internal use; use :meth:`CoreSWHID.from_string`,
:meth:`QualifiedSWHID.from_string`, or :meth:`ExtendedSWHID.from_string` instead,
as they perform validation and build a dataclass.
Args:
swhid (str): A persistent identifier
Raises:
swh.model.exceptions.ValidationError: if passed string is not a valid SWHID
"""
m = SWHID_RE.fullmatch(swhid)
if not m:
raise ValidationError(
"Invalid SWHID: invalid syntax: %(swhid)s", params={"swhid": swhid}
)
parts: Dict[str, Any] = m.groupdict()
qualifiers_raw = parts["qualifiers"]
parts["qualifiers"] = {}
if qualifiers_raw:
for qualifier in qualifiers_raw.split(SWHID_CTXT_SEP):
try:
k, v = qualifier.split("=", maxsplit=1)
parts["qualifiers"][k] = v
except ValueError:
raise ValidationError(
"Invalid SWHID: invalid qualifier: %(qualifier)s",
params={"qualifier": qualifier},
)
parts["scheme_version"] = int(parts["scheme_version"])
parts["object_id"] = hash_to_bytes(parts["object_id"])
return parts
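# End-to-end sketch going through the public entry point, as recommended
# above (placeholder hash and origin):
#
#     >>> swhid = QualifiedSWHID.from_string(
#     ...     "swh:1:cnt:" + "94" * 20 + ";origin=https://example.org;lines=9"
#     ... )
#     >>> swhid.origin
#     'https://example.org'
#     >>> swhid.lines
#     (9, None)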