Compare revisions

Changes are shown as if the source revision was being merged into the target revision.
Showing 5395 additions and 1243 deletions.
@@ -33,11 +33,12 @@
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
NON_FIELD_ERRORS = '__all__'
NON_FIELD_ERRORS = "__all__"
class ValidationError(Exception):
"""An error while validating data."""
def __init__(self, message, code=None, params=None):
"""
The `message` argument can be a single error, a list of errors, or a
@@ -54,16 +55,15 @@ class ValidationError(Exception):
message = message[0]
if isinstance(message, ValidationError):
if hasattr(message, 'error_dict'):
if hasattr(message, "error_dict"):
message = message.error_dict
# PY2 has a `message` property which is always there so we can't
# duck-type on it. It was introduced in Python 2.5 and already
# deprecated in Python 2.6.
elif not hasattr(message, 'message'):
elif not hasattr(message, "message"):
message = message.error_list
else:
message, code, params = (message.message, message.code,
message.params)
message, code, params = (message.message, message.code, message.params)
if isinstance(message, dict):
self.error_dict = {}
@@ -78,9 +78,8 @@ class ValidationError(Exception):
# Normalize plain strings to instances of ValidationError.
if not isinstance(message, ValidationError):
message = ValidationError(message)
if hasattr(message, 'error_dict'):
self.error_list.extend(sum(message.error_dict.values(),
[]))
if hasattr(message, "error_dict"):
self.error_list.extend(sum(message.error_dict.values(), []))
else:
self.error_list.extend(message.error_list)
@@ -94,18 +93,18 @@ class ValidationError(Exception):
def message_dict(self):
# Trigger an AttributeError if this ValidationError
# doesn't have an error_dict.
getattr(self, 'error_dict')
getattr(self, "error_dict")
return dict(self)
@property
def messages(self):
if hasattr(self, 'error_dict'):
if hasattr(self, "error_dict"):
return sum(dict(self).values(), [])
return list(self)
def update_error_dict(self, error_dict):
if hasattr(self, 'error_dict'):
if hasattr(self, "error_dict"):
for field, error_list in self.error_dict.items():
error_dict.setdefault(field, []).extend(error_list)
else:
@@ -113,7 +112,7 @@ class ValidationError(Exception):
return error_dict
def __iter__(self):
if hasattr(self, 'error_dict'):
if hasattr(self, "error_dict"):
for field, errors in self.error_dict.items():
yield field, list(ValidationError(errors))
else:
@@ -124,9 +123,13 @@
yield message
def __str__(self):
if hasattr(self, 'error_dict'):
if hasattr(self, "error_dict"):
return repr(dict(self))
return repr(list(self))
def __repr__(self):
return 'ValidationError(%s)' % self
return "ValidationError(%s)" % self
class InvalidDirectoryPath(Exception):
pass
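This exception type mirrors Django's ValidationError: it carries a message template, a params dict for interpolation, and a machine-readable code. A minimal usage sketch, assuming the Django-like interpolation semantics shown above (the field name and value are made up):

.. code-block:: python

    err = ValidationError(
        "Unexpected value %(value)s for field %(field)s",
        params={"value": 42, "field": "age"},
        code="unexpected-value",
    )
    print(err.messages)  # ['Unexpected value 42 for field age']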
@@ -6,8 +6,13 @@
# We do our imports here but we don't use them, so flake8 complains
# flake8: noqa
from .simple import (validate_type, validate_int, validate_str, validate_bytes,
validate_datetime, validate_enum)
from .hashes import (validate_sha1, validate_sha1_git, validate_sha256)
from .compound import (validate_against_schema, validate_all_keys,
validate_any_key)
from .compound import validate_against_schema, validate_all_keys, validate_any_key
from .hashes import validate_sha1, validate_sha1_git, validate_sha256
from .simple import (
validate_bytes,
validate_datetime,
validate_enum,
validate_int,
validate_str,
validate_type,
)
@@ -6,7 +6,7 @@
from collections import defaultdict
import itertools
from ..exceptions import ValidationError, NON_FIELD_ERRORS
from ..exceptions import NON_FIELD_ERRORS, ValidationError
def validate_against_schema(model, schema, value):
@@ -26,19 +26,19 @@ def validate_against_schema(model, schema, value):
if not isinstance(value, dict):
raise ValidationError(
'Unexpected type %(type)s for %(model)s, expected dict',
"Unexpected type %(type)s for %(model)s, expected dict",
params={
'model': model,
'type': value.__class__.__name__,
"model": model,
"type": value.__class__.__name__,
},
code='model-unexpected-type',
code="model-unexpected-type",
)
errors = defaultdict(list)
for key, (mandatory, validators) in itertools.chain(
((k, v) for k, v in schema.items() if k != NON_FIELD_ERRORS),
[(NON_FIELD_ERRORS, (False, schema.get(NON_FIELD_ERRORS, [])))]
[(NON_FIELD_ERRORS, (False, schema.get(NON_FIELD_ERRORS, [])))],
):
if not validators:
continue
@@ -54,9 +54,9 @@ def validate_against_schema(model, schema, value):
if mandatory:
errors[key].append(
ValidationError(
'Field %(field)s is mandatory',
params={'field': key},
code='model-field-mandatory',
"Field %(field)s is mandatory",
params={"field": key},
code="model-field-mandatory",
)
)
@@ -74,19 +74,21 @@ def validate_against_schema(model, schema, value):
else:
if not valid:
errdata = {
'validator': validator.__name__,
"validator": validator.__name__,
}
if key == NON_FIELD_ERRORS:
errmsg = 'Validation of model %(model)s failed in ' \
'%(validator)s'
errdata['model'] = model
errcode = 'model-validation-failed'
errmsg = (
"Validation of model %(model)s failed in " "%(validator)s"
)
errdata["model"] = model
errcode = "model-validation-failed"
else:
errmsg = 'Validation of field %(field)s failed in ' \
'%(validator)s'
errdata['field'] = key
errcode = 'field-validation-failed'
errmsg = (
"Validation of field %(field)s failed in " "%(validator)s"
)
errdata["field"] = key
errcode = "field-validation-failed"
errors[key].append(
ValidationError(errmsg, params=errdata, code=errcode)
@@ -102,11 +104,11 @@ def validate_all_keys(value, keys):
"""Validate that all the given keys are present in value"""
missing_keys = set(keys) - set(value)
if missing_keys:
missing_fields = ', '.join(sorted(missing_keys))
missing_fields = ", ".join(sorted(missing_keys))
raise ValidationError(
'Missing mandatory fields %(missing_fields)s',
params={'missing_fields': missing_fields},
code='missing-mandatory-field'
"Missing mandatory fields %(missing_fields)s",
params={"missing_fields": missing_fields},
code="missing-mandatory-field",
)
return True
@@ -116,11 +118,11 @@ def validate_any_key(value, keys):
"""Validate that any of the given keys is present in value"""
present_keys = set(keys) & set(value)
if not present_keys:
missing_fields = ', '.join(sorted(keys))
missing_fields = ", ".join(sorted(keys))
raise ValidationError(
'Must contain one of the alternative fields %(missing_fields)s',
params={'missing_fields': missing_fields},
code='missing-alternative-field',
"Must contain one of the alternative fields %(missing_fields)s",
params={"missing_fields": missing_fields},
code="missing-alternative-field",
)
return True
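For orientation: validate_against_schema drives the helpers above. The schema maps each field name to a (mandatory, validators) pair, failures are accumulated per field, and a single ValidationError is raised at the end. A hedged sketch of how it might be called (the model name, fields, and values are illustrative only):

.. code-block:: python

    content_schema = {
        "status": (True, [validate_str]),   # mandatory string field
        "length": (False, [validate_int]),  # optional integer field
    }
    # Succeeds: both fields validate.
    validate_against_schema(
        "content", content_schema, {"status": "visible", "length": 42}
    )
    # Raises ValidationError keyed by field: "status" is mandatory but absent.
    validate_against_schema("content", content_schema, {"length": 42})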
@@ -4,6 +4,7 @@
# See top-level LICENSE file for more information
import string
from ..exceptions import ValidationError
@@ -22,22 +23,22 @@ def validate_hash(value, hash_type):
"""
hash_lengths = {
'sha1': 20,
'sha1_git': 20,
'sha256': 32,
"sha1": 20,
"sha1_git": 20,
"sha256": 32,
}
hex_digits = set(string.hexdigits)
if hash_type not in hash_lengths:
raise ValidationError(
'Unexpected hash type %(hash_type)s, expected one of'
' %(hash_types)s',
"Unexpected hash type %(hash_type)s, expected one of" " %(hash_types)s",
params={
'hash_type': hash_type,
'hash_types': ', '.join(sorted(hash_lengths)),
"hash_type": hash_type,
"hash_types": ", ".join(sorted(hash_lengths)),
},
code='unexpected-hash-type')
code="unexpected-hash-type",
)
if isinstance(value, str):
errors = []
@@ -48,10 +49,10 @@ def validate_hash(value, hash_type):
"Unexpected characters `%(unexpected_chars)s' for hash "
"type %(hash_type)s",
params={
'unexpected_chars': ', '.join(sorted(extra_chars)),
'hash_type': hash_type,
"unexpected_chars": ", ".join(sorted(extra_chars)),
"hash_type": hash_type,
},
code='unexpected-hash-contents',
code="unexpected-hash-contents",
)
)
@@ -60,14 +61,14 @@ def validate_hash(value, hash_type):
if length != expected_length:
errors.append(
ValidationError(
'Unexpected length %(length)d for hash type '
'%(hash_type)s, expected %(expected_length)d',
"Unexpected length %(length)d for hash type "
"%(hash_type)s, expected %(expected_length)d",
params={
'length': length,
'expected_length': expected_length,
'hash_type': hash_type,
"length": length,
"expected_length": expected_length,
"hash_type": hash_type,
},
code='unexpected-hash-length',
code="unexpected-hash-length",
)
)
@@ -81,37 +82,37 @@ def validate_hash(value, hash_type):
expected_length = hash_lengths[hash_type]
if length != expected_length:
raise ValidationError(
'Unexpected length %(length)d for hash type '
'%(hash_type)s, expected %(expected_length)d',
"Unexpected length %(length)d for hash type "
"%(hash_type)s, expected %(expected_length)d",
params={
'length': length,
'expected_length': expected_length,
'hash_type': hash_type,
"length": length,
"expected_length": expected_length,
"hash_type": hash_type,
},
code='unexpected-hash-length',
code="unexpected-hash-length",
)
return True
raise ValidationError(
'Unexpected type %(type)s for hash, expected str or bytes',
"Unexpected type %(type)s for hash, expected str or bytes",
params={
'type': value.__class__.__name__,
"type": value.__class__.__name__,
},
code='unexpected-hash-value-type',
code="unexpected-hash-value-type",
)
def validate_sha1(sha1):
"""Validate that sha1 is a valid sha1 hash"""
return validate_hash(sha1, 'sha1')
return validate_hash(sha1, "sha1")
def validate_sha1_git(sha1_git):
"""Validate that sha1_git is a valid sha1_git hash"""
return validate_hash(sha1_git, 'sha1_git')
return validate_hash(sha1_git, "sha1_git")
def validate_sha256(sha256):
"""Validate that sha256 is a valid sha256 hash"""
return validate_hash(sha256, 'sha256')
return validate_hash(sha256, "sha256")
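These helpers accept a hash either as a hex string or as raw bytes, and raise ValidationError otherwise. For instance (the value below is the well-known sha1 of the empty git blob):

.. code-block:: python

    empty_blob = "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
    validate_sha1(empty_blob)                 # True (40 hex digits)
    validate_sha1(bytes.fromhex(empty_blob))  # True (20 raw bytes)
    validate_sha1("not-a-hash")               # raises ValidationError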
@@ -13,16 +13,16 @@ def validate_type(value, type):
"""Validate that value is an integer"""
if not isinstance(value, type):
if isinstance(type, tuple):
typestr = 'one of %s' % ', '.join(typ.__name__ for typ in type)
typestr = "one of %s" % ", ".join(typ.__name__ for typ in type)
else:
typestr = type.__name__
raise ValidationError(
'Unexpected type %(type)s, expected %(expected_type)s',
"Unexpected type %(type)s, expected %(expected_type)s",
params={
'type': value.__class__.__name__,
'expected_type': typestr,
"type": value.__class__.__name__,
"expected_type": typestr,
},
code='unexpected-type'
code="unexpected-type",
)
return True
@@ -54,10 +54,12 @@ def validate_datetime(value):
errors.append(e)
if isinstance(value, datetime.datetime) and value.tzinfo is None:
errors.append(ValidationError(
'Datetimes must be timezone-aware in swh',
code='datetime-without-tzinfo',
))
errors.append(
ValidationError(
"Datetimes must be timezone-aware in swh",
code="datetime-without-tzinfo",
)
)
if errors:
raise ValidationError(errors)
@@ -69,12 +71,12 @@ def validate_enum(value, expected_values):
"""Validate that value is contained in expected_values"""
if value not in expected_values:
raise ValidationError(
'Unexpected value %(value)s, expected one of %(expected_values)s',
"Unexpected value %(value)s, expected one of %(expected_values)s",
params={
'value': value,
'expected_values': ', '.join(sorted(expected_values)),
"value": value,
"expected_values": ", ".join(sorted(expected_values)),
},
code='unexpected-value',
code="unexpected-value",
)
return True
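A quick illustration of the two validators above (the values are arbitrary):

.. code-block:: python

    import datetime

    validate_datetime(datetime.datetime.now(datetime.timezone.utc))  # ok
    # validate_datetime(datetime.datetime.now()) would raise:
    # naive datetimes are rejected.
    validate_enum("git", {"git", "hg", "svn"})  # True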
# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
from enum import Enum
from swh.model import hashutil, identifiers
ROOT_TREE_KEY = b''
class GitType(Enum):
BLOB = b'blob'
TREE = b'tree'
EXEC = b'exec'
LINK = b'link'
COMM = b'commit'
RELE = b'release'
REFS = b'ref'
class GitPerm(Enum):
BLOB = b'100644'
TREE = b'40000'
EXEC = b'100755'
LINK = b'120000'
def compute_directory_git_sha1(dirpath, hashes):
"""Compute a directory git sha1 for a dirpath.
Args:
dirpath: the directory's absolute path
hashes: list of tree entries with keys:
- sha1_git: the tree entry's sha1
- name: file or subdir's name
- perms: the tree entry's sha1 permissions
Returns:
the binary sha1 of the directory's identifier
Assumes:
Every path exists in hashes.
"""
directory = {
'entries':
[
{
'name': entry['name'],
'perms': int(entry['perms'].value, 8),
'target': entry['sha1_git'],
'type': 'dir' if entry['perms'] == GitPerm.TREE else 'file',
}
for entry in hashes[dirpath]
]
}
return hashutil.hash_to_bytes(identifiers.directory_identifier(directory))
def compute_revision_sha1_git(revision):
"""Compute a revision sha1 git from its dict representation.
Args:
revision: Additional dictionary information needed to compute a
synthetic revision. Following keys are expected:
- author
- date
- committer
- committer_date
- message
- type
- directory: binary form of the tree hash
Returns:
revision sha1 in bytes
# FIXME: beware, bytes output from storage api
"""
return hashutil.hash_to_bytes(identifiers.revision_identifier(revision))
def compute_release_sha1_git(release):
"""Compute a release sha1 git from its dict representation.
Args:
release: Additional dictionary information needed to compute a
synthetic release. Following keys are expected:
- name
- message
- date
- author
- revision: binary form of the sha1_git revision targeted by this release
Returns:
release sha1 in bytes
"""
return hashutil.hash_to_bytes(identifiers.release_identifier(release))
def compute_link_metadata(linkpath):
"""Given a linkpath, compute the git metadata.
Args:
linkpath: absolute pathname of the link
Returns:
Dictionary of values:
- data: link's content
- length: link's content length
- name: basename of the link
- perms: git permission for link
- type: git type for link
- path: absolute path to the link on filesystem
"""
data = os.readlink(linkpath)
link_metadata = hashutil.hash_data(data)
link_metadata.update({
'data': data,
'length': len(data),
'name': os.path.basename(linkpath),
'perms': GitPerm.LINK,
'type': GitType.BLOB,
'path': linkpath
})
return link_metadata
def compute_blob_metadata(filepath):
"""Given a filepath, compute the git metadata.
Args:
filepath: absolute pathname of the file.
Returns:
Dictionary of values:
- name: basename of the file
- perms: git permission for file
- type: git type for file
- path: absolute filepath on filesystem
"""
blob_metadata = hashutil.hash_path(filepath)
perms = GitPerm.EXEC if os.access(filepath, os.X_OK) else GitPerm.BLOB
blob_metadata.update({
'name': os.path.basename(filepath),
'perms': perms,
'type': GitType.BLOB,
'path': filepath
})
return blob_metadata
def compute_tree_metadata(dirname, ls_hashes):
"""Given a dirname, compute the git metadata.
Args:
dirname: absolute pathname of the directory.
Returns:
Dictionary of values:
- sha1_git: tree's sha1 git
- name: basename of the directory
- perms: git permission for directory
- type: git type for directory
- path: absolute path to directory on filesystem
"""
return {
'sha1_git': compute_directory_git_sha1(dirname, ls_hashes),
'name': os.path.basename(dirname),
'perms': GitPerm.TREE,
'type': GitType.TREE,
'path': dirname
}
def walk_and_compute_sha1_from_directory(rootdir,
dir_ok_fn=lambda dirpath: True):
"""Compute git sha1 from directory rootdir.
Args:
- rootdir: Root directory from which to begin the git hash computation
- dir_ok_fn: Filter function to filter directory according to rules
defined in the function. By default, all folders are ok.
Example override: dir_ok_fn = lambda dirpath: b'svn' not in dirpath
Returns:
Dictionary mapping each <path-name> key to a list of
directory entries.
Each directory entry is a dictionary with keys:
- 'perms'
- 'type'
- 'name'
- 'sha1_git'
- and specifically content: 'sha1', 'sha256', ...
Note:
One special key is ROOT_TREE_KEY to indicate the upper root of the
directory (this is the revision's directory).
Raises:
Nothing
If something is raised, this is a programmatic error.
"""
ls_hashes = {}
all_links = set()
def filtfn(dirpath, dirnames):
return list(filter(lambda dirname: dir_ok_fn(os.path.join(dirpath,
dirname)),
dirnames))
gen_dir = ((dp, filtfn(dp, dns), fns) for (dp, dns, fns)
in os.walk(rootdir, topdown=False)
if dir_ok_fn(dp))
for dirpath, dirnames, filenames in gen_dir:
hashes = []
links = (os.path.join(dirpath, file)
for file in (filenames+dirnames)
if os.path.islink(os.path.join(dirpath, file)))
for linkpath in links:
all_links.add(linkpath)
m_hashes = compute_link_metadata(linkpath)
hashes.append(m_hashes)
only_files = (os.path.join(dirpath, file)
for file in filenames
if os.path.join(dirpath, file) not in all_links)
for filepath in only_files:
m_hashes = compute_blob_metadata(filepath)
hashes.append(m_hashes)
ls_hashes[dirpath] = hashes
dir_hashes = []
subdirs = (os.path.join(dirpath, dir)
for dir in dirnames
if os.path.join(dirpath, dir)
not in all_links)
for fulldirname in subdirs:
tree_hash = compute_tree_metadata(fulldirname, ls_hashes)
dir_hashes.append(tree_hash)
ls_hashes[dirpath].extend(dir_hashes)
# compute the current directory hashes
root_hash = {
'sha1_git': compute_directory_git_sha1(rootdir, ls_hashes),
'path': rootdir,
'name': os.path.basename(rootdir),
'perms': GitPerm.TREE,
'type': GitType.TREE
}
ls_hashes[ROOT_TREE_KEY] = [root_hash]
return ls_hashes
def recompute_sha1_in_memory(root, deeper_rootdir, objects):
"""Recompute git sha1 from directory deeper_rootdir to root.
This function relies exclusively on objects for hashes.
It expects the deeper_rootdir and every key below that path to be updated.
Args:
- root: Upper root directory (so same as
objects[ROOT_TREE_KEY][0]['path'])
- rootdir: Root directory from which to begin the git hash computation
- objects: objects dictionary as per returned by
`walk_and_compute_sha1_from_directory`
Returns:
Dictionary mapping each <path-name> key to a list of
directory entries.
Each directory entry is a dictionary with keys:
- 'perms'
- 'type'
- 'name'
- 'sha1_git'
- and specifically content: 'sha1', 'sha256', ...
Note:
One special key is ROOT_TREE_KEY to indicate the upper root of the
directory (this is the revision's directory).
Raises:
Nothing
If something is raised, this is a programmatic error.
"""
# list of paths to update from bottom to top
upper_root = os.path.dirname(root)
rootdir = os.path.dirname(deeper_rootdir)
while rootdir != upper_root:
files = objects.get(rootdir, None)
if files:
ls_hashes = []
for hashfile in files:
fulldirname = hashfile['path']
if hashfile['type'] == GitType.TREE:
tree_hash = compute_tree_metadata(fulldirname, objects)
ls_hashes.append(tree_hash)
else:
ls_hashes.append(hashfile)
objects[rootdir] = ls_hashes
rootdir = os.path.dirname(rootdir)
# update root
objects[ROOT_TREE_KEY][0]['sha1_git'] = compute_directory_git_sha1(root,
objects)
return objects
def update_checksums_from(changed_paths, objects,
dir_ok_fn=lambda dirpath: True):
"""Given a list of changed paths, recompute the checksums only where needed.
Args:
changed_paths: List of dictionary representing path changes.
The dictionary has the form:
- path: the full path to the file Added, Modified or Deleted
- action: A, M or D
objects: dictionary returned by `walk_and_compute_sha1_from_directory`.
Returns:
Dictionary returned by `walk_and_compute_sha1_from_directory`
updated (mutated) according to necessary modifications.
"""
root = objects[ROOT_TREE_KEY][0]['path']
# compute the list of changed paths to update (no action discrimination is
# necessary here since we'll walk back the fs from the deeper node's
# directory, so every deletion, add or modification will be seen)
# FIXME: Compute the lowest common ancestor to reduce the computations
# FIXME: if one changed path is a file at the rootdir, we recompute all
# from disk
for changed_path in changed_paths:
path = changed_path['path']
if changed_path['action'] == 'D':
new_objects = {k: objects[k] for k in objects.keys()
if not k.startswith(path)}
objects = new_objects
rootdir = os.path.dirname(path)
if not os.path.exists(rootdir):
objects.pop(rootdir, None)
continue
# recompute from disk the checksums
hashes = walk_and_compute_sha1_from_directory(rootdir, dir_ok_fn)
# update the objects with new checksums for the arborescence tree below
# rootdir
for d in (k for k in hashes.keys() if k != ROOT_TREE_KEY):
objects[d] = hashes[d]
# now recompute the hashes in memory from deeper_rootdir to root
objects = recompute_sha1_in_memory(root, rootdir, objects)
return objects
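Putting the two entry points of this module together, a hedged sketch of the intended flow (the repository path is made up; paths are bytes, matching ROOT_TREE_KEY = b''):

.. code-block:: python

    # Full walk from disk, computing git hashes bottom-up.
    objects = walk_and_compute_sha1_from_directory(b"/tmp/repo")
    root_entry = objects[ROOT_TREE_KEY][0]
    print(root_entry["sha1_git"].hex())

    # One file changed: recompute the affected subtree from disk, then fix
    # up the hashes above it in memory.
    objects = update_checksums_from(
        [{"path": b"/tmp/repo/README", "action": "M"}],
        objects,
    )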
# Copyright (C) 2015 The Software Heritage developers
# Copyright (C) 2015-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Module in charge of hashing function definitions. This is the base
module use to compute swh's hashes.
Only a subset of hashing algorithms is supported as defined in the
ALGORITHMS set. Any provided algorithms not in that list will result
in a ValueError explaining the error.
This module defines a MultiHash class to ease the softwareheritage
hashing algorithms computation. This allows to compute hashes from
file object, path, data using a similar interface as what the standard
hashlib module provides.
Basic usage examples:
- file object: MultiHash.from_file(
file_object, hash_names=DEFAULT_ALGORITHMS).digest()
- path (filepath): MultiHash.from_path(b'foo').hexdigest()
- data (bytes): MultiHash.from_data(b'foo').bytehexdigest()
"Complex" usage, defining a swh hashlib instance first:
- To compute the length as well, add 'length' to the set of algorithms to
compute, for example:
.. code-block:: python
h = MultiHash(hash_names=set({'length'}).union(DEFAULT_ALGORITHMS))
with open(filepath, 'rb') as f:
h.update(f.read(HASH_BLOCK_SIZE))
hashes = h.digest() # returns a dict of {hash_algo_name: hash_in_bytes}
- To write data out while computing its hashes (from a stream), for example:
.. code-block:: python
h = MultiHash(length=length)
with open(filepath, 'wb') as f:
for chunk in r.iter_content(): # r a stream of sort
h.update(chunk)
f.write(chunk)
hashes = h.hexdigest() # returns a dict of {hash_algo_name: hash_in_hex}
"""
import binascii
import functools
import hashlib
from io import BytesIO
import os
from typing import Callable, Dict, Optional, Union
ALGORITHMS = set(
["sha1", "sha256", "sha1_git", "blake2s256", "blake2b512", "md5", "sha512"]
)
"""Hashing algorithms supported by this module"""
# supported hashing algorithms
ALGORITHMS = set(['sha1', 'sha256', 'sha1_git'])
DEFAULT_ALGORITHMS = set(["sha1", "sha256", "sha1_git", "blake2s256"])
"""Algorithms computed by default when calling the functions from this module.
Subset of :const:`ALGORITHMS`.
"""
# should be a multiple of 64 (sha1/sha256's block size)
# FWIW coreutils' sha1sum uses 32768
HASH_BLOCK_SIZE = 32768
"""Block size for streaming hash computations made in this module"""
_blake2_hash_cache: Dict[str, Callable] = {}
def _new_git_hash(base_algo, git_type, length):
"""Initialize a digest object (as returned by python's hashlib) for the
requested algorithm, and feed it with the header for a git object of the
given type and length.
The header for hashing a git object consists of:
class MultiHash:
"""Hashutil class to support multiple hashes computation.
Args:
hash_names (set): Set of hash algorithms (+ optionally length)
to compute hashes (cf. DEFAULT_ALGORITHMS)
length (int): Length of the total sum of chunks to read
If the length is provided as algorithm, the length is also
computed and returned.
"""
def __init__(self, hash_names=DEFAULT_ALGORITHMS, length=None):
self.state = {}
self.track_length = False
for name in hash_names:
if name == "length":
self.state["length"] = 0
self.track_length = True
else:
self.state[name] = _new_hash(name, length)
@classmethod
def from_state(cls, state, track_length):
ret = cls([])
ret.state = state
ret.track_length = track_length
return ret
@classmethod
def from_file(cls, fobj, hash_names=DEFAULT_ALGORITHMS, length=None):
ret = cls(length=length, hash_names=hash_names)
while True:
chunk = fobj.read(HASH_BLOCK_SIZE)
if not chunk:
break
ret.update(chunk)
return ret
@classmethod
def from_path(cls, path, hash_names=DEFAULT_ALGORITHMS):
length = os.path.getsize(path)
with open(path, "rb") as f:
ret = cls.from_file(f, hash_names=hash_names, length=length)
return ret
@classmethod
def from_data(cls, data, hash_names=DEFAULT_ALGORITHMS):
length = len(data)
fobj = BytesIO(data)
return cls.from_file(fobj, hash_names=hash_names, length=length)
def update(self, chunk):
for name, h in self.state.items():
if name == "length":
continue
h.update(chunk)
if self.track_length:
self.state["length"] += len(chunk)
def digest(self):
return {
name: h.digest() if name != "length" else h
for name, h in self.state.items()
}
def hexdigest(self):
return {
name: h.hexdigest() if name != "length" else h
for name, h in self.state.items()
}
def bytehexdigest(self):
return {
name: hash_to_bytehex(h.digest()) if name != "length" else h
for name, h in self.state.items()
}
def copy(self):
copied_state = {
name: h.copy() if name != "length" else h for name, h in self.state.items()
}
return self.from_state(copied_state, self.track_length)
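A short usage sketch of the class above, complementing the module docstring examples (the digest shown is the well-known sha1 of b"foo"):

.. code-block:: python

    h = MultiHash.from_data(b"foo", hash_names={"sha1", "length"})
    digests = h.hexdigest()
    # digests == {"sha1": "0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33",
    #             "length": 3}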
def _new_blake2_hash(algo):
"""Return a function that initializes a blake2 hash."""
if algo in _blake2_hash_cache:
return _blake2_hash_cache[algo]()
lalgo = algo.lower()
if not lalgo.startswith("blake2"):
raise ValueError("Algorithm %s is not a blake2 hash" % algo)
blake_family = lalgo[:7]
digest_size = None
if lalgo[7:]:
try:
digest_size, remainder = divmod(int(lalgo[7:]), 8)
except ValueError:
raise ValueError("Unknown digest size for algo %s" % algo) from None
if remainder:
raise ValueError(
"Digest size for algorithm %s must be a multiple of 8" % algo
)
blake2 = getattr(hashlib, blake_family)
_blake2_hash_cache[algo] = lambda: blake2(digest_size=digest_size)
return _blake2_hash_cache[algo]()
def _new_hashlib_hash(algo):
"""Initialize a digest object from hashlib.
Handle the swh-specific names for the blake2-related algorithms
"""
if algo.startswith("blake2"):
return _new_blake2_hash(algo)
else:
return hashlib.new(algo)
def git_object_header(git_type: str, length: int) -> bytes:
"""Returns the header for a git object of the given type and length.
The header of a git object consists of:
- The type of the object (encoded in ASCII)
- One ASCII space (\x20)
- The length of the object (decimal encoded in ASCII)
- One NUL byte
Args:
base_algo: a hashlib-supported algorithm
base_algo (str from :const:`ALGORITHMS`): a hashlib-supported algorithm
git_type: the type of the git object (supposedly one of 'blob',
'commit', 'tag', 'tree')
length: the length of the git object you're encoding
@@ -37,25 +218,36 @@ def _new_git_hash(base_algo, git_type, length):
Returns:
a hashutil.hash object
"""
h = hashlib.new(base_algo)
git_header = '%s %d\0' % (git_type, length)
h.update(git_header.encode('ascii'))
return h
git_object_types = {
"blob",
"tree",
"commit",
"tag",
"snapshot",
"raw_extrinsic_metadata",
"extid",
}
if git_type not in git_object_types:
raise ValueError(
"Unexpected git object type %s, expected one of %s"
% (git_type, ", ".join(sorted(git_object_types)))
)
return ("%s %d\0" % (git_type, length)).encode("ascii")
def _new_hash(algo, length=None):
"""Initialize a digest object (as returned by python's hashlib) for the
requested algorithm. See the constant ALGORITHMS for the list of supported
algorithms. If a git-specific hashing algorithm is requested (e.g.,
"sha1_git"), the hashing object will be pre-fed with the needed header; for
this to work, length must be given.
def _new_hash(algo: str, length: Optional[int] = None):
"""Initialize a digest object (as returned by python's hashlib) for
the requested algorithm. See the constant ALGORITHMS for the list
of supported algorithms. If a git-specific hashing algorithm is
requested (e.g., "sha1_git"), the hashing object will be pre-fed
with the needed header; for this to work, length must be given.
Args:
algo: a hashing algorithm (one of ALGORITHMS)
length: the length of the hashed payload (needed for git-specific
algorithms)
algo (str): a hashing algorithm (one of ALGORITHMS)
length (int): the length of the hashed payload (needed for
git-specific algorithms)
Returns:
a hashutil.hash object
@@ -63,125 +255,99 @@ def _new_hash(algo, length=None):
Raises:
ValueError if algo is unknown, or length is missing for a git-specific
hash.
"""
if algo not in ALGORITHMS:
raise ValueError('Unexpected hashing algorithm %s, '
'expected one of %s' %
(algo, ', '.join(sorted(ALGORITHMS))))
raise ValueError(
"Unexpected hashing algorithm %s, expected one of %s"
% (algo, ", ".join(sorted(ALGORITHMS)))
)
h = None
if algo.endswith('_git'):
if algo.endswith("_git"):
if length is None:
raise ValueError('Missing length for git hashing algorithm')
raise ValueError("Missing length for git hashing algorithm")
base_algo = algo[:-4]
h = _new_git_hash(base_algo, 'blob', length)
else:
h = hashlib.new(algo)
h = _new_hashlib_hash(base_algo)
h.update(git_object_header("blob", length))
return h
return h
return _new_hashlib_hash(algo)
def hash_file(fobj, length=None, algorithms=ALGORITHMS, chunk_cb=None):
"""Hash the contents of the given file object with the given algorithms.
Args:
fobj: a file-like object
length: the length of the contents of the file-like object (for the
git-specific algorithms)
algorithms: the hashing algorithms used
Returns: a dict mapping each algorithm to a bytes digest.
Raises:
ValueError if algorithms contains an unknown hash algorithm.
"""
hashes = {algo: _new_hash(algo, length) for algo in algorithms}
while True:
chunk = fobj.read(HASH_BLOCK_SIZE)
if not chunk:
break
for hash in hashes.values():
hash.update(chunk)
if chunk_cb:
chunk_cb(chunk)
return {algo: hash.digest() for algo, hash in hashes.items()}
def hash_git_data(data, git_type, base_algo="sha1"):
"""Hash the given data as a git object of type git_type.
Args:
data: a bytes object
git_type: the git object type
base_algo: the base hashing algorithm used (default: sha1)
Returns: the resulting bytes digest
Raises:
ValueError if the git_type is unexpected.
"""
h = _new_hashlib_hash(base_algo)
h.update(git_object_header(git_type, len(data)))
h.update(data)
return h.digest()
def hash_path(path, algorithms=ALGORITHMS, chunk_cb=None):
"""Hash the contents of the file at the given path with the given algorithms.
Args:
path: the path of the file to hash
algorithms: the hashing algorithms used
chunk_cb: a callback
Returns: a dict mapping each algorithm to a bytes digest.
Raises:
ValueError if algorithms contains an unknown hash algorithm.
OSError on file access error
"""
length = os.path.getsize(path)
with open(path, 'rb') as fobj:
return hash_file(fobj, length, algorithms, chunk_cb)
@functools.lru_cache()
def hash_to_hex(hash: Union[str, bytes]) -> str:
"""Converts a hash (in hex or bytes form) to its hexadecimal ascii form
Args:
hash (str or bytes): a :class:`bytes` hash or a :class:`str` containing
the hexadecimal form of the hash
Returns:
str: the hexadecimal form of the hash
"""
if isinstance(hash, str):
return hash
return binascii.hexlify(hash).decode("ascii")
def hash_data(data, algorithms=ALGORITHMS):
"""Hash the given binary blob with the given algorithms.
Args:
data: a bytes object
algorithms: the hashing algorithms used
Returns: a dict mapping each algorithm to a bytes digest
Raises:
TypeError if data does not support the buffer interface.
ValueError if algorithms contains an unknown hash algorithm.
"""
fobj = BytesIO(data)
return hash_file(fobj, len(data), algorithms)
@functools.lru_cache()
def hash_to_bytehex(hash: bytes) -> bytes:
"""Converts a hash to its hexadecimal bytes representation
Args:
hash (bytes): a :class:`bytes` hash
Returns:
bytes: the hexadecimal form of the hash, as :class:`bytes`
"""
return binascii.hexlify(hash)
def hash_git_data(data, git_type, base_algo='sha1'):
"""Hash the given data as a git object of type git_type.
Args:
data: a bytes object
git_type: the git object type
base_algo: the base hashing algorithm used (default: sha1)
Returns: a dict mapping each algorithm to a bytes digest
Raises:
ValueError if the git_type is unexpected.
"""
git_object_types = {'blob', 'tree', 'commit', 'tag'}
if git_type not in git_object_types:
raise ValueError('Unexpected git object type %s, expected one of %s' %
(git_type, ', '.join(sorted(git_object_types))))
h = _new_git_hash(base_algo, git_type, len(data))
h.update(data)
return h.digest()
@functools.lru_cache()
def hash_to_bytes(hash: Union[str, bytes]) -> bytes:
"""Converts a hash (in hex or bytes form) to its raw bytes form
Args:
hash (str or bytes): a :class:`bytes` hash or a :class:`str` containing
the hexadecimal form of the hash
Returns:
bytes: the :class:`bytes` form of the hash
"""
if isinstance(hash, bytes):
return hash
return bytes.fromhex(hash)
@functools.lru_cache()
def hash_to_hex(hash):
"""Converts a hash (in hex or bytes form) to its hexadecimal ascii form"""
if isinstance(hash, str):
return hash
return binascii.hexlify(hash).decode('ascii')
@functools.lru_cache()
def hash_to_bytes(hash):
"""Converts a hash (in hex or bytes form) to its raw bytes form"""
if isinstance(hash, bytes):
return hash
return bytes.fromhex(hash)
def bytehex_to_hash(hex: bytes) -> bytes:
"""Converts a hexadecimal bytes representation of a hash to that hash
Args:
hash (bytes): a :class:`bytes` containing the hexadecimal form of the
hash encoded in ascii
Returns:
bytes: the :class:`bytes` form of the hash
"""
return hash_to_bytes(hex.decode())
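The converters above are inverses of one another; a round trip, using the empty-blob sha1 as an example value:

.. code-block:: python

    hex_form = "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"
    raw = hash_to_bytes(hex_form)  # 20 raw bytes
    assert hash_to_hex(raw) == hex_form
    assert hash_to_bytehex(raw) == hex_form.encode("ascii")
    assert bytehex_to_hash(hex_form.encode("ascii")) == raw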
# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Merkle tree data structure"""
from __future__ import annotations
import abc
from typing import Any, Dict, Iterator, List, Set
class MerkleNode(dict, metaclass=abc.ABCMeta):
"""Representation of a node in a Merkle Tree.
A (generalized) `Merkle Tree`_ is a tree in which every node is labeled
with a hash of its own data and the hash of its children.
.. _Merkle Tree: https://en.wikipedia.org/wiki/Merkle_tree
In pseudocode::
node.hash = hash(node.data
+ sum(child.hash for child in node.children))
This class efficiently implements the Merkle Tree data structure on top of
a Python :class:`dict`, minimizing hash computations and new data
collections when updating nodes.
Node data is stored in the :attr:`data` attribute, while (named) children
are stored as items of the underlying dictionary.
Addition, update and removal of objects are instrumented to automatically
invalidate the hashes of the current node as well as its registered
parents; it also resets the collection status of the objects so the updated
objects can be collected.
The collection of updated data from the tree is implemented through the
:func:`collect` function and associated helpers.
"""
__slots__ = ["parents", "data", "__hash", "collected"]
data: Dict
"""data associated to the current node"""
parents: List
"""known parents of the current node"""
collected: bool
"""whether the current node has been collected"""
def __init__(self, data=None):
super().__init__()
self.parents = []
self.data = data
self.__hash = None
self.collected = False
def __eq__(self, other):
return (
isinstance(other, MerkleNode)
and super().__eq__(other)
and self.data == other.data
)
def __ne__(self, other):
return not self.__eq__(other)
def invalidate_hash(self):
"""Invalidate the cached hash of the current node."""
if not self.__hash:
return
self.__hash = None
self.collected = False
for parent in self.parents:
parent.invalidate_hash()
def update_hash(self, *, force=False) -> Any:
"""Recursively compute the hash of the current node.
Args:
force (bool): invalidate the cache and force the computation for
this node and all children.
"""
if self.__hash and not force:
return self.__hash
if force:
self.invalidate_hash()
for child in self.values():
child.update_hash(force=force)
self.__hash = self.compute_hash()
return self.__hash
@property
def hash(self) -> Any:
"""The hash of the current node, as calculated by
:func:`compute_hash`.
"""
return self.update_hash()
def __hash__(self):
return hash(self.hash)
@abc.abstractmethod
def compute_hash(self) -> Any:
"""Compute the hash of the current node.
The hash should depend on the data of the node, as well as on hashes
of the children nodes.
"""
raise NotImplementedError("Must implement compute_hash method")
def __setitem__(self, name, new_child):
"""Add a child, invalidating the current hash"""
self.invalidate_hash()
super().__setitem__(name, new_child)
new_child.parents.append(self)
def __delitem__(self, name):
"""Remove a child, invalidating the current hash"""
if name in self:
self.invalidate_hash()
self[name].parents.remove(self)
super().__delitem__(name)
else:
raise KeyError(name)
def update(self, new_children):
"""Add several named children from a dictionary"""
if not new_children:
return
self.invalidate_hash()
for name, new_child in new_children.items():
new_child.parents.append(self)
if name in self:
self[name].parents.remove(self)
super().update(new_children)
def get_data(self, **kwargs):
"""Retrieve and format the collected data for the current node, for use by
:func:`collect`.
Can be overridden, for instance when you want the collected data to
contain information about the child nodes.
Arguments:
kwargs: allow subclasses to alter behaviour depending on how
:func:`collect` is called.
Returns:
data formatted for :func:`collect`
"""
return self.data
def collect_node(self) -> Set[MerkleNode]:
"""Collect the current node if it has not been yet, for use by :func:`collect`."""
if not self.collected:
self.collected = True
return {self}
else:
return set()
def collect(self) -> Set[MerkleNode]:
"""Collect the added and modified nodes in the subtree rooted at `self`
since the last collect operation.
Returns:
A :class:`set` of collected nodes
"""
ret = self.collect_node()
for child in self.values():
ret.update(child.collect())
return ret
def reset_collect(self):
"""Recursively unmark collected nodes in the subtree rooted at `self`.
This lets the caller use :func:`collect` again.
"""
self.collected = False
for child in self.values():
child.reset_collect()
def iter_tree(self, dedup=True) -> Iterator[MerkleNode]:
"""Yields all children nodes, recursively. Common nodes are deduplicated
by default (deduplication can be turned off by setting the given argument
'dedup' to False).
"""
yield from self._iter_tree(seen=set(), dedup=dedup)
def _iter_tree(self, seen: Set[bytes], dedup) -> Iterator[MerkleNode]:
if self.hash not in seen:
if dedup:
seen.add(self.hash)
yield self
for child in self.values():
yield from child._iter_tree(seen=seen, dedup=dedup)
class MerkleLeaf(MerkleNode):
"""A leaf to a Merkle tree.
A Merkle leaf is simply a Merkle node with children disabled.
"""
__slots__: List[str] = []
def __setitem__(self, name, child):
raise ValueError("%s is a leaf" % self.__class__.__name__)
def __getitem__(self, name):
raise ValueError("%s is a leaf" % self.__class__.__name__)
def __delitem__(self, name):
raise ValueError("%s is a leaf" % self.__class__.__name__)
def update(self, new_children):
"""Children update operation. Disabled for leaves."""
raise ValueError("%s is a leaf" % self.__class__.__name__)
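Since compute_hash is abstract, the tree is used through a concrete subclass. A toy sketch, not part of swh.model (the hashing scheme here is chosen only for illustration):

.. code-block:: python

    import hashlib

    class Node(MerkleNode):
        def compute_hash(self):
            h = hashlib.sha1(self.data or b"")
            # Fold in children hashes in a deterministic order.
            for name in sorted(self):
                h.update(self[name].hash)
            return h.digest()

    root = Node(b"root")
    root[b"child"] = Node(b"leaf")
    first = root.hash
    root[b"other"] = Node(b"leaf2")  # invalidates the cached hash
    assert root.hash != first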
# Marker file for PEP 561.