Verified Commit 34870256 authored by Antoine R. Dumont

hashutil: Migrate towards MultiHash api

parent eb338cda
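The diff below replaces the module-level helpers from swh.model.hashutil (hash_data, hash_path, hash_file) with the MultiHash class. As a reading aid, here is a minimal sketch of the calling pattern being migrated to, using only the calls visible in this diff (MultiHash.from_data, MultiHash.from_path, digest); the absolute import path and the exact set of algorithms returned are assumptions, not something this commit specifies.

```python
# Illustrative sketch of the old vs. new hashing calls seen in this diff.
# Assumes swh.model is installed; swh.model.hashutil is the module the
# relative imports in the diff refer to.
from swh.model.hashutil import MultiHash

data = b'hello world'

# Old pattern (removed below): hashutil.hash_data(data) returned a dict
# mapping algorithm names to digests.
# New pattern: build a MultiHash, then call .digest() for the same dict.
hashes = MultiHash.from_data(data, length=len(data)).digest()

# Hashing a file on disk (replaces hashutil.hash_path):
# hashes = MultiHash.from_path(some_path).digest()
```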
-# Copyright (C) 2017 The Software Heritage developers
+# Copyright (C) 2017-2018 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -7,7 +7,7 @@ import enum
 import os
 import stat
-from . import hashutil
+from .hashutil import MultiHash, HASH_BLOCK_SIZE
 from .merkle import MerkleLeaf, MerkleNode
 from .identifiers import (
     directory_identifier,
@@ -77,8 +77,9 @@ class Content(MerkleLeaf):
           mode (int): a file mode (passed to :func:`mode_to_perms`)
           data (bytes): raw contents of the file
         """
-        ret = hashutil.hash_data(data)
-        ret['length'] = len(data)
+        length = len(data)
+        ret = MultiHash.from_data(data, length=length).digest()
+        ret['length'] = length
         ret['perms'] = mode_to_perms(mode)
         ret['data'] = data
@@ -91,8 +92,8 @@ class Content(MerkleLeaf):
     @classmethod
     def from_file(cls, *, path, data=False, save_path=False):
-        """Compute the Software Heritage content entry corresponding to an on-disk
-        file.
+        """Compute the Software Heritage content entry corresponding to an
+        on-disk file.
         The returned dictionary contains keys useful for both:
         - loading the content in the archive (hashes, `length`)
@@ -103,6 +104,7 @@
               content entry
           data (bool): add the file data to the entry
           save_path (bool): add the file path to the entry
         """
         file_stat = os.lstat(path)
         mode = file_stat.st_mode
@@ -117,17 +119,16 @@
         length = file_stat.st_size
         if not data:
-            ret = hashutil.hash_path(path)
+            ret = MultiHash.from_path(path).digest()
         else:
+            h = MultiHash(length=length)
             chunks = []
-            def append_chunk(x, chunks=chunks):
-                chunks.append(x)
             with open(path, 'rb') as fobj:
-                ret = hashutil.hash_file(fobj, length=length,
-                                         chunk_cb=append_chunk)
+                for chunk in fobj:
+                    h.update(chunk)
+                    chunks.append(chunk)
+            ret = h.digest()
             ret['data'] = b''.join(chunks)
         if save_path:
......
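The last hunk above replaces hash_file's chunk_cb callback with an explicit loop that feeds a MultiHash incrementally. A standalone sketch of that streaming pattern, assuming a hypothetical local file and using only the MultiHash(length=...), update() and digest() calls shown in the hunk:

```python
import os

from swh.model.hashutil import MultiHash

path = 'example.bin'  # hypothetical input file
length = os.path.getsize(path)

# Mirror of the new `else:` branch of Content.from_file: hash the file
# chunk by chunk while keeping the raw bytes for the `data` key.
h = MultiHash(length=length)  # total length is passed up front, as in the diff
chunks = []
with open(path, 'rb') as fobj:
    for chunk in fobj:
        h.update(chunk)       # feed every chunk to the hashers
        chunks.append(chunk)

ret = h.digest()              # dict: algorithm name -> digest
ret['data'] = b''.join(chunks)
```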
@@ -11,8 +11,7 @@ from functools import lru_cache
 from .exceptions import ValidationError
 from .fields.hashes import validate_sha1
-from .hashutil import hash_data, hash_git_data, DEFAULT_ALGORITHMS
-from .hashutil import hash_to_hex
+from .hashutil import hash_git_data, hash_to_hex, MultiHash
 SNAPSHOT = 'snapshot'
@@ -104,7 +103,7 @@ def content_identifier(content):
     """
-    return hash_data(content['data'], DEFAULT_ALGORITHMS)
+    return MultiHash.from_data(content['data']).digest()
 def _sort_key(entry):
......
-# Copyright (C) 2015 The Software Heritage developers
+# Copyright (C) 2015-2018 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 from .exceptions import ValidationError, NON_FIELD_ERRORS
-from . import fields, hashutil
+from . import fields
+from .hashutil import MultiHash, hash_to_bytes
 def validate_content(content):
@@ -44,11 +45,11 @@ def validate_content(content):
 def validate_hashes(content):
     errors = []
     if 'data' in content:
-        hashes = hashutil.hash_data(content['data'])
+        hashes = MultiHash.from_data(content['data']).digest()
         for hash_type, computed_hash in hashes.items():
             if hash_type not in content:
                 continue
-            content_hash = hashutil.hash_to_bytes(content[hash_type])
+            content_hash = hash_to_bytes(content[hash_type])
             if content_hash != computed_hash:
                 errors.append(ValidationError(
                     'hash mismatch in content for hash %(hash)s',
......
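The validate_hashes hunk above recomputes the digests of content['data'] and compares each one to the value supplied by the caller, normalizing it with hash_to_bytes first. A small illustration of that comparison, assuming sha1 is among the algorithms MultiHash computes by default:

```python
from swh.model.hashutil import MultiHash, hash_to_bytes

content = {
    'data': b'foo',
    # caller-supplied hex digest (sha1 of b'foo')
    'sha1': '0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33',
}

computed = MultiHash.from_data(content['data']).digest()
for hash_type, computed_hash in computed.items():
    if hash_type not in content:
        continue
    # normalize hex/bytes input to bytes before comparing, as in the diff
    if hash_to_bytes(content[hash_type]) != computed_hash:
        print('hash mismatch in content for hash', hash_type)
```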