From 24f8dd4d1bb765e422dbac918e9d2e367286615e Mon Sep 17 00:00:00 2001
From: "Antoine R. Dumont (@ardumont)" <antoine.romain.dumont@gmail.com>
Date: Thu, 16 Mar 2017 12:19:49 +0100
Subject: [PATCH] swh.model.hashutil: Adapt according to latest discussion

- Add module docstring
- Add blake2s256 and blake2b512 in supported algorithms
- Spawn a new variable DEFAULT_ALGORITHMS as default computed
  algorithms for the main functions

Related T692
---
 swh/model/hashutil.py            | 75 ++++++++++++++++++++------------
 swh/model/tests/test_hashutil.py |  8 ++--
 2 files changed, 52 insertions(+), 31 deletions(-)

diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py
index 81c88c3a..aa047869 100644
--- a/swh/model/hashutil.py
+++ b/swh/model/hashutil.py
@@ -3,6 +3,27 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+"""Module in charge of hashing function definitions. This is the base
+module use to compute swh's hashes.
+
+Only a subset of hashing algorithms is supported as defined in the
+ALGORITHMS set. Any provided algorithms not in that list will result
+in a ValueError explaining the error.
+
+This modules defines the following hashing functions:
+
+- hash_file: Hash the contents of the given file object with the given
+  algorithms (defaulting to DEFAULT_ALGORITHMS if none provided).
+
+- hash_data: Hash the given binary blob with the given algorithms
+  (defaulting to DEFAULT_ALGORITHMS if none provided).
+
+- hash_path: Hash the contents of the file at the given path with the
+  given algorithms (defaulting to DEFAULT_ALGORITHMS if none
+  provided).
+
+"""
+
 import binascii
 import functools
 import hashlib
@@ -10,8 +31,11 @@ import os
 
 from io import BytesIO
 
-# supported hashing algorithms
-ALGORITHMS = set(['sha1', 'sha256', 'sha1_git'])
+# Supported algorithms
+ALGORITHMS = set(['sha1', 'sha256', 'sha1_git', 'blake2s256', 'blake2b512'])
+
+# Default algorithms used
+DEFAULT_ALGORITHMS = set(['sha1', 'sha256', 'sha1_git'])
 
 # should be a multiple of 64 (sha1/sha256's block size)
 # FWIW coreutils' sha1sum uses 32768
@@ -47,16 +71,16 @@ def _new_git_hash(base_algo, git_type, length):
 
 
 def _new_hash(algo, length=None):
-    """Initialize a digest object (as returned by python's hashlib) for the
-    requested algorithm. See the constant ALGORITHMS for the list of supported
-    algorithms. If a git-specific hashing algorithm is requested (e.g.,
-    "sha1_git"), the hashing object will be pre-fed with the needed header; for
-    this to work, length must be given.
+    """Initialize a digest object (as returned by python's hashlib) for
+    the requested algorithm. See the constant ALGORITHMS for the list
+    of supported algorithms. If a git-specific hashing algorithm is
+    requested (e.g., "sha1_git"), the hashing object will be pre-fed
+    with the needed header; for this to work, length must be given.
 
     Args:
-        algo: a hashing algorithm (one of ALGORITHMS)
-        length: the length of the hashed payload (needed for git-specific
-                algorithms)
+        algo (str): a hashing algorithm (one of ALGORITHMS)
+        length (int): the length of the hashed payload (needed for
+                git-specific algorithms)
 
     Returns:
         a hashutil.hash object
@@ -64,30 +88,23 @@ def _new_hash(algo, length=None):
     Raises:
         ValueError if algo is unknown, or length is missing for a git-specific
         hash.
+
     """
-    if algo not in ALGORITHMS and ':' not in algo:
-        raise ValueError('Unexpected hashing algorithm %s, '
-                         'expected one of %s' %
-                         (algo, ', '.join(sorted(ALGORITHMS))))
+    if algo not in ALGORITHMS:
+        raise ValueError(
+            'Unexpected hashing algorithm %s, expected one of %s' %
+            (algo, ', '.join(sorted(ALGORITHMS))))
 
-    h = None
     if algo.endswith('_git'):
         if length is None:
             raise ValueError('Missing length for git hashing algorithm')
         base_algo = algo[:-4]
-        h = _new_git_hash(base_algo, 'blob', length)
-    elif ':' in algo:   # variable length hashing algorithms
-        _algo = algo.split(':')
-        base_algo = _algo[0]
-        variable_length = int(_algo[1])
-        h = hashlib.new('%s%s' % (base_algo, variable_length))
-    else:
-        h = hashlib.new(algo)
+        return _new_git_hash(base_algo, 'blob', length)
 
-    return h
+    return hashlib.new(algo)
 
 
-def hash_file(fobj, length=None, algorithms=ALGORITHMS, chunk_cb=None):
+def hash_file(fobj, length=None, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None):
     """Hash the contents of the given file object with the given algorithms.
 
     Args:
@@ -115,8 +132,9 @@ def hash_file(fobj, length=None, algorithms=ALGORITHMS, chunk_cb=None):
     return {algo: hash.digest() for algo, hash in hashes.items()}
 
 
-def hash_path(path, algorithms=ALGORITHMS, chunk_cb=None):
-    """Hash the contents of the file at the given path with the given algorithms.
+def hash_path(path, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None):
+    """Hash the contents of the file at the given path with the given
+       algorithms.
 
     Args:
         path: the path of the file to hash
@@ -128,6 +146,7 @@ def hash_path(path, algorithms=ALGORITHMS, chunk_cb=None):
     Raises:
         ValueError if algorithms contains an unknown hash algorithm.
         OSError on file access error
+
     """
     length = os.path.getsize(path)
     with open(path, 'rb') as fobj:
@@ -136,7 +155,7 @@ def hash_path(path, algorithms=ALGORITHMS, chunk_cb=None):
     return hash
 
 
-def hash_data(data, algorithms=ALGORITHMS):
+def hash_data(data, algorithms=DEFAULT_ALGORITHMS):
     """Hash the given binary blob with the given algorithms.
 
     Args:
diff --git a/swh/model/tests/test_hashutil.py b/swh/model/tests/test_hashutil.py
index 8d14524d..758e24eb 100644
--- a/swh/model/tests/test_hashutil.py
+++ b/swh/model/tests/test_hashutil.py
@@ -131,14 +131,16 @@ class Hashutil(unittest.TestCase):
             hashutil._new_hash('blake2:10')
         except ValueError as e:
             self.assertEquals(str(e),
-                              'unsupported hash type blake210')
+                              'Unexpected hashing algorithm blake2:10, '
+                              'expected one of blake2b512, blake2s256, '
+                              'sha1, sha1_git, sha256')
 
     @patch('swh.model.hashutil.hashlib')
     @istest
     def new_hash_blake2b(self, mock_hashlib):
         mock_hashlib.new.return_value = 'some-hashlib-object'
 
-        h = hashutil._new_hash('blake2b:512')
+        h = hashutil._new_hash('blake2b512')
 
         self.assertEquals(h, 'some-hashlib-object')
         mock_hashlib.new.assert_called_with('blake2b512')
@@ -148,7 +150,7 @@ class Hashutil(unittest.TestCase):
     def new_hash_blake2s(self, mock_hashlib):
         mock_hashlib.new.return_value = 'some-hashlib-object'
 
-        h = hashutil._new_hash('blake2s:256')
+        h = hashutil._new_hash('blake2s256')
 
         self.assertEquals(h, 'some-hashlib-object')
         mock_hashlib.new.assert_called_with('blake2s256')
-- 
GitLab