# File: swh/model/hashutil.py
#
# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import hashlib
from io import BytesIO

# supported hashing algorithms
ALGORITHMS = {'sha1', 'sha256', 'sha1_git'}

# should be a multiple of 64 (sha1/sha256's block size)
# FWIW coreutils' sha1sum uses 32768
HASH_BLOCK_SIZE = 32768

# git object types accepted by hash_git_data
_GIT_OBJECT_TYPES = frozenset(['blob', 'tree', 'commit', 'tag'])


def _byte_length(data):
    """Return the size, in bytes, of a bytes-like object.

    len() counts items, not bytes, for buffers whose items are wider than one
    byte (e.g. array.array('H', ...)), whereas the git object header must
    carry the number of bytes that are actually hashed.

    Args:
        data: an object supporting the buffer interface

    Returns:
        the number of bytes exposed by data

    Raises:
        TypeError if data does not support the buffer interface.
    """
    with memoryview(data) as view:
        return view.nbytes


def _new_git_hash(base_algo, git_type, length):
    """Initialize a digest object (as returned by python's hashlib) for the
    requested algorithm, and feed it with the header for a git object of the
    given type and length.

    The header for hashing a git object consists of:
     - The type of the object (encoded in ASCII)
     - One ASCII space (0x20)
     - The length of the object (decimal encoded in ASCII)
     - One NUL byte

    Args:
        base_algo: a hashlib-supported algorithm
        git_type: the type of the git object (supposedly one of 'blob',
                  'commit', 'tag', 'tree')
        length: the length, in bytes, of the git object you're encoding

    Returns:
        a hashlib hash object, already updated with the git header
    """
    hasher = hashlib.new(base_algo)
    git_header = '%s %d\0' % (git_type, length)
    hasher.update(git_header.encode('ascii'))

    return hasher


def _new_hash(algo, length=None):
    """Initialize a digest object (as returned by python's hashlib) for the
    requested algorithm. See the constant ALGORITHMS for the list of supported
    algorithms. If a git-specific hashing algorithm is requested (e.g.,
    "sha1_git"), the hashing object will be pre-fed with the header of a git
    'blob' object; for this to work, length must be given.

    Args:
        algo: a hashing algorithm (one of ALGORITHMS)
        length: the length of the hashed payload (needed for git-specific
                algorithms)

    Returns:
        a hashlib hash object

    Raises:
        ValueError if algo is unknown, or length is missing for a git-specific
        hash.
    """
    if algo not in ALGORITHMS:
        raise ValueError('Unexpected hashing algorithm %s, '
                         'expected one of %s' %
                         (algo, ', '.join(sorted(ALGORITHMS))))

    if not algo.endswith('_git'):
        return hashlib.new(algo)

    if length is None:
        raise ValueError('Missing length for git hashing algorithm')

    # 'sha1_git' -> 'sha1': the part before the suffix names the hashlib
    # algorithm applied to the git object (header + payload)
    base_algo = algo[:-len('_git')]
    return _new_git_hash(base_algo, 'blob', length)


def hash_file(fobj, length=None, algorithms=ALGORITHMS):
    """Hash the contents of the given file object with the given algorithms.

    The contents are read in chunks of HASH_BLOCK_SIZE bytes, and every chunk
    is fed to all the requested hashes, so the file is only read once.

    Note that the default set of algorithms contains the git-specific
    'sha1_git', which requires length to be given.

    Args:
        fobj: a file-like object, whose read() returns bytes
        length: the length of the contents of the file-like object (for the
                git-specific algorithms)
        algorithms: the hashing algorithms used

    Returns: a dict mapping each algorithm to a hexadecimal digest

    Raises:
        ValueError if algorithms contains an unknown hash algorithm, or if a
        git-specific algorithm is requested without a length.
    """
    hashers = {algo: _new_hash(algo, length) for algo in algorithms}

    while True:
        chunk = fobj.read(HASH_BLOCK_SIZE)
        if not chunk:
            break
        for hasher in hashers.values():
            hasher.update(chunk)

    return {algo: hasher.hexdigest() for algo, hasher in hashers.items()}


def hash_data(data, algorithms=ALGORITHMS):
    """Hash the given binary blob with the given algorithms.

    Args:
        data: a bytes object (or any object supporting the buffer interface)
        algorithms: the hashing algorithms used

    Returns: a dict mapping each algorithm to a hexadecimal digest

    Raises:
        TypeError if data does not support the buffer interface.
        ValueError if algorithms contains an unknown hash algorithm.
    """
    fobj = BytesIO(data)
    # the git header needs the size in bytes, which differs from len(data)
    # for buffers with multi-byte items
    return hash_file(fobj, _byte_length(data), algorithms)


def hash_git_data(data, git_type, base_algo='sha1'):
    """Hash the given data as a git object of type git_type.

    Args:
        data: a bytes object (or any object supporting the buffer interface)
        git_type: the git object type (one of 'blob', 'commit', 'tag',
                  'tree')
        base_algo: the base hashing algorithm used, as understood by hashlib
                   (default: sha1)

    Returns: the hexadecimal digest of the git object, as a string

    Raises:
        ValueError if the git_type is unexpected.
        TypeError if data does not support the buffer interface.
    """
    if git_type not in _GIT_OBJECT_TYPES:
        raise ValueError('Unexpected git object type %s, expected one of %s' %
                         (git_type, ', '.join(sorted(_GIT_OBJECT_TYPES))))

    hasher = _new_git_hash(base_algo, git_type, _byte_length(data))
    hasher.update(data)

    return hasher.hexdigest()
# File: swh/model/tests/test_hashutil.py
#
# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import io
import unittest

from nose.tools import istest

from swh.model import hashutil


class Hashutil(unittest.TestCase):
    """Checks the hashutil helpers against known digests of one payload."""

    def setUp(self):
        # payload shared by every test, with its expected digests
        self.data = b'1984\n'

        blob_sha1_git = '568aaf43d83b2c3df8067f3bedbb97d83260be6d'

        self.hex_checksums = {
            'sha1': '62be35bf00ff0c624f4a621e2ea5595a049e0731',
            'sha1_git': blob_sha1_git,
            'sha256': '26602113b4b9afd9d55466b08580d3c2'
                      '4a9b50ee5b5866c0d91fab0e65907311',
        }

        # hashing the payload as a git 'blob' must agree with 'sha1_git'
        self.git_checksums = {
            'blob': blob_sha1_git,
            'tree': '5b2e883aa33d2efab98442693ea4dd5f1b8871b0',
            'commit': '79e4093542e72f0fcb7cbd75cb7d270f9254aa8f',
            'tag': 'd6bf62466f287b4d986c545890716ce058bddf67',
        }

    @istest
    def hash_data(self):
        # default algorithms: all the supported ones
        actual = hashutil.hash_data(self.data)
        self.assertEqual(actual, self.hex_checksums)

    @istest
    def hash_data_unknown_hash(self):
        with self.assertRaises(ValueError) as ctx:
            hashutil.hash_data(self.data, ['unknown-hash'])

        message = ctx.exception.args[0]
        self.assertIn('Unexpected hashing algorithm', message)
        self.assertIn('unknown-hash', message)

    @istest
    def hash_git_data(self):
        # one digest per known git object type
        actual = {}
        for git_type in self.git_checksums:
            actual[git_type] = hashutil.hash_git_data(self.data, git_type)

        self.assertEqual(actual, self.git_checksums)

    @istest
    def hash_git_data_unknown_git_type(self):
        with self.assertRaises(ValueError) as ctx:
            hashutil.hash_git_data(self.data, 'unknown-git-type')

        message = ctx.exception.args[0]
        self.assertIn('Unexpected git object type', message)
        self.assertIn('unknown-git-type', message)

    @istest
    def hash_file(self):
        stream = io.BytesIO(self.data)

        actual = hashutil.hash_file(stream, length=len(self.data))
        self.assertEqual(actual, self.hex_checksums)

    @istest
    def hash_file_missing_length(self):
        # git-specific algorithms cannot be computed without the length
        stream = io.BytesIO(self.data)

        with self.assertRaises(ValueError) as ctx:
            hashutil.hash_file(stream, algorithms=['sha1_git'])

        self.assertIn('Missing length', ctx.exception.args[0])