Skip to content
Snippets Groups Projects
Commit 59742ac0 authored by Nicolas Dandrimont
Browse files

swh.model.hashutil: Add hashing utilities to swh.model

parent 76eb3640
No related branches found
No related tags found
No related merge requests found
# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import hashlib
from io import BytesIO
# Names of the supported hashing algorithms; a '_git' suffix selects the
# git-object flavour of the base algorithm.
ALGORITHMS = {'sha1', 'sha256', 'sha1_git'}

# Read size used when hashing file objects. Should be a multiple of 64
# (sha1/sha256's block size); FWIW coreutils' sha1sum uses 32768.
HASH_BLOCK_SIZE = 32 * 1024
def _new_git_hash(base_algo, git_type, length):
    """Create a hashlib digest object pre-seeded with a git object header.

    Git hashes an object as a header followed by the payload. The header
    is made of:
     - the object type, encoded in ASCII
     - one ASCII space (0x20)
     - the payload length, as a decimal number encoded in ASCII
     - one NUL byte

    Only the header is fed here; the caller is expected to update the
    returned digest with the payload itself.

    Args:
        base_algo: name of a hashlib-supported algorithm
        git_type: the git object type (supposedly one of 'blob',
            'commit', 'tag', 'tree')
        length: the length of the git object being hashed

    Returns:
        a hashlib digest object, already updated with the git header
    """
    digest = hashlib.new(base_algo)
    header = ('%s %d\0' % (git_type, length)).encode('ascii')
    digest.update(header)
    return digest
def _new_hash(algo, length=None):
    """Create a hashlib digest object for one of the supported algorithms.

    The supported names are listed in the ALGORITHMS constant. For a
    git-specific algorithm (a name ending in '_git', e.g. "sha1_git") the
    returned digest has already been fed the git 'blob' header, which is
    why the payload length is required in that case.

    Args:
        algo: a hashing algorithm (one of ALGORITHMS)
        length: the length of the hashed payload (needed for git-specific
            algorithms)

    Returns:
        a hashlib digest object

    Raises:
        ValueError if algo is unknown, or length is missing for a
        git-specific hash.
    """
    if algo not in ALGORITHMS:
        supported = ', '.join(sorted(ALGORITHMS))
        raise ValueError('Unexpected hashing algorithm %s, '
                         'expected one of %s' % (algo, supported))

    if not algo.endswith('_git'):
        return hashlib.new(algo)

    if length is None:
        raise ValueError('Missing length for git hashing algorithm')

    # Drop the '_git' suffix to recover the underlying hashlib algorithm.
    return _new_git_hash(algo[:-len('_git')], 'blob', length)
def hash_file(fobj, length=None, algorithms=ALGORITHMS):
    """Compute several digests of a file object's contents in one pass.

    The file object is read in chunks of HASH_BLOCK_SIZE until a read
    returns nothing; every chunk is fed to each requested digest.

    Args:
        fobj: a file-like object
        length: the length of the contents of the file-like object (for
            the git-specific algorithms)
        algorithms: the hashing algorithms used

    Returns: a dict mapping each algorithm to a hexadecimal digest

    Raises:
        ValueError if algorithms contains an unknown hash algorithm, or
        if a git-specific algorithm is requested without a length.
    """
    digests = {name: _new_hash(name, length) for name in algorithms}

    chunk = fobj.read(HASH_BLOCK_SIZE)
    while chunk:
        for digest in digests.values():
            digest.update(chunk)
        chunk = fobj.read(HASH_BLOCK_SIZE)

    return {name: digest.hexdigest() for name, digest in digests.items()}
def hash_data(data, algorithms=ALGORITHMS):
    """Compute several digests of an in-memory binary blob.

    The blob is wrapped in a BytesIO and handed to hash_file, with its
    length supplied so that git-specific algorithms work.

    Args:
        data: a bytes object
        algorithms: the hashing algorithms used

    Returns: a dict mapping each algorithm to a hexadecimal digest

    Raises:
        TypeError if data does not support the buffer interface.
        ValueError if algorithms contains an unknown hash algorithm.
    """
    return hash_file(BytesIO(data), length=len(data), algorithms=algorithms)
def hash_git_data(data, git_type, base_algo='sha1'):
    """Hash the given data as a git object of type git_type.

    The digest covers the git object header (type, length, NUL byte)
    followed by the data itself.

    Args:
        data: a bytes object
        git_type: the git object type (one of 'blob', 'commit', 'tag',
            'tree')
        base_algo: the base hashing algorithm used (default: sha1)

    Returns:
        the hexadecimal digest of the git object, as a single string
        (not a dict, unlike hash_data and hash_file)

    Raises:
        ValueError if the git_type is unexpected, or if base_algo is not
        an algorithm supported by hashlib.
    """
    git_object_types = {'blob', 'tree', 'commit', 'tag'}

    # Validate up front so the error names the accepted object types.
    if git_type not in git_object_types:
        raise ValueError('Unexpected git object type %s, expected one of %s' %
                         (git_type, ', '.join(sorted(git_object_types))))

    h = _new_git_hash(base_algo, git_type, len(data))
    h.update(data)

    return h.hexdigest()
# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import io
import unittest
from nose.tools import istest
from swh.model import hashutil
class Hashutil(unittest.TestCase):
    """Tests for swh.model.hashutil, run against a small fixed payload."""

    def setUp(self):
        # Reference payload and its expected digests for each algorithm.
        blob_sha1_git = '568aaf43d83b2c3df8067f3bedbb97d83260be6d'

        self.data = b'1984\n'

        self.hex_checksums = {
            'sha1': '62be35bf00ff0c624f4a621e2ea5595a049e0731',
            'sha1_git': blob_sha1_git,
            'sha256': '26602113b4b9afd9d55466b08580d3c2'
                      '4a9b50ee5b5866c0d91fab0e65907311',
        }

        # Expected sha1 of the payload hashed as each git object type; the
        # 'blob' digest is the same value as the 'sha1_git' checksum.
        self.git_checksums = {
            'blob': blob_sha1_git,
            'tree': '5b2e883aa33d2efab98442693ea4dd5f1b8871b0',
            'commit': '79e4093542e72f0fcb7cbd75cb7d270f9254aa8f',
            'tag': 'd6bf62466f287b4d986c545890716ce058bddf67',
        }

    @istest
    def hash_data(self):
        # Default algorithms: every supported digest must match.
        self.assertEqual(hashutil.hash_data(self.data), self.hex_checksums)

    @istest
    def hash_data_unknown_hash(self):
        with self.assertRaises(ValueError) as cm:
            hashutil.hash_data(self.data, ['unknown-hash'])

        # Checked after the context manager exits, once the exception has
        # been captured.
        message = cm.exception.args[0]
        self.assertIn('Unexpected hashing algorithm', message)
        self.assertIn('unknown-hash', message)

    @istest
    def hash_git_data(self):
        computed = {}
        for git_type in self.git_checksums:
            computed[git_type] = hashutil.hash_git_data(self.data, git_type)

        self.assertEqual(computed, self.git_checksums)

    @istest
    def hash_git_data_unknown_git_type(self):
        with self.assertRaises(ValueError) as cm:
            hashutil.hash_git_data(self.data, 'unknown-git-type')

        message = cm.exception.args[0]
        self.assertIn('Unexpected git object type', message)
        self.assertIn('unknown-git-type', message)

    @istest
    def hash_file(self):
        stream = io.BytesIO(self.data)
        computed = hashutil.hash_file(stream, length=len(self.data))

        self.assertEqual(computed, self.hex_checksums)

    @istest
    def hash_file_missing_length(self):
        # A git-specific algorithm cannot be computed without the length.
        stream = io.BytesIO(self.data)

        with self.assertRaises(ValueError) as cm:
            hashutil.hash_file(stream, algorithms=['sha1_git'])

        self.assertIn('Missing length', cm.exception.args[0])
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment