From 7f885ed5506de8f61dbd04e91ff843dcbac1bddc Mon Sep 17 00:00:00 2001 From: "Antoine R. Dumont (@ardumont)" <antoine.romain.dumont@gmail.com> Date: Fri, 14 Sep 2018 01:11:43 +0200 Subject: [PATCH] swh.model.hashutil: Open hash_stream endpoint Related T421 --- swh/model/hashutil.py | 68 ++++++++++++++++++++++++++------ swh/model/tests/test_hashutil.py | 14 +++++++ 2 files changed, 70 insertions(+), 12 deletions(-) diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py index 6675b5cb..a1556038 100644 --- a/swh/model/hashutil.py +++ b/swh/model/hashutil.py @@ -162,8 +162,8 @@ def _new_hash(algo, length=None): return _new_hashlib_hash(algo) -def hash_file(fobj, length=None, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None, - hexdigest=False): +def hash_file(fobj, length=None, algorithms=DEFAULT_ALGORITHMS, + chunk_cb=None, with_length=False, hexdigest=False): """Hash the contents of the given file object with the given algorithms. Args: @@ -172,6 +172,7 @@ def hash_file(fobj, length=None, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None, git-specific algorithms) algorithms: the hashing algorithms to be used, as an iterable over strings + with_length (bool): Include length in the dict result hexdigest (bool): False returns the hash as binary, otherwise returns as hex @@ -193,11 +194,55 @@ def hash_file(fobj, length=None, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None, chunk_cb(chunk) if hexdigest: - return {algo: hash.hexdigest() for algo, hash in hashes.items()} - return {algo: hash.digest() for algo, hash in hashes.items()} + h = {algo: hash.hexdigest() for algo, hash in hashes.items()} + else: + h = {algo: hash.digest() for algo, hash in hashes.items()} + if with_length: + h['length'] = length + return h + + +def hash_stream(s, length=None, algorithms=DEFAULT_ALGORITHMS, + chunk_cb=None, with_length=False, hexdigest=False): + """Hash the contents of the given stream with the given algorithms. + + Args: + s (stream): a stream object (e.g requests.get(stream=True)) + length (int): the length of the contents of the stream (for the + git-specific algorithms) + algorithms (dict): the hashing algorithms to be used, as an + iterable over strings + with_length (bool): Include length in the dict result + hexdigest (bool): False returns the hash as binary, otherwise + returns as hex + + Returns: a dict mapping each algorithm to a digest (bytes by default). + + Raises: + ValueError if algorithms contains an unknown hash algorithm. + + """ + hashes = {algo: _new_hash(algo, length) for algo in algorithms} + + for chunk in s.iter_content(): + if not chunk: + break + for hash in hashes.values(): + hash.update(chunk) + if chunk_cb: + chunk_cb(chunk) + if hexdigest: + h = {algo: hash.hexdigest() for algo, hash in hashes.items()} + else: + h = {algo: hash.digest() for algo, hash in hashes.items()} + if with_length: + h['length'] = length + return h -def hash_path(path, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None): + +def hash_path(path, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None, + with_length=True, hexdigest=False): """Hash the contents of the file at the given path with the given algorithms. @@ -205,6 +250,9 @@ def hash_path(path, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None): path: the path of the file to hash algorithms: the hashing algorithms used chunk_cb: a callback + with_length (bool): Include length in the dict result + hexdigest (bool): False returns the hash as binary, otherwise + returns as hex Returns: a dict mapping each algorithm to a bytes digest. @@ -215,9 +263,8 @@ def hash_path(path, algorithms=DEFAULT_ALGORITHMS, chunk_cb=None): """ length = os.path.getsize(path) with open(path, 'rb') as fobj: - hash = hash_file(fobj, length, algorithms, chunk_cb=chunk_cb) - hash['length'] = length - return hash + return hash_file(fobj, length, algorithms, chunk_cb=chunk_cb, + with_length=with_length, hexdigest=hexdigest) def hash_data(data, algorithms=DEFAULT_ALGORITHMS, with_length=False): @@ -236,10 +283,7 @@ def hash_data(data, algorithms=DEFAULT_ALGORITHMS, with_length=False): """ fobj = BytesIO(data) length = len(data) - data = hash_file(fobj, length, algorithms) - if with_length: - data['length'] = length - return data + return hash_file(fobj, length, algorithms, with_length=with_length) def hash_git_data(data, git_type, base_algo='sha1'): diff --git a/swh/model/tests/test_hashutil.py b/swh/model/tests/test_hashutil.py index fabbf16b..99bd78e1 100644 --- a/swh/model/tests/test_hashutil.py +++ b/swh/model/tests/test_hashutil.py @@ -102,6 +102,20 @@ class Hashutil(unittest.TestCase): hexdigest=True) self.assertEqual(checksums, self.hex_checksums) + @istest + def hash_stream(self): + class StreamStub: + def __init__(self, data): + self.data = data + + def iter_content(self): + yield from io.BytesIO(self.data) + + s = StreamStub(self.data) + checksums = hashutil.hash_stream(s, length=len(self.data), + hexdigest=True) + self.assertEqual(checksums, self.hex_checksums) + @istest def hash_file_missing_length(self): fobj = io.BytesIO(self.data) -- GitLab