diff --git a/.gitignore b/.gitignore index 71fd348e7e94a13ac92eafc083d604d5dd5c0463..303d302636f5d8ff2d1d08c9764cc601676f2f0e 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ .eggs/ __pycache__ *.egg-info/ +dist +version.txt diff --git a/PKG-INFO b/PKG-INFO index 44c2684d611c7ed91c5e418a59ab26632c8093bd..c6992a77247a84d374d1df9cd9c2d6114794e9f4 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 1.0 Name: swh.model -Version: 0.0.1 +Version: 0.0.2 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers diff --git a/swh.model.egg-info/PKG-INFO b/swh.model.egg-info/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..c6992a77247a84d374d1df9cd9c2d6114794e9f4 --- /dev/null +++ b/swh.model.egg-info/PKG-INFO @@ -0,0 +1,10 @@ +Metadata-Version: 1.0 +Name: swh.model +Version: 0.0.2 +Summary: Software Heritage data model +Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ +Author: Software Heritage developers +Author-email: swh-devel@inria.fr +License: UNKNOWN +Description: UNKNOWN +Platform: UNKNOWN diff --git a/swh.model.egg-info/SOURCES.txt b/swh.model.egg-info/SOURCES.txt new file mode 100644 index 0000000000000000000000000000000000000000..98291c01c3157c1cc892bc40b863ba45628e8df8 --- /dev/null +++ b/swh.model.egg-info/SOURCES.txt @@ -0,0 +1,36 @@ +.gitignore +AUTHORS +LICENSE +MANIFEST.in +Makefile +requirements.txt +setup.py +version.txt +debian/changelog +debian/compat +debian/control +debian/copyright +debian/rules +debian/source/format +swh.model.egg-info/PKG-INFO +swh.model.egg-info/SOURCES.txt +swh.model.egg-info/dependency_links.txt +swh.model.egg-info/requires.txt +swh.model.egg-info/top_level.txt +swh/model/__init__.py +swh/model/exceptions.py +swh/model/hashutil.py +swh/model/identifiers.py +swh/model/validators.py +swh/model/fields/__init__.py +swh/model/fields/compound.py +swh/model/fields/hashes.py +swh/model/fields/simple.py +swh/model/tests/__init__.py +swh/model/tests/test_hashutil.py +swh/model/tests/test_identifiers.py +swh/model/tests/test_validators.py +swh/model/tests/fields/__init__.py +swh/model/tests/fields/test_compound.py +swh/model/tests/fields/test_hashes.py +swh/model/tests/fields/test_simple.py \ No newline at end of file diff --git a/swh.model.egg-info/dependency_links.txt b/swh.model.egg-info/dependency_links.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/swh.model.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/swh.model.egg-info/requires.txt b/swh.model.egg-info/requires.txt new file mode 100644 index 0000000000000000000000000000000000000000..39a323addb39c408716b8874ef828acd3c4da427 --- /dev/null +++ b/swh.model.egg-info/requires.txt @@ -0,0 +1 @@ +vcversioner diff --git a/swh.model.egg-info/top_level.txt b/swh.model.egg-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..0cb0f8f527bcf0669aab8b261756601dbf9afdd2 --- /dev/null +++ b/swh.model.egg-info/top_level.txt @@ -0,0 +1 @@ +swh diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py index b8c6025fc96d189d47d17cd0a1193094d7d0a089..2d5ff126b00b67f5b46534aa60f6f5118e1d6a78 100644 --- a/swh/model/hashutil.py +++ b/swh/model/hashutil.py @@ -3,8 +3,11 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import binascii +import functools import hashlib from io import BytesIO +import os # supported hashing algorithms ALGORITHMS = set(['sha1', 'sha256', 'sha1_git']) @@ -78,7 +81,7 @@ def _new_hash(algo, length=None): return h -def hash_file(fobj, length=None, algorithms=ALGORITHMS): +def hash_file(fobj, length=None, algorithms=ALGORITHMS, chunk_cb=None): """Hash the contents of the given file object with the given algorithms. Args: @@ -87,7 +90,7 @@ def hash_file(fobj, length=None, algorithms=ALGORITHMS): git-specific algorithms) algorithms: the hashing algorithms used - Returns: a dict mapping each algorithm to a hexadecimal digest + Returns: a dict mapping each algorithm to a bytes digest. Raises: ValueError if algorithms contains an unknown hash algorithm. @@ -100,8 +103,29 @@ def hash_file(fobj, length=None, algorithms=ALGORITHMS): break for hash in hashes.values(): hash.update(chunk) + if chunk_cb: + chunk_cb(chunk) - return {algo: hash.hexdigest() for algo, hash in hashes.items()} + return {algo: hash.digest() for algo, hash in hashes.items()} + + +def hash_path(path, algorithms=ALGORITHMS, chunk_cb=None): + """Hash the contents of the file at the given path with the given algorithms. + + Args: + path: the path of the file to hash + algorithms: the hashing algorithms used + chunk_cb: a callback + + Returns: a dict mapping each algorithm to a bytes digest. + + Raises: + ValueError if algorithms contains an unknown hash algorithm. + OSError on file access error + """ + length = os.path.getsize(path) + with open(path, 'rb') as fobj: + return hash_file(fobj, length, algorithms, chunk_cb) def hash_data(data, algorithms=ALGORITHMS): @@ -111,7 +135,7 @@ def hash_data(data, algorithms=ALGORITHMS): data: a bytes object algorithms: the hashing algorithms used - Returns: a dict mapping each algorithm to a hexadecimal digest + Returns: a dict mapping each algorithm to a bytes digest Raises: TypeError if data does not support the buffer interface. @@ -129,7 +153,7 @@ def hash_git_data(data, git_type, base_algo='sha1'): git_type: the git object type base_algo: the base hashing algorithm used (default: sha1) - Returns: a dict mapping each algorithm to a hexadecimal digest + Returns: a dict mapping each algorithm to a bytes digest Raises: ValueError if the git_type is unexpected. @@ -144,4 +168,20 @@ def hash_git_data(data, git_type, base_algo='sha1'): h = _new_git_hash(base_algo, git_type, len(data)) h.update(data) - return h.hexdigest() + return h.digest() + + +@functools.lru_cache() +def hash_to_hex(hash): + """Converts a hash (in hex or bytes form) to its hexadecimal ascii form""" + if isinstance(hash, str): + return hash + return binascii.hexlify(hash).decode('ascii') + + +@functools.lru_cache() +def hash_to_bytes(hash): + """Converts a hash (in hex or bytes form) to its raw bytes form""" + if isinstance(hash, bytes): + return hash + return bytes.fromhex(hash) diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py index e44d8940e607f0287d06d1e86b6b48c6cc0268e2..311e58c6026a3e9d408f1d3c1fe56bb67ba7bd07 100644 --- a/swh/model/identifiers.py +++ b/swh/model/identifiers.py @@ -159,7 +159,8 @@ def directory_identifier(directory): identifier_to_bytes(entry['target']), ]) - return hashutil.hash_git_data(b''.join(components), 'tree') + return identifier_to_str(hashutil.hash_git_data(b''.join(components), + 'tree')) def format_date(date): @@ -265,7 +266,8 @@ def revision_identifier(revision): revision['message'], ]) - return hashutil.hash_git_data(b''.join(components), 'commit') + return identifier_to_str(hashutil.hash_git_data(b''.join(components), + 'commit')) def target_type_to_git(target_type): @@ -294,4 +296,5 @@ def release_identifier(release): components.extend([b'\n', release['message']]) - return hashutil.hash_git_data(b''.join(components), 'tag') + return identifier_to_str(hashutil.hash_git_data(b''.join(components), + 'tag')) diff --git a/swh/model/tests/test_hashutil.py b/swh/model/tests/test_hashutil.py index 45c55f88af6615dc808620f8a2d47cde81ef3dec..79cdc9ece21e9065a73f9296fbac729d6badd7ac 100644 --- a/swh/model/tests/test_hashutil.py +++ b/swh/model/tests/test_hashutil.py @@ -4,6 +4,7 @@ # See top-level LICENSE file for more information import io +import tempfile import unittest from nose.tools import istest @@ -21,17 +22,27 @@ class Hashutil(unittest.TestCase): '4a9b50ee5b5866c0d91fab0e65907311', } - self.git_checksums = { + self.checksums = { + type: bytes.fromhex(cksum) + for type, cksum in self.hex_checksums.items() + } + + self.git_hex_checksums = { 'blob': self.hex_checksums['sha1_git'], 'tree': '5b2e883aa33d2efab98442693ea4dd5f1b8871b0', 'commit': '79e4093542e72f0fcb7cbd75cb7d270f9254aa8f', 'tag': 'd6bf62466f287b4d986c545890716ce058bddf67', } + self.git_checksums = { + type: bytes.fromhex(cksum) + for type, cksum in self.git_hex_checksums.items() + } + @istest def hash_data(self): checksums = hashutil.hash_data(self.data) - self.assertEqual(checksums, self.hex_checksums) + self.assertEqual(checksums, self.checksums) @istest def hash_data_unknown_hash(self): @@ -63,7 +74,7 @@ class Hashutil(unittest.TestCase): fobj = io.BytesIO(self.data) checksums = hashutil.hash_file(fobj, length=len(self.data)) - self.assertEqual(checksums, self.hex_checksums) + self.assertEqual(checksums, self.checksums) @istest def hash_file_missing_length(self): @@ -73,3 +84,28 @@ class Hashutil(unittest.TestCase): hashutil.hash_file(fobj, algorithms=['sha1_git']) self.assertIn('Missing length', cm.exception.args[0]) + + @istest + def hash_path(self): + with tempfile.NamedTemporaryFile(delete=False) as f: + f.write(self.data) + f.close() + hashes = hashutil.hash_path(f.name) + + self.assertEquals(self.checksums, hashes) + + @istest + def hash_to_hex(self): + for type in self.checksums: + hex = self.hex_checksums[type] + hash = self.checksums[type] + self.assertEquals(hashutil.hash_to_hex(hex), hex) + self.assertEquals(hashutil.hash_to_hex(hash), hex) + + @istest + def hash_to_bytes(self): + for type in self.checksums: + hex = self.hex_checksums[type] + hash = self.checksums[type] + self.assertEquals(hashutil.hash_to_bytes(hex), hash) + self.assertEquals(hashutil.hash_to_bytes(hash), hash) diff --git a/swh/model/validators.py b/swh/model/validators.py index cb2e2770403e5018be9df7d21f2ef14b0de8e72f..ea64b40f800c2c384b67019306549adc262c216f 100644 --- a/swh/model/validators.py +++ b/swh/model/validators.py @@ -3,8 +3,6 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import binascii - from .exceptions import ValidationError, NON_FIELD_ERRORS from . import fields, hashutil @@ -50,9 +48,7 @@ def validate_content(content): for hash_type, computed_hash in hashes.items(): if hash_type not in content: continue - content_hash = content[hash_type] - if isinstance(content_hash, bytes): - content_hash = binascii.hexlify(content_hash).decode() + content_hash = hashutil.hash_to_bytes(content[hash_type]) if content_hash != computed_hash: errors.append(ValidationError( 'hash mismatch in content for hash %(hash)s', diff --git a/version.txt b/version.txt index 825ccab0ad8e053ed6658db4c0a3dadab3508f25..c895f93b9f88a13eab32fe79e4cd88d05ecd6b5d 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.1-0-ge92e3c5 \ No newline at end of file +v0.0.2-0-g96fb46f \ No newline at end of file