diff --git a/PKG-INFO b/PKG-INFO index 4925bc7c7d7884b2d3c6006ad9b4a20252caef92..6e8954fba81597e0d25a5d295a5c9c5ee38b0381 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 1.0 Name: swh.model -Version: 0.0.13 +Version: 0.0.14 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers diff --git a/swh.model.egg-info/PKG-INFO b/swh.model.egg-info/PKG-INFO index 4925bc7c7d7884b2d3c6006ad9b4a20252caef92..6e8954fba81597e0d25a5d295a5c9c5ee38b0381 100644 --- a/swh.model.egg-info/PKG-INFO +++ b/swh.model.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 1.0 Name: swh.model -Version: 0.0.13 +Version: 0.0.14 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers diff --git a/swh/model/git.py b/swh/model/git.py index a3503cbb17450ed754d0dff60bf71de673fe05b6..1f95b7b62ac701b86152f42ae2e6d5dee5d3d5cc 100644 --- a/swh/model/git.py +++ b/swh/model/git.py @@ -1,10 +1,11 @@ -# Copyright (C) 2015 The Software Heritage developers +# Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os +import stat from enum import Enum @@ -156,21 +157,32 @@ def compute_link_metadata(linkpath): def compute_blob_metadata(filepath): - """Given a filepath, compute the git metadata. + """Given a filepath resolving to a regular file, compute the metadata. + Other file types (fifo, character or block device, symlink) will + be considered empty regular file. To deal properly with symlinks, + use swh.model.git.compute_link_metadata. Args: - filepath: absolute pathname of the file. + filepath: absolute pathname of the regular file. Returns: Dictionary of values: - name: basename of the file + - length: data length - perms: git permission for file - type: git type for file - path: absolute filepath on filesystem """ - blob_metadata = hashutil.hash_path(filepath) - perms = GitPerm.EXEC if os.access(filepath, os.X_OK) else GitPerm.BLOB + mode = os.lstat(filepath).st_mode + if not stat.S_ISREG(mode): # special (block or character device, fifo) + perms = GitPerm.BLOB + blob_metadata = hashutil.hash_data(b'') + blob_metadata['length'] = 0 + else: + perms = GitPerm.EXEC if os.access(filepath, os.X_OK) else GitPerm.BLOB + blob_metadata = hashutil.hash_path(filepath) + blob_metadata.update({ 'name': os.path.basename(filepath), 'perms': perms, diff --git a/swh/model/hashutil.py b/swh/model/hashutil.py index b2558a3bea2f5682493cbc83381328a8dde8fefa..ea28414fe05bfb1fd0a172f281d9a8de6a23fb08 100644 --- a/swh/model/hashutil.py +++ b/swh/model/hashutil.py @@ -181,9 +181,21 @@ def hash_to_hex(hash): return binascii.hexlify(hash).decode('ascii') +@functools.lru_cache() +def hash_to_bytehex(hash): + """Converts a hash to its hexadecimal bytes representation""" + return binascii.hexlify(hash) + + @functools.lru_cache() def hash_to_bytes(hash): """Converts a hash (in hex or bytes form) to its raw bytes form""" if isinstance(hash, bytes): return hash return bytes.fromhex(hash) + + +@functools.lru_cache() +def bytehex_to_hash(hex): + """Converts a hexadecimal bytes representation of a hash to that hash""" + return hash_to_bytes(hex.decode()) diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py index c53513ae6bfbb9798a0845b7d1b38c2399b8c62a..4eb2b9d70df0c6230fe16be3de75792284eb2637 100644 --- a/swh/model/identifiers.py +++ b/swh/model/identifiers.py @@ -114,6 +114,18 @@ def _perms_to_bytes(perms): return oc.encode('ascii') +def escape_newlines(snippet): + """Escape the newlines present in snippet according to git rules. + + New lines in git manifests are escaped by indenting the next line by one + space.""" + + if b'\n' in snippet: + return b'\n '.join(snippet.split(b'\n')) + else: + return snippet + + def directory_identifier(directory): """Return the intrinsic identifier for a directory. @@ -347,7 +359,7 @@ def format_author_line(header, author, date_offset): """ - ret = [header.encode(), b' ', format_author(author)] + ret = [header.encode(), b' ', escape_newlines(format_author(author))] date_offset = normalize_timestamp(date_offset) @@ -443,13 +455,9 @@ def revision_identifier(revision): if isinstance(value, str): value = value.encode('utf-8') - # multi-line values: indent continuation lines - if b'\n' in value: - value_chunks = value.split(b'\n') - value = b'\n '.join(value_chunks) - # encode the key to utf-8 - components.extend([key.encode('utf-8'), b' ', value, b'\n']) + components.extend([key.encode('utf-8'), b' ', + escape_newlines(value), b'\n']) if revision['message'] is not None: components.extend([b'\n', revision['message']]) diff --git a/swh/model/tests/test_git.py b/swh/model/tests/test_git.py index b1eac8cbbb36988d24369b2a26641710a8e7255a..3c233c3b791b827ffcc18774eb2d4faba9f3d526 100644 --- a/swh/model/tests/test_git.py +++ b/swh/model/tests/test_git.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015 The Software Heritage developers +# Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -138,6 +138,41 @@ blah self.assertEqual(checksum, self.checksums['tag_sha1_git']) +@attr('fs') +class ComputeBlobMetadata(unittest.TestCase): + @istest + def compute_blob_metadata__special_file_returns_nothing(self): + # prepare + tmp_root_path = tempfile.mkdtemp().encode('utf-8') + name = b'fifo-file' + path = os.path.join(tmp_root_path, name) + + # given + os.mkfifo(path) + + # when + actual_metadata = git.compute_blob_metadata(path) + + # then + expected_metadata = { + 'sha1': b'\xda9\xa3\xee^kK\r2U\xbf\xef\x95`\x18\x90\xaf\xd8\x07\t', + 'sha1_git': b'\xe6\x9d\xe2\x9b\xb2\xd1\xd6CK\x8b)\xaewZ\xd8\xc2' + b'\xe4\x8cS\x91', + 'sha256': b"\xe3\xb0\xc4B\x98\xfc\x1c\x14\x9a\xfb\xf4\xc8\x99o" + b"\xb9$'\xaeA\xe4d\x9b\x93L\xa4\x95\x99\x1bxR\xb8U", + 'perms': git.GitPerm.BLOB, + 'path': path, + 'name': name, + 'type': git.GitType.BLOB, + 'length': 0 + } + + self.assertEquals(actual_metadata, expected_metadata) + + # cleanup + shutil.rmtree(tmp_root_path) + + @attr('fs') class GitHashWalkArborescenceTree: """Root class to ease walk and git hash testing without side-effecty diff --git a/swh/model/tests/test_hashutil.py b/swh/model/tests/test_hashutil.py index f795e87ea64f3e64a864dcef1f2da653e5f4cb70..614e7ee282de9ed52fa41f2394819c861b18ce76 100644 --- a/swh/model/tests/test_hashutil.py +++ b/swh/model/tests/test_hashutil.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015 The Software Heritage developers +# Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -110,3 +110,101 @@ class Hashutil(unittest.TestCase): hash = self.checksums[type] self.assertEquals(hashutil.hash_to_bytes(hex), hash) self.assertEquals(hashutil.hash_to_bytes(hash), hash) + + @istest + def hash_to_bytehex(self): + for algo in self.checksums: + self.assertEqual(self.hex_checksums[algo].encode('ascii'), + hashutil.hash_to_bytehex(self.checksums[algo])) + + @istest + def bytehex_to_hash(self): + for algo in self.checksums: + self.assertEqual(self.checksums[algo], + hashutil.bytehex_to_hash( + self.hex_checksums[algo].encode())) + + +class HashlibGit(unittest.TestCase): + + def setUp(self): + self.blob_data = b'42\n' + + self.tree_data = b''.join([b'40000 barfoo\0', + bytes.fromhex('c3020f6bf135a38c6df' + '3afeb5fb38232c5e07087'), + b'100644 blah\0', + bytes.fromhex('63756ef0df5e4f10b6efa' + '33cfe5c758749615f20'), + b'100644 hello\0', + bytes.fromhex('907b308167f0880fb2a' + '5c0e1614bb0c7620f9dc3')]) + + self.commit_data = """tree 1c61f7259dcb770f46b194d941df4f08ff0a3970 +author Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com> 1444054085 +0200 +committer Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com> 1444054085 +0200 + +initial +""".encode('utf-8') # NOQA + self.tag_data = """object 24d012aaec0bc5a4d2f62c56399053d6cc72a241 +type commit +tag 0.0.1 +tagger Antoine R. Dumont (@ardumont) <antoine.romain.dumont@gmail.com> 1444225145 +0200 + +blah +""".encode('utf-8') # NOQA + + self.checksums = { + 'blob_sha1_git': bytes.fromhex('d81cc0710eb6cf9efd5b920a8453e1' + 'e07157b6cd'), + 'tree_sha1_git': bytes.fromhex('ac212302c45eada382b27bfda795db' + '121dacdb1c'), + 'commit_sha1_git': bytes.fromhex('e960570b2e6e2798fa4cfb9af2c399' + 'd629189653'), + 'tag_sha1_git': bytes.fromhex('bc2b99ba469987bcf1272c189ed534' + 'e9e959f120'), + } + + @istest + def unknown_header_type(self): + with self.assertRaises(ValueError) as cm: + hashutil.hash_git_data(b'any-data', 'some-unknown-type') + + self.assertIn('Unexpected git object type', cm.exception.args[0]) + + @istest + def hashdata_content(self): + # when + actual_hash = hashutil.hash_git_data(self.blob_data, git_type='blob') + + # then + self.assertEqual(actual_hash, + self.checksums['blob_sha1_git']) + + @istest + def hashdata_tree(self): + # when + actual_hash = hashutil.hash_git_data(self.tree_data, git_type='tree') + + # then + self.assertEqual(actual_hash, + self.checksums['tree_sha1_git']) + + @istest + def hashdata_revision(self): + # when + actual_hash = hashutil.hash_git_data(self.commit_data, + git_type='commit') + + # then + self.assertEqual(actual_hash, + self.checksums['commit_sha1_git']) + + @istest + def hashdata_tag(self): + # when + actual_hash = hashutil.hash_git_data(self.tag_data, git_type='tag') + + # then + self.assertEqual(actual_hash, + self.checksums['tag_sha1_git']) diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py index 16a34bb9c76561bde5e97e42e2298f973b37e75e..755dba60906987d0a80616c6999864621ebaeac1 100644 --- a/swh/model/tests/test_identifiers.py +++ b/swh/model/tests/test_identifiers.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015 The Software Heritage developers +# Copyright (C) 2015-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -615,6 +615,29 @@ o6X/3T+vm8K3bf3driRr34c= 'message': b'tagging version 20081029\n\nr56558\n', } + self.release_newline_in_author = { + 'author': { + 'email': b'esycat@gmail.com', + 'fullname': b'Eugene Janusov\n<esycat@gmail.com>', + 'name': b'Eugene Janusov\n', + }, + 'date': { + 'negative_utc': None, + 'offset': 600, + 'timestamp': { + 'microseconds': 0, + 'seconds': 1377480558, + }, + }, + 'id': b'\\\x98\xf5Y\xd04\x16-\xe2->\xbe\xb9T3\xe6\xf8\x88R1', + 'message': b'Release of v0.3.2.', + 'name': b'0.3.2', + 'synthetic': False, + 'target': (b'\xc0j\xa3\xd9;x\xa2\x86\\I5\x17' + b'\x000\xf8\xc2\xd79o\xd3'), + 'target_type': 'revision', + } + @istest def release_identifier(self): self.assertEqual( @@ -649,3 +672,10 @@ o6X/3T+vm8K3bf3driRr34c= identifiers.release_identifier(self.release_negative_utc), identifiers.identifier_to_str(self.release_negative_utc['id']) ) + + @istest + def release_identifier_newline_in_author(self): + self.assertEqual( + identifiers.release_identifier(self.release_newline_in_author), + identifiers.identifier_to_str(self.release_newline_in_author['id']) + ) diff --git a/version.txt b/version.txt index ec62758902ca885563c595cabebfcd3ea0c0643d..ed4e55cb1a5c9150320ab6e9463f5a5ffc8ddee9 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.13-0-g58c5a24 \ No newline at end of file +v0.0.14-0-g3e325ca \ No newline at end of file