diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py
index 33e3afc1773922f361e69a3160f3ec8501e99c7a..70b82857e458a9c1a8ac870e2329dc5a5871ff6c 100644
--- a/swh/model/identifiers.py
+++ b/swh/model/identifiers.py
@@ -3,6 +3,8 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import binascii
+import datetime
 from functools import lru_cache
 
 from . import hashutil
@@ -40,6 +42,39 @@ def identifier_to_bytes(identifier):
                      identifier.__class__.__name__)
 
 
+@lru_cache()
+def identifier_to_str(identifier):
+    """Convert an identifier to a hexadecimal string.
+
+    Args:
+        identifier: an identifier, either a 40-char hexadecimal string or a
+            bytes object of length 20
+    Returns:
+        The length 40 string corresponding to the given identifier, hex encoded
+        (a str identifier is returned unchanged; only its length is checked)
+
+    Raises:
+        ValueError if the identifier is of an unexpected type or length.
+    """
+
+    if isinstance(identifier, str):
+        if len(identifier) != 40:
+            raise ValueError(
+                'Wrong length for str identifier %s, expected 40' %
+                len(identifier))
+        return identifier
+
+    if isinstance(identifier, bytes):
+        if len(identifier) != 20:
+            raise ValueError(
+                'Wrong length for bytes identifier %s, expected 20' %
+                len(identifier))
+        return binascii.hexlify(identifier).decode()
+
+    raise ValueError('Wrong type for identitfier %s, expected bytes or str' %
+                     identifier.__class__.__name__)
+
+
 def content_identifier(content):
     """Return the intrinsic identifier for a content.
 
@@ -121,3 +156,103 @@ def directory_identifier(directory):
         ])
 
     return hashutil.hash_git_data(b''.join(components), 'tree')
+
+
+def format_date(date):
+    """Convert a date object into a UTC timestamp encoded as ascii bytes.
+
+    Git stores timestamps as an integer number of seconds since the UNIX epoch.
+
+    However, Software Heritage stores timestamps as an integer number of
+    microseconds (postgres type "timestamp with time zone").
+
+    Therefore, we print timestamps with no microseconds as integers, and
+    timestamps with microseconds as floating point values.
+
+    Args:
+        date: either a datetime.datetime object, or a number of seconds since
+            the UNIX epoch. A naive datetime is interpreted by
+            datetime.timestamp() as local time, so pass timezone-aware values.
+
+    Returns:
+        The timestamp as ascii-encoded bytes.
+    """
+    if isinstance(date, datetime.datetime):
+        if date.microsecond == 0:
+            date = int(date.timestamp())
+        else:
+            date = date.timestamp()
+        return str(date).encode()
+    else:
+        if date == int(date):
+            date = int(date)
+        return str(date).encode()
+
+
+@lru_cache()
+def format_offset(offset):
+    """Convert an integer number of minutes into an offset representation.
+
+    The offset representation is [+-]hhmm where:
+        hh is the number of hours;
+        mm is the number of minutes.
+
+    A null offset is represented as +0000.
+    """
+    if offset >= 0:
+        sign = '+'
+    else:
+        sign = '-'
+
+    hours = abs(offset) // 60
+    minutes = abs(offset) % 60
+
+    t = '%s%02d%02d' % (sign, hours, minutes)
+    return t.encode()
+
+
+def format_author(author):
+    """Format an author or committer identity, date and offset as bytes.
+
+    Args:
+        author: a dict with bytes values for the 'name' and 'email' keys, a
+            'date' accepted by format_date, and a 'date_offset' integer number
+            of minutes accepted by format_offset.
+
+    Returns:
+        The bytes b'name <email> timestamp offset'.
+    """
+    components = [
+        author['name'], b' <', author['email'], b'> ',
+        format_date(author['date']), b' ',
+        format_offset(author['date_offset']),
+    ]
+
+    return b''.join(components)
+
+
+def revision_identifier(revision):
+    """Return the intrinsic identifier for a revision.
+
+    The identifier is the hashutil.hash_git_data 'commit' hash of a manifest
+    made of the 'tree' line (revision['directory']), one 'parent' line per
+    non-empty entry of revision['parents'], the 'author' and 'committer'
+    lines, an empty line and the raw bytes of revision['message'].
+    """
+    components = [
+        b'tree ', identifier_to_str(revision['directory']).encode(), b'\n',
+    ]
+    for parent in revision['parents']:
+        if parent:
+            components.extend([
+                b'parent ', identifier_to_str(parent).encode(), b'\n',
+            ])
+
+    components.extend([
+        b'author ', format_author(revision['author']), b'\n',
+        b'committer ', format_author(revision['committer']), b'\n',
+        b'\n',
+        revision['message'],
+    ])
+
+    return hashutil.hash_git_data(b''.join(components), 'commit')
diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py
index 2dbd8ca2082b95eab2d0c4bf02074ca999457f4a..0ace24f1f7678a427caf502c7f10a2bd73beef35 100644
--- a/swh/model/tests/test_identifiers.py
+++ b/swh/model/tests/test_identifiers.py
@@ -138,3 +138,76 @@ class DirectoryIdentifier(unittest.TestCase):
         self.assertEqual(
             identifiers.directory_identifier(self.empty_directory),
             self.empty_directory['id'])
+
+
+class RevisionIdentifier(unittest.TestCase):
+    def setUp(self):
+        self.revision = {
+            'id': 'bc0195aad0daa2ad5b0d76cce22b167bc3435590',
+            'directory': '85a74718d377195e1efd0843ba4f3260bad4fe07',
+            'parents': ['01e2d0627a9a6edb24c37db45db5ecb31e9de808'],
+            'author': {
+                'name': b'Linus Torvalds',
+                'email': b'torvalds@linux-foundation.org',
+                'date': datetime.datetime(2015, 7, 12, 22, 10, 30,
+                                          tzinfo=datetime.timezone.utc),
+                'date_offset': -420,
+
+            },
+            'committer': {
+                'name': b'Linus Torvalds',
+                'email': b'torvalds@linux-foundation.org',
+                'date': datetime.datetime(2015, 7, 12, 22, 10, 30,
+                                          tzinfo=datetime.timezone.utc),
+                'date_offset': -420,
+
+            },
+            'message': b'Linux 4.2-rc2\n',
+        }
+
+        self.synthetic_revision = {
+            'id': b'\xb2\xa7\xe1&\x04\x92\xe3D\xfa\xb3\xcb\xf9\x1b\xc1<\x91'
+                  b'\xe0T&\xfd',
+            'author': {
+                'name': b'Software Heritage',
+                'email': b'robot@softwareheritage.org',
+                'date': datetime.datetime(2015, 7, 16, 11, 51, 35,
+                                          tzinfo=datetime.timezone.utc),
+                'date_offset': 0,
+            },
+            'type': 'tar',
+            'committer': {
+                'name': b'Software Heritage',
+                'date': datetime.datetime(2015, 7, 16, 11, 51, 35,
+                                          tzinfo=datetime.timezone.utc),
+                'email': b'robot@softwareheritage.org',
+                'date_offset': 0,
+            },
+            'synthetic': True,
+            'parents': [None],
+            'message': b'synthetic revision message\n',
+            'directory': b'\xd1\x1f\x00\xa6\xa0\xfe\xa6\x05SA\xd2U\x84\xb5\xa9'
+                         b'e\x16\xc0\xd2\xb8',
+            'metadata': {'original_artifact': [
+                {'archive_type': 'tar',
+                 'name': 'gcc-5.2.0.tar.bz2',
+                 'sha1_git': '39d281aff934d44b439730057e55b055e206a586',
+                 'sha1': 'fe3f5390949d47054b613edc36c557eb1d51c18e',
+                 'sha256': '5f835b04b5f7dd4f4d2dc96190ec1621b8d89f'
+                           '2dc6f638f9f8bc1b1014ba8cad'}]},
+
+        }
+
+    @istest
+    def revision_identifier(self):
+        self.assertEqual(
+            identifiers.revision_identifier(self.revision),
+            identifiers.identifier_to_str(self.revision['id']),
+        )
+
+    @istest
+    def revision_identifier_synthetic(self):
+        self.assertEqual(
+            identifiers.revision_identifier(self.synthetic_revision),
+            identifiers.identifier_to_str(self.synthetic_revision['id']),
+        )