diff --git a/MANIFEST.in b/MANIFEST.in index 08ebc95bbbcb73bedd469b33d886a5d153fe8174..e7c46fcaa031efc7023aa243e3ff94ba7996ea65 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,4 @@ include Makefile include requirements.txt +include requirements-swh.txt include version.txt diff --git a/PKG-INFO b/PKG-INFO index 08d4cb4e6254b0d427275a7d1ce76afa21be98c4..4925bc7c7d7884b2d3c6006ad9b4a20252caef92 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 1.0 Name: swh.model -Version: 0.0.12 +Version: 0.0.13 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers diff --git a/bin/git-revhash b/bin/git-revhash new file mode 100755 index 0000000000000000000000000000000000000000..69d1d1cdb63d6d7dcfe07ef1b0176faebe251e3a --- /dev/null +++ b/bin/git-revhash @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +# Use +# git-revhash 'tree 4b825dc642cb6eb9a060e54bf8d69288fbee4904\nparent 22c0fa5195a53f2e733ec75a9b6e9d1624a8b771\nauthor seanius <seanius@3187e211-bb14-4c82-9596-0b59d67cd7f4> 1138341044 +0000\ncommitter seanius <seanius@3187e211-bb14-4c82-9596-0b59d67cd7f4> 1138341044 +0000\n\nmaking dir structure...\n' # noqa +# output: 17a631d474f49bbebfdf3d885dcde470d7faafd7 + +echo -ne $* | git hash-object --stdin -t commit diff --git a/bin/swh-revhash b/bin/swh-revhash new file mode 100755 index 0000000000000000000000000000000000000000..c7e2998a131724949f34f883cc1df2ca46d43824 --- /dev/null +++ b/bin/swh-revhash @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 + +# Use: +# swh-revhash 'tree 4b825dc642cb6eb9a060e54bf8d69288fbee4904\nparent 22c0fa5195a53f2e733ec75a9b6e9d1624a8b771\nauthor seanius <seanius@3187e211-bb14-4c82-9596-0b59d67cd7f4> 1138341044 +0000\ncommitter seanius <seanius@3187e211-bb14-4c82-9596-0b59d67cd7f4> 1138341044 +0000\n\nmaking dir structure...\n' # noqa +# output: 17a631d474f49bbebfdf3d885dcde470d7faafd7 + +# To compare with git: +# git-revhash 'tree 4b825dc642cb6eb9a060e54bf8d69288fbee4904\nparent 22c0fa5195a53f2e733ec75a9b6e9d1624a8b771\nauthor seanius <seanius@3187e211-bb14-4c82-9596-0b59d67cd7f4> 1138341044 +0000\ncommitter seanius <seanius@3187e211-bb14-4c82-9596-0b59d67cd7f4> 1138341044 +0000\n\nmaking dir structure...\n' # noqa +# output: 17a631d474f49bbebfdf3d885dcde470d7faafd7 + + +import sys + +from swh.model import identifiers, hashutil + + +def revhash(revision_raw): + """Compute the revision hash. + + """ + if b'\\n' in revision_raw: # HACK: string have somehow their \n + # expanded to \\n + revision_raw = revision_raw.replace(b'\\n', b'\n') + + h = hashutil.hash_git_data(revision_raw, 'commit') + return identifiers.identifier_to_str(h) + + +if __name__ == '__main__': + revision_raw = sys.argv[1].encode('utf-8') + print(revhash(revision_raw)) diff --git a/requirements-swh.txt b/requirements-swh.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/setup.py b/setup.py index 31541b45f6a943030854ae8b47603ae7d8f7ccac..a520dc09a36de511b7acae984cc43e8efd222ddb 100644 --- a/setup.py +++ b/setup.py @@ -3,13 +3,13 @@ from setuptools import setup def parse_requirements(): requirements = [] - with open('requirements.txt') as f: - for line in f.readlines(): - line = line.strip() - if not line or line.startswith('#'): - continue - requirements.append(line) - + for reqf in ('requirements.txt', 'requirements-swh.txt'): + with open(reqf) as f: + for line in f.readlines(): + line = line.strip() + if not line or line.startswith('#'): + continue + requirements.append(line) return requirements diff --git a/swh.model.egg-info/PKG-INFO b/swh.model.egg-info/PKG-INFO index 08d4cb4e6254b0d427275a7d1ce76afa21be98c4..4925bc7c7d7884b2d3c6006ad9b4a20252caef92 100644 --- a/swh.model.egg-info/PKG-INFO +++ b/swh.model.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 1.0 Name: swh.model -Version: 0.0.12 +Version: 0.0.13 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers diff --git a/swh.model.egg-info/SOURCES.txt b/swh.model.egg-info/SOURCES.txt index 6d07bebeb7ebeb70ab32a0187fa2411d1beeb42f..79270515293f77238b836ae264f16b4ddd404f12 100644 --- a/swh.model.egg-info/SOURCES.txt +++ b/swh.model.egg-info/SOURCES.txt @@ -5,9 +5,12 @@ MANIFEST.in Makefile Makefile.local README-dev.md +requirements-swh.txt requirements.txt setup.py version.txt +bin/git-revhash +bin/swh-revhash debian/changelog debian/compat debian/control diff --git a/swh/model/git.py b/swh/model/git.py index 6cc7019a730c03938223bb8c3b37e6c2c36f6da8..a3503cbb17450ed754d0dff60bf71de673fe05b6 100644 --- a/swh/model/git.py +++ b/swh/model/git.py @@ -241,9 +241,9 @@ def default_validation_dir(dirpath): return True -def __walk(rootdir, - dir_ok_fn=default_validation_dir, - remove_empty_folder=False): +def _walk(rootdir, + dir_ok_fn=default_validation_dir, + remove_empty_folder=False): """Walk the filesystem and yields a 3 tuples (dirpath, dirnames as set of absolute paths, filenames as set of abslute paths) @@ -379,7 +379,7 @@ def walk_and_compute_sha1_from_directory(rootdir, if rootdir.endswith(b'/'): rootdir = rootdir.rstrip(b'/') - for dirpath, dirnames, filenames in __walk( + for dirpath, dirnames, filenames in _walk( rootdir, dir_ok_fn, remove_empty_folder): hashes = [] @@ -456,13 +456,13 @@ def compute_hashes_from_directory(rootdir, If something is raised, this is a programmatic error. """ - def __get_dict_from_dirpath(_dict, path): + def _get_dict_from_dirpath(_dict, path): """Retrieve the default associated value for key path. """ return _dict.get(path, dict(children=set(), checksums=None)) - def __get_dict_from_filepath(_dict, path): + def _get_dict_from_filepath(_dict, path): """Retrieve the default associated value for key path. """ @@ -474,10 +474,10 @@ def compute_hashes_from_directory(rootdir, if rootdir.endswith(b'/'): rootdir = rootdir.rstrip(b'/') - for dirpath, dirnames, filenames in __walk( + for dirpath, dirnames, filenames in _walk( rootdir, dir_ok_fn, remove_empty_folder): - dir_entry = __get_dict_from_dirpath(ls_hashes, dirpath) + dir_entry = _get_dict_from_dirpath(ls_hashes, dirpath) children = dir_entry['children'] links = (file @@ -487,25 +487,25 @@ def compute_hashes_from_directory(rootdir, for linkpath in links: all_links.add(linkpath) m_hashes = compute_link_metadata(linkpath) - d = __get_dict_from_filepath(ls_hashes, linkpath) + d = _get_dict_from_filepath(ls_hashes, linkpath) d['checksums'] = m_hashes ls_hashes[linkpath] = d children.add(linkpath) for filepath in (file for file in filenames if file not in all_links): m_hashes = compute_blob_metadata(filepath) - d = __get_dict_from_filepath(ls_hashes, filepath) + d = _get_dict_from_filepath(ls_hashes, filepath) d['checksums'] = m_hashes ls_hashes[filepath] = d children.add(filepath) for fulldirname in (dir for dir in dirnames if dir not in all_links): - d_hashes = __get_dict_from_dirpath(ls_hashes, fulldirname) + d_hashes = _get_dict_from_dirpath(ls_hashes, fulldirname) tree_hash = _compute_tree_metadata( fulldirname, (ls_hashes[p]['checksums'] for p in d_hashes['children']) ) - d = __get_dict_from_dirpath(ls_hashes, fulldirname) + d = _get_dict_from_dirpath(ls_hashes, fulldirname) d['checksums'] = tree_hash ls_hashes[fulldirname] = d children.add(fulldirname) @@ -514,7 +514,7 @@ def compute_hashes_from_directory(rootdir, ls_hashes[dirpath] = dir_entry # compute the current directory hashes - d_hashes = __get_dict_from_dirpath(ls_hashes, rootdir) + d_hashes = _get_dict_from_dirpath(ls_hashes, rootdir) root_hash = { 'sha1_git': _compute_directory_git_sha1( (ls_hashes[p]['checksums'] for p in d_hashes['children']) diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py index cf3b3265f1e0392b76430b880ffaee2c8cd2171b..c53513ae6bfbb9798a0845b7d1b38c2399b8c62a 100644 --- a/swh/model/identifiers.py +++ b/swh/model/identifiers.py @@ -171,19 +171,21 @@ def format_date(date): microseconds (postgres type "datetime with timezone"). Therefore, we print timestamps with no microseconds as integers, and - timestamps with microseconds as floating point values. + timestamps with microseconds as floating point values. We elide the + trailing zeroes from microsecond values, to "future-proof" our + representation if we ever need more precision in timestamps. """ - if isinstance(date, datetime.datetime): - if date.microsecond == 0: - date = int(date.timestamp()) - else: - date = date.timestamp() - return str(date).encode() + if not isinstance(date, dict): + raise ValueError('format_date only supports dicts, %r received' % date) + + seconds = date.get('seconds', 0) + microseconds = date.get('microseconds', 0) + if not microseconds: + return str(seconds).encode() else: - if date == int(date): - date = int(date) - return str(date).encode() + float_value = ('%d.%06d' % (seconds, microseconds)) + return float_value.rstrip('0').encode() @lru_cache() @@ -221,8 +223,9 @@ def normalize_timestamp(time_representation): Returns: a normalized dictionary with three keys - - timestamp: a number of seconds since the UNIX epoch (1970-01-01 at 00:00 - UTC) + - timestamp: a dict with two optional keys: + - seconds: the integral number of seconds since the UNIX epoch + - microseconds: the integral number of microseconds - offset: the timezone offset as a number of minutes relative to UTC - negative_utc: a boolean representing whether the offset is -0000 when offset = 0. @@ -235,12 +238,23 @@ def normalize_timestamp(time_representation): negative_utc = False if isinstance(time_representation, dict): - timestamp = time_representation['timestamp'] + ts = time_representation['timestamp'] + if isinstance(ts, dict): + seconds = ts.get('seconds', 0) + microseconds = ts.get('microseconds', 0) + elif isinstance(ts, int): + seconds = ts + microseconds = 0 + else: + raise ValueError( + 'normalize_timestamp received non-integer timestamp member:' + ' %r' % ts) offset = time_representation['offset'] if 'negative_utc' in time_representation: negative_utc = time_representation['negative_utc'] elif isinstance(time_representation, datetime.datetime): - timestamp = time_representation.timestamp() + seconds = int(time_representation.timestamp()) + microseconds = time_representation.microsecond utcoffset = time_representation.utcoffset() if utcoffset is None: raise ValueError( @@ -250,12 +264,20 @@ def normalize_timestamp(time_representation): # utcoffset is an integer number of minutes seconds_offset = utcoffset.total_seconds() offset = int(seconds_offset) // 60 - else: - timestamp = time_representation + elif isinstance(time_representation, int): + seconds = time_representation + microseconds = 0 offset = 0 + else: + raise ValueError( + 'normalize_timestamp received non-integer timestamp:' + ' %r' % time_representation) return { - 'timestamp': timestamp, + 'timestamp': { + 'seconds': seconds, + 'microseconds': microseconds, + }, 'offset': offset, 'negative_utc': negative_utc, } diff --git a/swh/model/tests/test_git.py b/swh/model/tests/test_git.py index f58057a5e4ada755394d9e60c1413588a849bc6c..b1eac8cbbb36988d24369b2a26641710a8e7255a 100644 --- a/swh/model/tests/test_git.py +++ b/swh/model/tests/test_git.py @@ -517,7 +517,7 @@ class TestObjectsPerType(WithSampleFolderChecksums, unittest.TestCase): @istest def objects_per_type_tree(self): - def __children_hashes(path, objects=self.objects): + def _children_hashes(path, objects=self.objects): return set((c['sha1_git'] for c in git.children_hashes( objects[path]['children'], objects))) @@ -531,7 +531,7 @@ class TestObjectsPerType(WithSampleFolderChecksums, unittest.TestCase): 'perms': git.GitPerm.TREE, # we only add children's sha1_git here, in reality, # it's a full dict of hashes. - 'children': __children_hashes(b'/tmp/tmp7w3oi_j8') + 'children': _children_hashes(b'/tmp/tmp7w3oi_j8') }, { 'type': git.GitType.TREE, @@ -539,7 +539,7 @@ class TestObjectsPerType(WithSampleFolderChecksums, unittest.TestCase): 'sha1_git': b'\xe8\xb0\xf1Fj\xf8`\x8c\x8a?\xb9\x87\x9d\xb1r\xb8\x87\xe8\x07Y', # noqa 'path': b'/tmp/tmp7w3oi_j8/sample-folder', 'perms': git.GitPerm.TREE, - 'children': __children_hashes( + 'children': _children_hashes( b'/tmp/tmp7w3oi_j8/sample-folder') }, { @@ -548,7 +548,7 @@ class TestObjectsPerType(WithSampleFolderChecksums, unittest.TestCase): 'sha1_git': b'K\x82]\xc6B\xcbn\xb9\xa0`\xe5K\xf8\xd6\x92\x88\xfb\xeeI\x04', # noqa 'path': b'/tmp/tmp7w3oi_j8/sample-folder/empty-folder', 'perms': git.GitPerm.TREE, - 'children': __children_hashes( + 'children': _children_hashes( b'/tmp/tmp7w3oi_j8/sample-folder/empty-folder') }, { @@ -557,7 +557,7 @@ class TestObjectsPerType(WithSampleFolderChecksums, unittest.TestCase): 'sha1_git': b'<\x1fW\x83\x94\xf4b?t\xa0\xba\x7f\xe7ar\x9fY\xfcn\xc4', # noqa 'path': b'/tmp/tmp7w3oi_j8/sample-folder/bar', 'perms': git.GitPerm.TREE, - 'children': __children_hashes( + 'children': _children_hashes( b'/tmp/tmp7w3oi_j8/sample-folder/bar') }, { @@ -566,7 +566,7 @@ class TestObjectsPerType(WithSampleFolderChecksums, unittest.TestCase): 'sha1_git': b'\xc3\x02\x0fk\xf15\xa3\x8cm\xf3\xaf\xeb_\xb3\x822\xc5\xe0p\x87', # noqa 'path': b'/tmp/tmp7w3oi_j8/sample-folder/bar/barfoo', 'perms': git.GitPerm.TREE, - 'children': __children_hashes( + 'children': _children_hashes( b'/tmp/tmp7w3oi_j8/sample-folder/bar/barfoo'), }, { @@ -575,7 +575,7 @@ class TestObjectsPerType(WithSampleFolderChecksums, unittest.TestCase): 'sha1_git': b'+A\xc4\x0f\r\x1f\xbf\xfc\xba\x12I}\xb7\x1f\xba\x83\xfc\xca\x96\xe5', # noqa 'path': b'/tmp/tmp7w3oi_j8/sample-folder/foo', 'perms': git.GitPerm.TREE, - 'children': __children_hashes( + 'children': _children_hashes( b'/tmp/tmp7w3oi_j8/sample-folder/foo') }, ] @@ -585,7 +585,7 @@ class TestObjectsPerType(WithSampleFolderChecksums, unittest.TestCase): # when actual_sha1_trees = list( - ((c['sha1_git'], c['type'], __children_hashes(c['path'])) + ((c['sha1_git'], c['type'], _children_hashes(c['path'])) for c in git.objects_per_type(git.GitType.TREE, self.objects))) self.assertEquals(len(actual_sha1_trees), len(expected_sha1_trees)) diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py index e1adfea34f08ac419af4fc424fb4460e3eaabc6e..16a34bb9c76561bde5e97e42e2298f973b37e75e 100644 --- a/swh/model/tests/test_identifiers.py +++ b/swh/model/tests/test_identifiers.py @@ -55,15 +55,23 @@ class UtilityFunctionsIdentifier(unittest.TestCase): class UtilityFunctionsDateOffset(unittest.TestCase): def setUp(self): - self.date = datetime.datetime( - 2015, 11, 22, 16, 33, 56, tzinfo=datetime.timezone.utc) - self.date_int = int(self.date.timestamp()) - self.date_repr = b'1448210036' - - self.date_microseconds = datetime.datetime( - 2015, 11, 22, 16, 33, 56, 2342, tzinfo=datetime.timezone.utc) - self.date_microseconds_float = self.date_microseconds.timestamp() - self.date_microseconds_repr = b'1448210036.002342' + self.dates = { + b'1448210036': { + 'seconds': 1448210036, + 'microseconds': 0, + }, + b'1448210036.002342': { + 'seconds': 1448210036, + 'microseconds': 2342, + }, + b'1448210036.12': { + 'seconds': 1448210036, + 'microseconds': 120000, + } + } + self.broken_dates = [ + 1448210036.12, + ] self.offsets = { 0: b'+0000', @@ -73,12 +81,14 @@ class UtilityFunctionsDateOffset(unittest.TestCase): @istest def format_date(self): - for date in [self.date, self.date_int]: - self.assertEqual(identifiers.format_date(date), self.date_repr) + for date_repr, date in self.dates.items(): + self.assertEqual(identifiers.format_date(date), date_repr) - for date in [self.date_microseconds, self.date_microseconds_float]: - self.assertEqual(identifiers.format_date(date), - self.date_microseconds_repr) + @istest + def format_date_fail(self): + for date in self.broken_dates: + with self.assertRaises(ValueError): + identifiers.format_date(date) @istest def format_offset(self): @@ -285,7 +295,7 @@ dg1KdHOa34shrKDaOVzW 'email': b'robot@softwareheritage.org', }, 'date': { - 'timestamp': 1437047495.0, + 'timestamp': {'seconds': 1437047495}, 'offset': 0, 'negative_utc': False, }, @@ -349,7 +359,7 @@ dg1KdHOa34shrKDaOVzW 'fullname': b'Jiang Xin <worldhello.net@gmail.com>', }, 'date': { - 'timestamp': '1428538899', + 'timestamp': 1428538899, 'offset': 480, }, 'committer': { @@ -357,7 +367,7 @@ dg1KdHOa34shrKDaOVzW 'email': b'worldhello.net@gmail.com', }, 'committer_date': { - 'timestamp': '1428538899', + 'timestamp': 1428538899, 'offset': 480, }, 'metadata': { @@ -383,7 +393,7 @@ dg1KdHOa34shrKDaOVzW 'fullname': b'Jiang Xin <worldhello.net@gmail.com>', }, 'date': { - 'timestamp': '1428538899', + 'timestamp': 1428538899, 'offset': 480, }, 'committer': { @@ -391,7 +401,7 @@ dg1KdHOa34shrKDaOVzW 'email': b'worldhello.net@gmail.com', }, 'committer_date': { - 'timestamp': '1428538899', + 'timestamp': 1428538899, 'offset': 480, }, 'message': None, @@ -408,7 +418,7 @@ dg1KdHOa34shrKDaOVzW 'fullname': b'Jiang Xin <worldhello.net@gmail.com>', }, 'date': { - 'timestamp': '1428538899', + 'timestamp': 1428538899, 'offset': 480, }, 'committer': { @@ -416,7 +426,7 @@ dg1KdHOa34shrKDaOVzW 'email': b'worldhello.net@gmail.com', }, 'committer_date': { - 'timestamp': '1428538899', + 'timestamp': 1428538899, 'offset': 480, }, 'message': b'', @@ -592,7 +602,7 @@ o6X/3T+vm8K3bf3driRr34c= 'target': '54e9abca4c77421e2921f5f156c9fe4a9f7441c7', 'target_type': 'revision', 'date': { - 'timestamp': 1225281976.0, + 'timestamp': {'seconds': 1225281976}, 'offset': 0, 'negative_utc': True, }, diff --git a/version.txt b/version.txt index df588ce3c11f55bac17119d4e9b7254bbb094667..ec62758902ca885563c595cabebfcd3ea0c0643d 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.12-0-gcec445d \ No newline at end of file +v0.0.13-0-g58c5a24 \ No newline at end of file