diff --git a/PKG-INFO b/PKG-INFO index 25b0fa6e158aefbb8b416e30cc73a236d4fc20a8..10d2af839859e665c70f477bb55d0f69069a8680 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: swh.model -Version: 0.0.47 +Version: 0.0.48 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers @@ -36,3 +36,4 @@ Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing +Provides-Extra: cli diff --git a/requirements.txt b/requirements.txt index 5962345374109ec709acb6ed0c66b424d29d049c..98825fa3ef8c1821335b73ab07e581747024d346 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,6 @@ # should match https://pypi.python.org/pypi names. For the full spec or # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html vcversioner -Click attrs hypothesis python-dateutil diff --git a/setup.py b/setup.py index 5604841510d2a03557c1487c1b74c811158347a8..6f2eb37c6904f74a18844b9f67d00cc1bc8ac4b2 100755 --- a/setup.py +++ b/setup.py @@ -49,7 +49,10 @@ setup( setup_requires=['vcversioner'], install_requires=(parse_requirements() + parse_requirements('swh') + blake2_requirements), - extras_require={'testing': parse_requirements('test')}, + extras_require={ + 'cli': parse_requirements('cli'), + 'testing': parse_requirements('test'), + }, vcversioner={}, include_package_data=True, entry_points=''' diff --git a/swh.model.egg-info/PKG-INFO b/swh.model.egg-info/PKG-INFO index 25b0fa6e158aefbb8b416e30cc73a236d4fc20a8..10d2af839859e665c70f477bb55d0f69069a8680 100644 --- a/swh.model.egg-info/PKG-INFO +++ b/swh.model.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: swh.model -Version: 0.0.47 +Version: 0.0.48 Summary: Software Heritage data model Home-page: https://forge.softwareheritage.org/diffusion/DMOD/ Author: Software Heritage developers @@ -36,3 +36,4 @@ Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Description-Content-Type: text/markdown Provides-Extra: testing +Provides-Extra: cli diff --git a/swh.model.egg-info/SOURCES.txt b/swh.model.egg-info/SOURCES.txt index c631049549f1f0cef407e2795efdfa325f477572..04bf4380c214574dcb3adb3f8848ca750233e2e3 100644 --- a/swh.model.egg-info/SOURCES.txt +++ b/swh.model.egg-info/SOURCES.txt @@ -39,6 +39,7 @@ swh/model/tests/test_model.py swh/model/tests/test_toposort.py swh/model/tests/test_validators.py swh/model/tests/data/dir-folders/sample-folder.tgz +swh/model/tests/data/repos/sample-repo.tgz swh/model/tests/fields/__init__.py swh/model/tests/fields/test_compound.py swh/model/tests/fields/test_hashes.py diff --git a/swh.model.egg-info/requires.txt b/swh.model.egg-info/requires.txt index 8b1e4f2a673979a83a666acf85010b5904e7414d..88a6ba3df5f19879e693e8fe9f62e4d4b8aada06 100644 --- a/swh.model.egg-info/requires.txt +++ b/swh.model.egg-info/requires.txt @@ -1,5 +1,4 @@ vcversioner -Click attrs hypothesis python-dateutil @@ -7,5 +6,11 @@ python-dateutil [:python_version < "3.6"] pyblake2 +[cli] +Click +dulwich + [testing] +Click +dulwich pytest diff --git a/swh/model/cli.py b/swh/model/cli.py index 853efa995876adb1c8eef54503e52fb5b5a7b988..991bc46ec72bf9657d7db9965b33e08ab83ff29d 100644 --- a/swh/model/cli.py +++ b/swh/model/cli.py @@ -4,12 +4,14 @@ # See top-level LICENSE file for more information import click +import dulwich.repo import os import sys from functools import partial from urllib.parse import urlparse +from swh.model import hashutil from swh.model import identifiers as pids from swh.model.exceptions import ValidationError from swh.model.from_disk import Content, Directory @@ -17,6 +19,15 @@ from swh.model.from_disk import Content, Directory CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help']) +# Mapping between dulwich types and Software Heritage ones. Used by snapshot ID +# computation. +_DULWICH_TYPES = { + b'blob': 'content', + b'tree': 'directory', + b'commit': 'revision', + b'tag': 'release', +} + class PidParamType(click.ParamType): name = 'persistent identifier' @@ -45,6 +56,26 @@ def pid_of_origin(url): return str(pid) +def pid_of_git_repo(path): + repo = dulwich.repo.Repo(path) + + branches = {} + for ref, target in repo.refs.as_dict().items(): + obj = repo[target] + if obj: + branches[ref] = { + 'target': hashutil.bytehex_to_hash(target), + 'target_type': _DULWICH_TYPES[obj.type_name], + } + else: + branches[ref] = None + snapshot = {'branches': branches} + + pid = pids.PersistentId(object_type='snapshot', + object_id=pids.snapshot_identifier(snapshot)) + return str(pid) + + def identify_object(obj_type, follow_symlinks, obj): if obj_type == 'auto': if os.path.isfile(obj): @@ -73,6 +104,8 @@ def identify_object(obj_type, follow_symlinks, obj): pid = pid_of_dir(path) elif obj_type == 'origin': pid = pid_of_origin(obj) + elif obj_type == 'snapshot': + pid = pid_of_git_repo(obj) else: # shouldn't happen, due to option validation raise click.BadParameter('invalid object type: ' + obj_type) @@ -89,7 +122,8 @@ def identify_object(obj_type, follow_symlinks, obj): @click.option('--filename/--no-filename', 'show_filename', default=True, help='show/hide file name (default: show)') @click.option('--type', '-t', 'obj_type', default='auto', - type=click.Choice(['auto', 'content', 'directory', 'origin']), + type=click.Choice(['auto', 'content', 'directory', 'origin', + 'snapshot']), help='type of object to identify (default: auto)') @click.option('--verify', '-v', metavar='PID', type=PidParamType(), help='reference identifier to be compared with computed one') @@ -116,7 +150,12 @@ def identify(obj_type, verify, show_filename, follow_symlinks, objects): $ swh identify --no-filename /usr/src/linux/kernel/ swh:1:dir:f9f858a48d663b3809c9e2f336412717496202ab - """ + \b + $ git clone --mirror https://forge.softwareheritage.org/source/helloworld.git + $ swh identify --type snapshot helloworld.git/ + swh:1:snp:510aa88bdc517345d258c1fc2babcd0e1f905e93 helloworld.git + + """ # NoQA # overlong lines in shell examples are fine if verify and len(objects) != 1: raise click.BadParameter('verification requires a single object') diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py index 6a00068145034d088cc3240fb20136df7b227a23..66f97dd52bcd695b62c9289d939b09b0b36ef85c 100644 --- a/swh/model/identifiers.py +++ b/swh/model/identifiers.py @@ -606,7 +606,11 @@ def snapshot_identifier(snapshot, *, ignore_unresolved=False): def origin_identifier(origin): - """Return the intrinsic identifier for an origin.""" + """Return the intrinsic identifier for an origin. + + An origin's identifier is the sha1 checksum of the entire origin URL + + """ return hashlib.sha1(origin['url'].encode('ascii')).hexdigest() @@ -695,6 +699,13 @@ class PersistentId(_PersistentId): if not o: raise ValidationError('Wrong input: Supported types are %s' % ( list(_object_type_map.keys()))) + if namespace != PID_NAMESPACE: + raise ValidationError( + "Wrong format: only supported namespace is '%s'" + % PID_NAMESPACE) + if scheme_version != PID_VERSION: + raise ValidationError( + 'Wrong format: only supported version is %d' % PID_VERSION) # internal swh representation resolution if isinstance(object_id, dict): object_id = object_id[o['key_id']] @@ -773,22 +784,8 @@ def parse_persistent_identifier(persistent_id): # Checking for parsing errors _ns, _version, _type, _id = pid_data - if _ns != PID_NAMESPACE: - raise ValidationError( - "Wrong format: only supported namespace is '%s'" % PID_NAMESPACE) - - if _version != str(PID_VERSION): - raise ValidationError( - 'Wrong format: only supported version is %d' % PID_VERSION) - pid_data[1] = int(pid_data[1]) - expected_types = PID_TYPES - if _type not in expected_types: - raise ValidationError( - 'Wrong format: Supported types are %s' % ( - ', '.join(expected_types))) - for otype, data in _object_type_map.items(): if _type == data['short_name']: pid_data[2] = otype @@ -798,12 +795,6 @@ def parse_persistent_identifier(persistent_id): raise ValidationError( 'Wrong format: Identifier should be present') - try: - validate_sha1(_id) - except ValidationError: - raise ValidationError( - 'Wrong format: Identifier should be a valid hash') - persistent_id_metadata = {} for part in persistent_id_parts: try: @@ -813,4 +804,4 @@ def parse_persistent_identifier(persistent_id): msg = 'Contextual data is badly formatted, form key=val expected' raise ValidationError(msg) pid_data.append(persistent_id_metadata) - return PersistentId._make(pid_data) + return PersistentId(*pid_data) diff --git a/swh/model/model.py b/swh/model/model.py index 70559dbbf685cfdf0240e5302dd40cfc0205c372..6824217d4977eef188e0a81cd6e8ed6610ce6498 100644 --- a/swh/model/model.py +++ b/swh/model/model.py @@ -97,6 +97,11 @@ class Origin(BaseModel): url = attr.ib(type=str) type = attr.ib(type=Optional[str], default=None) + def to_dict(self): + r = super().to_dict() + r.pop('type', None) + return r + @attr.s class OriginVisit(BaseModel): @@ -122,6 +127,7 @@ class OriginVisit(BaseModel): ov = super().to_dict() if ov['visit'] is None: del ov['visit'] + ov['origin'] = self.origin.to_dict() return ov @classmethod diff --git a/swh/model/tests/data/repos/sample-repo.tgz b/swh/model/tests/data/repos/sample-repo.tgz new file mode 100644 index 0000000000000000000000000000000000000000..5b5baa70b6be89161eebc0cc9b7589481a439504 Binary files /dev/null and b/swh/model/tests/data/repos/sample-repo.tgz differ diff --git a/swh/model/tests/test_cli.py b/swh/model/tests/test_cli.py index 7f70b46d119192790d2e6c4f1740f30cefbf1a82..990ca08ea3cf1a6c7d68b802f05c6af50495bce3 100644 --- a/swh/model/tests/test_cli.py +++ b/swh/model/tests/test_cli.py @@ -1,9 +1,10 @@ -# Copyright (C) 2018 The Software Heritage developers +# Copyright (C) 2018-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os +import tarfile import tempfile import unittest @@ -45,6 +46,19 @@ class TestIdentify(DataMixin, unittest.TestCase): self.assertPidOK(result, 'swh:1:dir:e8b0f1466af8608c8a3fb9879db172b887e80759') + def test_snapshot_id(self): + """identify a snapshot""" + tarball = os.path.join(os.path.dirname(__file__), 'data', 'repos', + 'sample-repo.tgz') + with tempfile.TemporaryDirectory(prefix='swh.model.cli') as d: + with tarfile.open(tarball, 'r:gz') as t: + t.extractall(d) + repo_dir = os.path.join(d, 'sample-repo') + result = self.runner.invoke(cli.identify, + ['--type', 'snapshot', repo_dir]) + self.assertPidOK(result, + 'swh:1:snp:9dc0fc035aabe293f5faf6c362a59513454a170d') # NoQA + def test_origin_id(self): """identify an origin URL""" url = 'https://github.com/torvalds/linux' diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py index 83294d5a3b835ad54782e3cad1f526138aeae33b..9e6cd571f91442db8182b6433ef47a893d5a53b7 100644 --- a/swh/model/tests/test_identifiers.py +++ b/swh/model/tests/test_identifiers.py @@ -10,8 +10,8 @@ import unittest from swh.model import hashutil, identifiers from swh.model.exceptions import ValidationError from swh.model.identifiers import (CONTENT, DIRECTORY, - PID_TYPES, RELEASE, - REVISION, SNAPSHOT, PersistentId) + RELEASE, REVISION, + SNAPSHOT, PersistentId) class UtilityFunctionsIdentifier(unittest.TestCase): @@ -768,8 +768,8 @@ class SnapshotIdentifier(unittest.TestCase): 'swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453', None, {}), (RELEASE, _release_id, - 'swh:2:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f', - 2, {}), + 'swh:1:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f', + 1, {}), (REVISION, _revision_id, 'swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d', None, {}), @@ -783,8 +783,8 @@ class SnapshotIdentifier(unittest.TestCase): 'swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453', None, {}), (RELEASE, _release, - 'swh:2:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f', - 2, {}), + 'swh:1:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f', + 1, {}), (REVISION, _revision, 'swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d', None, {}), @@ -811,12 +811,12 @@ class SnapshotIdentifier(unittest.TestCase): _snapshot_id = 'notahash4bc0bf3d81436bf980b46e98bd338453' _snapshot = {'id': _snapshot_id} - for _type, _hash, _error in [ - (SNAPSHOT, _snapshot_id, 'Unexpected characters'), - (SNAPSHOT, _snapshot, 'Unexpected characters'), - ('foo', '', 'Wrong input: Supported types are'), + for _type, _hash in [ + (SNAPSHOT, _snapshot_id), + (SNAPSHOT, _snapshot), + ('foo', ''), ]: - with self.assertRaisesRegex(ValidationError, _error): + with self.assertRaises(ValidationError): identifiers.persistent_identifier(_type, _hash) def test_parse_persistent_identifier(self): @@ -866,34 +866,37 @@ class SnapshotIdentifier(unittest.TestCase): self.assertEqual(actual_result, expected_result) def test_parse_persistent_identifier_parsing_error(self): - for pid, _error in [ - ('swh:1:cnt', - 'Wrong format: There should be 4 mandatory values'), - ('swh:1:', - 'Wrong format: There should be 4 mandatory values'), - ('swh:', - 'Wrong format: There should be 4 mandatory values'), - ('swh:1:cnt:', - 'Wrong format: Identifier should be present'), - ('foo:1:cnt:abc8bc9d7a6bcf6db04f476d29314f157507d505', - 'Wrong format: only supported namespace is \'swh\''), - ('swh:2:dir:def8bc9d7a6bcf6db04f476d29314f157507d505', - 'Wrong format: only supported version is 1'), - ('swh:1:foo:fed8bc9d7a6bcf6db04f476d29314f157507d505', - 'Wrong format: Supported types are %s' % ( - ', '.join(PID_TYPES))), + for pid in [ + ('swh:1:cnt'), + ('swh:1:'), + ('swh:'), + ('swh:1:cnt:'), + ('foo:1:cnt:abc8bc9d7a6bcf6db04f476d29314f157507d505'), + ('swh:2:dir:def8bc9d7a6bcf6db04f476d29314f157507d505'), + ('swh:1:foo:fed8bc9d7a6bcf6db04f476d29314f157507d505'), ('swh:1:dir:0b6959356d30f1a4e9b7f6bca59b9a336464c03d;invalid;' - 'malformed', - 'Contextual data is badly formatted, form key=val expected'), - ('swh:1:snp:gh6959356d30f1a4e9b7f6bca59b9a336464c03d', - 'Wrong format: Identifier should be a valid hash'), - ('swh:1:snp:foo', - 'Wrong format: Identifier should be a valid hash') + 'malformed'), + ('swh:1:snp:gh6959356d30f1a4e9b7f6bca59b9a336464c03d'), + ('swh:1:snp:foo'), ]: - with self.assertRaisesRegex( - ValidationError, _error): + with self.assertRaises(ValidationError): identifiers.parse_persistent_identifier(pid) + def test_persistentid_class_validation_error(self): + for _ns, _version, _type, _id in [ + ('foo', 1, CONTENT, 'abc8bc9d7a6bcf6db04f476d29314f157507d505'), + ('swh', 2, DIRECTORY, 'def8bc9d7a6bcf6db04f476d29314f157507d505'), + ('swh', 1, 'foo', 'fed8bc9d7a6bcf6db04f476d29314f157507d505'), + ('swh', 1, SNAPSHOT, 'gh6959356d30f1a4e9b7f6bca59b9a336464c03d'), + ]: + with self.assertRaises(ValidationError): + PersistentId( + namespace=_ns, + scheme_version=_version, + object_type=_type, + object_id=_id + ) + class OriginIdentifier(unittest.TestCase): def setUp(self): diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py index f91868577235614cd5733957a0af6d19a289c046..b2cc3edc5fa9c2a6cb361942b9ed9f56a6b83ca0 100644 --- a/swh/model/tests/test_model.py +++ b/swh/model/tests/test_model.py @@ -8,12 +8,16 @@ import copy from hypothesis import given from swh.model.model import Content -from swh.model.hypothesis_strategies import objects +from swh.model.hypothesis_strategies import objects, origins, origin_visits @given(objects()) def test_todict_inverse_fromdict(objtype_and_obj): (obj_type, obj) = objtype_and_obj + + if obj_type in ('origin', 'origin_visit'): + return + obj_as_dict = obj.to_dict() obj_as_dict_copy = copy.deepcopy(obj_as_dict) @@ -27,6 +31,23 @@ def test_todict_inverse_fromdict(objtype_and_obj): assert obj_as_dict == type(obj).from_dict(obj_as_dict).to_dict() +@given(origins()) +def test_todict_origins(origin): + obj = origin.to_dict() + + assert 'type' not in obj + assert type(origin)(url=origin.url) == type(origin).from_dict(obj) + + +@given(origin_visits()) +def test_todict_origin_visits(origin_visit): + obj = origin_visit.to_dict() + + assert 'type' not in obj['origin'] + origin_visit.origin.type = None + assert origin_visit == type(origin_visit).from_dict(obj) + + def test_content_get_hash(): hashes = dict( sha1=b'foo', sha1_git=b'bar', sha256=b'baz', blake2s256=b'qux') diff --git a/version.txt b/version.txt index 82e6a188bb01871d82dd5b7fcb4e3a8288a19e83..ac1c1caa880802e7a35a421dec20f66df38107bd 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.47-0-g340b001 \ No newline at end of file +v0.0.48-0-gb2c21d3 \ No newline at end of file