diff --git a/mypy.ini b/mypy.ini index 656d7bb315fb0344e7b2662e7ad9abc99cd92b5b..fc409768187382ed4e220ebab45a141cb04a5773 100644 --- a/mypy.ini +++ b/mypy.ini @@ -8,6 +8,9 @@ warn_unused_ignores = True [mypy-django.*] # false positive, only used my hypotesis' extras ignore_missing_imports = True +[mypy-dulwich.*] # imported by swh.model.cli; silence mypy's missing-import error +ignore_missing_imports = True + [mypy-pkg_resources.*] ignore_missing_imports = True diff --git a/requirements.txt b/requirements.txt index 5962345374109ec709acb6ed0c66b424d29d049c..236db633091191d4fef2655c73b4425beef70ec8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ Click attrs hypothesis python-dateutil +dulwich diff --git a/swh/model/cli.py b/swh/model/cli.py index 853efa995876adb1c8eef54503e52fb5b5a7b988..991bc46ec72bf9657d7db9965b33e08ab83ff29d 100644 --- a/swh/model/cli.py +++ b/swh/model/cli.py @@ -4,12 +4,14 @@ # See top-level LICENSE file for more information import click +import dulwich.repo import os import sys from functools import partial from urllib.parse import urlparse +from swh.model import hashutil from swh.model import identifiers as pids from swh.model.exceptions import ValidationError from swh.model.from_disk import Content, Directory @@ -17,6 +19,15 @@ from swh.model.from_disk import Content, Directory CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help']) +# Mapping between dulwich types and Software Heritage ones. Used by snapshot ID +# computation. 
+_DULWICH_TYPES = { + b'blob': 'content', + b'tree': 'directory', + b'commit': 'revision', + b'tag': 'release', +} + class PidParamType(click.ParamType): name = 'persistent identifier' @@ -45,6 +56,26 @@ def pid_of_origin(url): return str(pid) +def pid_of_git_repo(path): + repo = dulwich.repo.Repo(path) + + branches = {} + for ref, target in repo.refs.as_dict().items(): + obj = repo[target] + if obj: + branches[ref] = { + 'target': hashutil.bytehex_to_hash(target), + 'target_type': _DULWICH_TYPES[obj.type_name], + } + else: + branches[ref] = None + snapshot = {'branches': branches} + + pid = pids.PersistentId(object_type='snapshot', + object_id=pids.snapshot_identifier(snapshot)) + return str(pid) + + def identify_object(obj_type, follow_symlinks, obj): if obj_type == 'auto': if os.path.isfile(obj): @@ -73,6 +104,8 @@ def identify_object(obj_type, follow_symlinks, obj): pid = pid_of_dir(path) elif obj_type == 'origin': pid = pid_of_origin(obj) + elif obj_type == 'snapshot': + pid = pid_of_git_repo(obj) else: # shouldn't happen, due to option validation raise click.BadParameter('invalid object type: ' + obj_type) @@ -89,7 +122,8 @@ def identify_object(obj_type, follow_symlinks, obj): @click.option('--filename/--no-filename', 'show_filename', default=True, help='show/hide file name (default: show)') @click.option('--type', '-t', 'obj_type', default='auto', - type=click.Choice(['auto', 'content', 'directory', 'origin']), + type=click.Choice(['auto', 'content', 'directory', 'origin', + 'snapshot']), help='type of object to identify (default: auto)') @click.option('--verify', '-v', metavar='PID', type=PidParamType(), help='reference identifier to be compared with computed one') @@ -116,7 +150,12 @@ def identify(obj_type, verify, show_filename, follow_symlinks, objects): $ swh identify --no-filename /usr/src/linux/kernel/ swh:1:dir:f9f858a48d663b3809c9e2f336412717496202ab - """ + \b + $ git clone --mirror https://forge.softwareheritage.org/source/helloworld.git + $ swh 
identify --type snapshot helloworld.git/ + swh:1:snp:510aa88bdc517345d258c1fc2babcd0e1f905e93 helloworld.git + + """ # NoQA # overlong lines in shell examples are fine if verify and len(objects) != 1: raise click.BadParameter('verification requires a single object') diff --git a/swh/model/tests/data/repos/sample-repo.tgz b/swh/model/tests/data/repos/sample-repo.tgz new file mode 100644 index 0000000000000000000000000000000000000000..5b5baa70b6be89161eebc0cc9b7589481a439504 Binary files /dev/null and b/swh/model/tests/data/repos/sample-repo.tgz differ diff --git a/swh/model/tests/test_cli.py b/swh/model/tests/test_cli.py index 7f70b46d119192790d2e6c4f1740f30cefbf1a82..990ca08ea3cf1a6c7d68b802f05c6af50495bce3 100644 --- a/swh/model/tests/test_cli.py +++ b/swh/model/tests/test_cli.py @@ -1,9 +1,10 @@ -# Copyright (C) 2018 The Software Heritage developers +# Copyright (C) 2018-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os +import tarfile import tempfile import unittest @@ -45,6 +46,19 @@ class TestIdentify(DataMixin, unittest.TestCase): self.assertPidOK(result, 'swh:1:dir:e8b0f1466af8608c8a3fb9879db172b887e80759') + def test_snapshot_id(self): + """identify a snapshot""" + tarball = os.path.join(os.path.dirname(__file__), 'data', 'repos', + 'sample-repo.tgz') + with tempfile.TemporaryDirectory(prefix='swh.model.cli') as d: + with tarfile.open(tarball, 'r:gz') as t: + t.extractall(d) + repo_dir = os.path.join(d, 'sample-repo') + result = self.runner.invoke(cli.identify, + ['--type', 'snapshot', repo_dir]) + self.assertPidOK(result, + 'swh:1:snp:9dc0fc035aabe293f5faf6c362a59513454a170d') # NoQA + def test_origin_id(self): """identify an origin URL""" url = 'https://github.com/torvalds/linux'