Skip to content
Snippets Groups Projects
Commit a9af3e7c authored by Stefano Zacchiroli's avatar Stefano Zacchiroli
Browse files

swh identify: add support to compute snapshot PIDs of on-disk git repo

parent febe8002
No related branches found
No related tags found
No related merge requests found
......@@ -8,6 +8,9 @@ warn_unused_ignores = True
[mypy-django.*] # false positive, only used by hypothesis' extras
ignore_missing_imports = True
[mypy-dulwich.*] # false positive, only used by hypothesis' extras
ignore_missing_imports = True
[mypy-pkg_resources.*]
ignore_missing_imports = True
......
......@@ -6,3 +6,4 @@ Click
attrs
hypothesis
python-dateutil
dulwich
......@@ -4,12 +4,14 @@
# See top-level LICENSE file for more information
import click
import dulwich.repo
import os
import sys
from functools import partial
from urllib.parse import urlparse
from swh.model import hashutil
from swh.model import identifiers as pids
from swh.model.exceptions import ValidationError
from swh.model.from_disk import Content, Directory
......@@ -17,6 +19,15 @@ from swh.model.from_disk import Content, Directory
CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])
# Mapping between dulwich types and Software Heritage ones. Used by snapshot ID
# computation.
_DULWICH_TYPES = {
b'blob': 'content',
b'tree': 'directory',
b'commit': 'revision',
b'tag': 'release',
}
class PidParamType(click.ParamType):
name = 'persistent identifier'
......@@ -45,6 +56,26 @@ def pid_of_origin(url):
return str(pid)
def pid_of_git_repo(path):
    """Compute the snapshot persistent identifier of an on-disk git repo.

    Every reference returned by the repository's ref container becomes a
    snapshot branch whose target type is derived from the dulwich type of
    the referenced object (see ``_DULWICH_TYPES``).

    Args:
        path: filesystem path of the git repository, as accepted by
            ``dulwich.repo.Repo``

    Returns:
        the snapshot persistent identifier, as a string

    """
    repo = dulwich.repo.Repo(path)

    branches = {}
    for ref, target in repo.refs.as_dict().items():
        try:
            obj = repo[target]
        except KeyError:
            # The ref points to an object missing from the object store:
            # dulwich signals that with KeyError rather than a falsy value,
            # so record the branch as dangling instead of crashing.
            branches[ref] = None
            continue

        # No truthiness test on obj: container-like objects (e.g., an empty
        # tree) evaluate to False even though they do exist.
        branches[ref] = {
            'target': hashutil.bytehex_to_hash(target),
            'target_type': _DULWICH_TYPES[obj.type_name],
        }

    # NOTE(review): symbolic refs (e.g., HEAD) appear to be resolved by
    # as_dict() and hence recorded as regular branches rather than aliases;
    # confirm this is the intended snapshot layout.
    snapshot = {'branches': branches}
    pid = pids.PersistentId(object_type='snapshot',
                            object_id=pids.snapshot_identifier(snapshot))
    return str(pid)
def identify_object(obj_type, follow_symlinks, obj):
if obj_type == 'auto':
if os.path.isfile(obj):
......@@ -73,6 +104,8 @@ def identify_object(obj_type, follow_symlinks, obj):
pid = pid_of_dir(path)
elif obj_type == 'origin':
pid = pid_of_origin(obj)
elif obj_type == 'snapshot':
pid = pid_of_git_repo(obj)
else: # shouldn't happen, due to option validation
raise click.BadParameter('invalid object type: ' + obj_type)
......@@ -89,7 +122,8 @@ def identify_object(obj_type, follow_symlinks, obj):
@click.option('--filename/--no-filename', 'show_filename', default=True,
help='show/hide file name (default: show)')
@click.option('--type', '-t', 'obj_type', default='auto',
type=click.Choice(['auto', 'content', 'directory', 'origin']),
type=click.Choice(['auto', 'content', 'directory', 'origin',
'snapshot']),
help='type of object to identify (default: auto)')
@click.option('--verify', '-v', metavar='PID', type=PidParamType(),
help='reference identifier to be compared with computed one')
......@@ -116,7 +150,12 @@ def identify(obj_type, verify, show_filename, follow_symlinks, objects):
$ swh identify --no-filename /usr/src/linux/kernel/
swh:1:dir:f9f858a48d663b3809c9e2f336412717496202ab
"""
\b
$ git clone --mirror https://forge.softwareheritage.org/source/helloworld.git
$ swh identify --type snapshot helloworld.git/
swh:1:snp:510aa88bdc517345d258c1fc2babcd0e1f905e93 helloworld.git
""" # NoQA # overlong lines in shell examples are fine
if verify and len(objects) != 1:
raise click.BadParameter('verification requires a single object')
......
File added
# Copyright (C) 2018 The Software Heritage developers
# Copyright (C) 2018-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
import tarfile
import tempfile
import unittest
......@@ -45,6 +46,19 @@ class TestIdentify(DataMixin, unittest.TestCase):
self.assertPidOK(result,
'swh:1:dir:e8b0f1466af8608c8a3fb9879db172b887e80759')
def test_snapshot_id(self):
    """identify a snapshot"""
    # sample git repository shipped as a tarball next to this test module
    data_dir = os.path.join(os.path.dirname(__file__), 'data', 'repos')
    tarball = os.path.join(data_dir, 'sample-repo.tgz')
    expected = 'swh:1:snp:9dc0fc035aabe293f5faf6c362a59513454a170d'

    with tempfile.TemporaryDirectory(prefix='swh.model.cli') as d, \
            tarfile.open(tarball, 'r:gz') as t:
        # unpack into a scratch directory, then identify the extracted repo
        t.extractall(d)
        repo_dir = os.path.join(d, 'sample-repo')
        cli_args = ['--type', 'snapshot', repo_dir]
        result = self.runner.invoke(cli.identify, cli_args)
        self.assertPidOK(result, expected)
def test_origin_id(self):
"""identify an origin URL"""
url = 'https://github.com/torvalds/linux'
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment