From 0d5bc1774829f2175e7ff4f2c301e3648c7cfa05 Mon Sep 17 00:00:00 2001
From: Stefano Zacchiroli <zack@upsilon.cc>
Date: Sat, 16 Jun 2018 22:40:44 +0200
Subject: [PATCH] add swh-identify CLI tool to compute persistent identifiers

Currently only content and directory object types are supported, but more can
be added in the future.

Closes T1039
---
 bin/swh-hash-file           | 32 -------------
 setup.py                    |  4 ++
 swh/model/cli.py            | 96 +++++++++++++++++++++++++++++++++++++
 swh/model/tests/test_cli.py | 71 +++++++++++++++++++++++++++
 4 files changed, 171 insertions(+), 32 deletions(-)
 delete mode 100755 bin/swh-hash-file
 create mode 100644 swh/model/cli.py
 create mode 100644 swh/model/tests/test_cli.py

diff --git a/bin/swh-hash-file b/bin/swh-hash-file
deleted file mode 100755
index c30de78f..00000000
--- a/bin/swh-hash-file
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/python3
-
-# Copyright (C) 2018  The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-import sys
-
-from swh.model.from_disk import Content
-from swh.model.hashutil import hash_to_hex
-
-
-HASH_ALGO = 'sha1_git'
-
-
-def hash_file(fname):
-    return hash_to_hex(Content.from_file(path=fname.encode()).hash)
-
-
-def main(fnames):
-    for f in fnames:
-        print(f, hash_file(f), sep='\t')
-
-
-if __name__ == '__main__':
-    fnames = sys.argv[1:]
-    if not fnames:
-        print('Usage: swh-hash-file FILE...')
-        sys.exit(2)
-
-    main(fnames)
diff --git a/setup.py b/setup.py
index 232f3bc2..dd32e44d 100644
--- a/setup.py
+++ b/setup.py
@@ -31,6 +31,10 @@ setup(
     packages=find_packages(),  # packages's modules
     scripts=[],   # scripts to package
     install_requires=parse_requirements() + extra_requirements,
+    entry_points='''
+        [console_scripts]
+        swh-identify=swh.model.cli:identify
+    ''',
     setup_requires=['vcversioner'],
     vcversioner={},
     include_package_data=True,
diff --git a/swh/model/cli.py b/swh/model/cli.py
new file mode 100644
index 00000000..9e0471a4
--- /dev/null
+++ b/swh/model/cli.py
@@ -0,0 +1,96 @@
+# Copyright (C) 2018  The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import click
+import os
+import sys
+
+from swh.model import identifiers as pids
+from swh.model.from_disk import Content, Directory
+
+
+class PidParamType(click.ParamType):
+    """Click parameter type accepting only strings that parse as PIDs."""
+    name = 'persistent identifier'
+
+    def convert(self, value, param, ctx):
+        try:  # parsed for validation only; the caller needs just the string
+            pids.parse_persistent_identifier(value)
+        except Exception:  # TODO catch a more specific parsing exception;
+            # requires https://forge.softwareheritage.org/T1104 first.
+            self.fail('%s is not a valid PID' % value, param, ctx)
+        return value
+
+
+def pid_of_file(path):  # PID of the content object loaded from file `path`
+    data = Content.from_file(path=path).get_data()
+    return pids.persistent_identifier(pids.CONTENT, data)
+
+
+def pid_of_dir(path):  # PID of the directory object loaded from dir `path`
+    data = Directory.from_disk(path=path).get_data()
+    return pids.persistent_identifier(pids.DIRECTORY, data)
+
+
+@click.command()
+@click.option('--type', '-t', default='auto',
+              type=click.Choice(['auto', 'content', 'directory']),
+              help='type of object to identify (default: auto)')
+@click.option('--verify', '-v', metavar='PID', type=PidParamType(),
+              help='reference identifier to be compared with computed one')
+@click.argument('object',
+                type=click.Path(exists=True, readable=True,
+                                allow_dash=True, path_type=bytes))
+def identify(type, verify, object):
+    """Compute the Software Heritage persistent identifier (PID) for a given
+    source code object.
+
+    For more details about Software Heritage PIDs see:
+
+    \b
+    https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
+
+    \b
+    Examples:
+
+    \b
+      $ swh-identify /usr/src/linux/kernel/
+      swh:1:dir:f9f858a48d663b3809c9e2f336412717496202ab
+
+    \b
+      $ swh-identify /usr/src/linux/kernel/sched/deadline.c
+      swh:1:cnt:57b939c81bce5d06fa587df8915f05affbe22b82
+
+    """
+    if type == 'auto':
+        if os.path.isfile(object):
+            type = 'content'
+        elif os.path.isdir(object):
+            type = 'directory'
+        else:  # reachable: '-' (allow_dash) skips the existence check
+            # object is bytes (path_type=bytes); format it for clean display
+            raise click.BadParameter('%s is neither a file nor a directory' %
+                                     click.format_filename(object))
+
+    if type == 'content':
+        pid = pid_of_file(object)
+    elif type == 'directory':
+        pid = pid_of_dir(object)
+    else:  # shouldn't happen, due to option validation
+        raise click.BadParameter('invalid object type: ' + type)
+
+    # with --verify the exit status carries the result: 0 match, 1 mismatch
+    if not verify:
+        click.echo(pid)
+    elif verify == pid:
+        click.echo('PID match: %s' % pid)
+        sys.exit(0)
+    else:
+        click.echo('PID mismatch: %s != %s' % (verify, pid))
+        sys.exit(1)
+
+
+if __name__ == '__main__':  # also runnable directly, not only via entry point
+    identify()
diff --git a/swh/model/tests/test_cli.py b/swh/model/tests/test_cli.py
new file mode 100644
index 00000000..5612c9c3
--- /dev/null
+++ b/swh/model/tests/test_cli.py
@@ -0,0 +1,71 @@
+# Copyright (C) 2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import tempfile
+import unittest
+
+from click.testing import CliRunner
+
+from swh.model import cli
+from swh.model.tests.test_from_disk import DataMixin
+from swh.model.hashutil import hash_to_hex
+
+
+class TestIdentify(DataMixin, unittest.TestCase):
+    """Run the swh-identify command through click's CliRunner."""
+    def setUp(self):
+        super().setUp()
+        self.runner = CliRunner()
+
+    def test_content_id(self):
+        """Each fixture file yields its swh:1:cnt: PID with --type content."""
+        self.make_contents(self.tmpdir_name)
+        for name, meta in self.contents.items():
+            target = os.path.join(self.tmpdir_name, name)
+            argv = ['--type', 'content', target]
+            outcome = self.runner.invoke(cli.identify, argv)
+            self.assertEqual(outcome.exit_code, 0)
+            self.assertEqual(outcome.output.rstrip(),
+                             'swh:1:cnt:' + hash_to_hex(meta['sha1_git']))
+
+    def test_directory_id(self):
+        """The sample-folder fixture yields its expected swh:1:dir: PID."""
+        self.make_from_tarball(self.tmpdir_name)
+        target = os.path.join(self.tmpdir_name, b'sample-folder')
+        outcome = self.runner.invoke(cli.identify,
+                                     ['--type', 'directory', target])
+        self.assertEqual(outcome.exit_code, 0)
+        self.assertEqual(outcome.output.rstrip(),
+                         'swh:1:dir:e8b0f1466af8608c8a3fb9879db172b887e80759')
+
+    def test_auto_id(self):
+        """Without --type, files map to cnt PIDs and directories to dir."""
+        with tempfile.NamedTemporaryFile(prefix='swh.model.cli') as tmp:
+            outcome = self.runner.invoke(cli.identify, [tmp.name])
+            self.assertEqual(outcome.exit_code, 0)
+            self.assertRegex(outcome.output, r'^swh:\d+:cnt:')
+        with tempfile.TemporaryDirectory(prefix='swh.model.cli') as tmpdir:
+            outcome = self.runner.invoke(cli.identify, [tmpdir])
+            self.assertEqual(outcome.exit_code, 0)
+            self.assertRegex(outcome.output, r'^swh:\d+:dir:')
+
+    def test_verify_content(self):
+        """--verify exits 0 on a matching PID and 1 once the file changed."""
+        self.make_contents(self.tmpdir_name)
+        for name, meta in self.contents.items():
+            target = os.path.join(self.tmpdir_name, name)
+            argv = ['--verify', 'swh:1:cnt:' + hash_to_hex(meta['sha1_git']),
+                    target]
+
+            # untouched file: computed PID equals the reference one
+            outcome = self.runner.invoke(cli.identify, argv)
+            self.assertEqual(outcome.exit_code, 0)
+
+            # appended bytes change the content hash, so verification fails
+            with open(target, 'a') as fobj:
+                fobj.write('trailing garbage to make verification fail')
+            outcome = self.runner.invoke(cli.identify, argv)
+            self.assertEqual(outcome.exit_code, 1)
-- 
GitLab