From 8a0cc22a73f9e9d7909f0ec219b536fbe42a8c0a Mon Sep 17 00:00:00 2001
From: "Antoine R. Dumont (@ardumont)" <antoine.romain.dumont@gmail.com>
Date: Wed, 20 Jun 2018 10:18:17 +0200
Subject: [PATCH] identifiers: Make invalid persistent identifier parsing raise
 error

Related T1104
---
 swh/model/identifiers.py            | 41 ++++++++++++++++++++++++++---
 swh/model/tests/test_identifiers.py | 28 ++++++++++++++++++--
 2 files changed, 64 insertions(+), 5 deletions(-)

diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py
index eef77106..138ca33a 100644
--- a/swh/model/identifiers.py
+++ b/swh/model/identifiers.py
@@ -637,12 +637,21 @@ def persistent_identifier(type, object, version=1):
     return 'swh:%s:%s:%s' % (version, o['short_name'], _hash)
 
 
+PERSISTENT_IDENTIFIER_TYPES = ['snp', 'rel', 'rev', 'dir', 'cnt']
+
 PERSISTENT_IDENTIFIER_KEYS = [
     'namespace', 'scheme_version', 'object_type', 'object_id', 'metadata']
 
 PERSISTENT_IDENTIFIER_PARTS_SEP = ';'
 
 
+class SWHMalformedIdentifierException(ValueError):
+    """Exception when a string representing an identifier is badly formatted.
+
+    """
+    pass
+
+
 def parse_persistent_identifier(persistent_id):
     """Parse swh's :ref:`persistent-identifiers` scheme.
 
@@ -659,8 +668,34 @@ def parse_persistent_identifier(persistent_id):
             * metadata, holding dict value
 
     """
+    # <pid>;<contextual-information>
     persistent_id_parts = persistent_id.split(PERSISTENT_IDENTIFIER_PARTS_SEP)
-    data = persistent_id_parts.pop(0).split(':')
+    pid_data = persistent_id_parts.pop(0).split(':')
+
+    if len(pid_data) != 4:
+        raise SWHMalformedIdentifierException(
+            'Wrong format: There should be 4 mandatory parameters')
+
+    # Checking for parsing errors
+    _ns, _version, _type, _id = pid_data
+    if _ns != 'swh':
+        raise SWHMalformedIdentifierException(
+            'Wrong format: Supported namespace is \'swh\'')
+
+    if _version != '1':
+        raise SWHMalformedIdentifierException(
+            'Wrong format: Supported version is 1')
+
+    expected_types = PERSISTENT_IDENTIFIER_TYPES
+    if _type not in expected_types:
+        raise SWHMalformedIdentifierException(
+            'Wrong format: Supported types are %s' % (
+                ', '.join(expected_types)))
+
+    if not _id:
+        raise SWHMalformedIdentifierException(
+            'Wrong format: Identifier should be present')
+
     persistent_id_metadata = {}
     for part in persistent_id_parts:
         try:
@@ -668,5 +703,5 @@ def parse_persistent_identifier(persistent_id):
             persistent_id_metadata[key] = val
         except Exception:
             pass
-    data.append(persistent_id_metadata)
-    return dict(zip(PERSISTENT_IDENTIFIER_KEYS, data))
+    pid_data.append(persistent_id_metadata)
+    return dict(zip(PERSISTENT_IDENTIFIER_KEYS, pid_data))
diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py
index afe943b6..8282aebb 100644
--- a/swh/model/tests/test_identifiers.py
+++ b/swh/model/tests/test_identifiers.py
@@ -804,8 +804,8 @@ class SnapshotIdentifier(unittest.TestCase):
         for pid, _type, _version, _hash in [
                 ('swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2', 'cnt',
                  '1', '94a9ed024d3859793618152ea559a168bbcbb5e2'),
-                ('swh:2:dir:d198bc9d7a6bcf6db04f476d29314f157507d505', 'dir',
-                 '2', 'd198bc9d7a6bcf6db04f476d29314f157507d505'),
+                ('swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505', 'dir',
+                 '1', 'd198bc9d7a6bcf6db04f476d29314f157507d505'),
                 ('swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d', 'rev',
                  '1', '309cf2674ee7a0749978cf8265ab91a60aea0f7d'),
                 ('swh:1:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f', 'rel',
@@ -847,3 +847,27 @@ class SnapshotIdentifier(unittest.TestCase):
             }
             actual_result = identifiers.parse_persistent_identifier(pid)
             self.assertEquals(actual_result, expected_result)
+
+    def test_parse_persistent_identifier_parsing_error(self):
+        from swh.model.identifiers import (SWHMalformedIdentifierException,
+                                           PERSISTENT_IDENTIFIER_TYPES)
+        for pid, _error in [
+                ('swh:1:cnt',
+                 'Wrong format: There should be 4 mandatory parameters'),
+                ('swh:1:',
+                 'Wrong format: There should be 4 mandatory parameters'),
+                ('swh:',
+                 'Wrong format: There should be 4 mandatory parameters'),
+                ('foo:1:cnt:abc8bc9d7a6bcf6db04f476d29314f157507d505',
+                 'Wrong format: Supported namespace is \'swh\''),
+                ('swh:2:dir:def8bc9d7a6bcf6db04f476d29314f157507d505',
+                 'Wrong format: Supported version is 1'),
+                ('swh:1:foo:fed8bc9d7a6bcf6db04f476d29314f157507d505',
+                 'Wrong format: Supported types are %s' % (
+                     ', '.join(PERSISTENT_IDENTIFIER_TYPES))),
+                ('swh:1:cnt:',
+                 'Wrong format: Identifier should be present'),
+        ]:
+            with self.assertRaisesRegex(
+                    SWHMalformedIdentifierException, _error):
+                identifiers.parse_persistent_identifier(pid)
-- 
GitLab