From 574685052348bfc6ed28570f06b9cc4302dfde27 Mon Sep 17 00:00:00 2001
From: Stefano Zacchiroli <zack@upsilon.cc>
Date: Sat, 19 Dec 2020 11:54:59 +0100
Subject: [PATCH] SWHID parsing: simplify and deduplicate validation logic

Before this change there was a lot of overlap between parse_swhid() and the
attrs-based validators in the SWHID class. Also, the validation implementation
in parse_swhid() was done by hand.

With this change the coarse-grained validation done by parse_swhid() is now
delegated to a regex. The semantic validation of SWHIDs is left to attrs
validators. The regex is also exposed as a module attribute, to be used by
client code that want to syntactically validate SWHIDs without necessarily
instantiate SWHID classes (we have several other modules doing that already,
and they are using slightly different hand-made regexs, which isn't great).

As part of this change we also clean up the use of ValidationError exceptions,
systematically passing the problematic parts of SWHID as arguments, and uniform
error messages.

This change also brings some speed up in SWHID parsing. On a benchmark parsing
~30 M valid SWHIDs, the previous implementation took ~3:06 minutes, the new one
~2:50 minutes, or a ~9% speedup.

Closes T2788
---
 swh/model/identifiers.py            | 137 ++++++++++++++--------------
 swh/model/tests/test_identifiers.py |  40 +++-----
 swh/model/tests/test_model.py       |  15 +--
 3 files changed, 88 insertions(+), 104 deletions(-)

diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py
index 274cb356..98843a5a 100644
--- a/swh/model/identifiers.py
+++ b/swh/model/identifiers.py
@@ -29,6 +29,16 @@ SWHID_VERSION = 1
 SWHID_TYPES = ["ori", "snp", "rel", "rev", "dir", "cnt"]
 SWHID_SEP = ":"
 SWHID_CTXT_SEP = ";"
+SWHID_QUALIFIERS = {"origin", "anchor", "visit", "path", "lines"}
+
+SWHID_RE_RAW = (
+    f"(?P<scheme>{SWHID_NAMESPACE})"
+    f"{SWHID_SEP}(?P<version>{SWHID_VERSION})"
+    f"{SWHID_SEP}(?P<object_type>{'|'.join(SWHID_TYPES)})"
+    f"{SWHID_SEP}(?P<object_id>[0-9a-f]{{40}})"
+    f"({SWHID_CTXT_SEP}(?P<qualifiers>\\S+))?"
+)
+SWHID_RE = re.compile(SWHID_RE_RAW)
 
 
 @lru_cache()
@@ -677,6 +687,15 @@ _object_type_map = {
     CONTENT: {"short_name": "cnt", "key_id": "sha1_git"},
 }
 
+_swhid_type_map = {
+    "ori": ORIGIN,
+    "snp": SNAPSHOT,
+    "rel": RELEASE,
+    "rev": REVISION,
+    "dir": DIRECTORY,
+    "cnt": CONTENT,
+}
+
 
 @attr.s(frozen=True)
 class SWHID:
@@ -717,8 +736,8 @@ class SWHID:
         # 'swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0'
     """
 
-    namespace = attr.ib(type=str, default="swh")
-    scheme_version = attr.ib(type=int, default=1)
+    namespace = attr.ib(type=str, default=SWHID_NAMESPACE)
+    scheme_version = attr.ib(type=int, default=SWHID_VERSION)
     object_type = attr.ib(type=str, default="")
     object_id = attr.ib(type=str, converter=hash_to_hex, default="")  # type: ignore
     metadata = attr.ib(
@@ -729,28 +748,43 @@ class SWHID:
     def check_namespace(self, attribute, value):
         if value != SWHID_NAMESPACE:
             raise ValidationError(
-                f"Invalid SWHID: namespace is '{value}' but must be '{SWHID_NAMESPACE}'"
+                "Invalid SWHID: invalid namespace: %(namespace)s",
+                params={"namespace": value},
             )
 
     @scheme_version.validator
     def check_scheme_version(self, attribute, value):
         if value != SWHID_VERSION:
             raise ValidationError(
-                f"Invalid SWHID: version is {value} but must be {SWHID_VERSION}"
+                "Invalid SWHID: invalid version: %(version)s", params={"version": value}
             )
 
     @object_type.validator
     def check_object_type(self, attribute, value):
         if value not in _object_type_map:
-            supported_types = ", ".join(_object_type_map.keys())
             raise ValidationError(
-                f"Invalid SWHID: object type is {value} but must be "
-                f"one of {supported_types}"
+                "Invalid SWHID: invalid type: %(object_type)s)",
+                params={"object_type": value},
             )
 
     @object_id.validator
     def check_object_id(self, attribute, value):
-        validate_sha1(value)  # can raise if invalid hash
+        try:
+            validate_sha1(value)  # can raise if invalid hash
+        except ValidationError:
+            raise ValidationError(
+                "Invalid SWHID: invalid checksum: %(object_id)s",
+                params={"object_id": value},
+            ) from None
+
+    @metadata.validator
+    def check_qualifiers(self, attribute, value):
+        for k in value:
+            if k not in SWHID_QUALIFIERS:
+                raise ValidationError(
+                    "Invalid SWHID: unknown qualifier: %(qualifier)s",
+                    params={"qualifier": k},
+                )
 
     def to_dict(self) -> Dict[str, Any]:
         return attr.asdict(self)
@@ -801,77 +835,44 @@ def swhid(
     return str(swhid)
 
 
-CONTEXT_QUALIFIERS = {"origin", "anchor", "visit", "path", "lines"}
-
-
 def parse_swhid(swhid: str) -> SWHID:
-    """Parse :ref:`persistent-identifiers`.
+    """Parse a Software Heritage identifier (SWHID) from string (see:
+    :ref:`persistent-identifiers`.)
 
     Args:
         swhid (str): A persistent identifier
 
-    Raises:
-        swh.model.exceptions.ValidationError: in case of:
-
-            * missing mandatory values (4)
-            * invalid namespace supplied
-            * invalid version supplied
-            * invalid type supplied
-            * missing hash
-            * invalid hash identifier supplied
-
     Returns:
         a named tuple holding the parsing result
 
-    """
-    if re.search(r"[ \t\n\r\f\v]", swhid):
-        raise ValidationError("Invalid SwHID: SWHIDs cannot contain whitespaces")
-
-    # <swhid>;<contextual-information>
-    swhid_parts = swhid.split(SWHID_CTXT_SEP)
-    swhid_data = swhid_parts.pop(0).split(":")
-
-    if len(swhid_data) != 4:
-        raise ValidationError(
-            "Invalid SWHID, format must be 'swh:1:OBJECT_TYPE:OBJECT_ID'"
-        )
-
-    # Checking for parsing errors
-    _ns, _version, _type, _id = swhid_data
-
-    for otype, data in _object_type_map.items():
-        if _type == data["short_name"]:
-            _type = otype
-            break
+    Raises:
+        swh.model.exceptions.ValidationError: if passed string is not a valid SWHID
 
-    if not _id:
+    """
+    m = SWHID_RE.fullmatch(swhid)
+    if not m:
         raise ValidationError(
-            "Invalid SWHID: missing OBJECT_ID (as a 40 hex digit string)"
-        )
-
-    _metadata = {}
-    for part in swhid_parts:
-        try:
-            qualifier, val = part.split("=")
-            _metadata[qualifier] = val
-        except Exception:
-            raise ValidationError(
-                "Invalid SWHID: contextual data must be a ;-separated list of "
-                "key=value pairs"
-            )
-
-    wrong_qualifiers = set(_metadata) - set(CONTEXT_QUALIFIERS)
-    if wrong_qualifiers:
-        error_msg = (
-            f"Invalid SWHID: Wrong qualifiers {', '.join(wrong_qualifiers)}. "
-            f"The qualifiers must be one of {', '.join(CONTEXT_QUALIFIERS)}"
+            "Invalid SWHID: invalid syntax: %(swhid)s", params={"swhid": swhid}
         )
-        raise ValidationError(error_msg)
+    parts = m.groupdict()
+
+    _qualifiers = {}
+    qualifiers_raw = parts["qualifiers"]
+    if qualifiers_raw:
+        for qualifier in qualifiers_raw.split(SWHID_CTXT_SEP):
+            try:
+                k, v = qualifier.split("=")
+            except ValueError:
+                raise ValidationError(
+                    "Invalid SWHID: invalid qualifier: %(qualifier)s",
+                    params={"qualifier": qualifier},
+                )
+            _qualifiers[k] = v
 
     return SWHID(
-        _ns,
-        int(_version),
-        _type,
-        _id,
-        _metadata,  # type: ignore  # mypy can't properly unify types
+        parts["scheme"],
+        int(parts["version"]),
+        _swhid_type_map[parts["object_type"]],
+        parts["object_id"],
+        _qualifiers,  # type: ignore  # mypy can't properly unify types
     )
diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py
index 73515c65..6041bdab 100644
--- a/swh/model/tests/test_identifiers.py
+++ b/swh/model/tests/test_identifiers.py
@@ -218,6 +218,8 @@ directory_example = {
     ],
 }
 
+dummy_qualifiers = {"origin": "https://example.com", "lines": "42"}
+
 
 class DirectoryIdentifier(unittest.TestCase):
     def setUp(self):
@@ -843,7 +845,7 @@ class SnapshotIdentifier(unittest.TestCase):
         for _type, _hash in [
             (SNAPSHOT, _snapshot_id),
             (SNAPSHOT, _snapshot),
-            ("foo", ""),
+            ("lines", "42"),
         ]:
             with self.assertRaises(ValidationError):
                 identifiers.swhid(_type, _hash)
@@ -1117,17 +1119,9 @@ def test_swhid_hash():
     )
 
     assert hash(
-        SWHID(
-            object_type="directory",
-            object_id=object_id,
-            metadata={"foo": "bar", "baz": "qux"},
-        )
+        SWHID(object_type="directory", object_id=object_id, metadata=dummy_qualifiers,)
     ) == hash(
-        SWHID(
-            object_type="directory",
-            object_id=object_id,
-            metadata={"foo": "bar", "baz": "qux"},
-        )
+        SWHID(object_type="directory", object_id=object_id, metadata=dummy_qualifiers,)
     )
 
     # Different order of the dictionary, so the underlying order of the tuple in
@@ -1136,13 +1130,13 @@ def test_swhid_hash():
         SWHID(
             object_type="directory",
             object_id=object_id,
-            metadata={"foo": "bar", "baz": "qux"},
+            metadata={"origin": "https://example.com", "lines": "42"},
         )
     ) == hash(
         SWHID(
             object_type="directory",
             object_id=object_id,
-            metadata={"baz": "qux", "foo": "bar"},
+            metadata={"lines": "42", "origin": "https://example.com"},
         )
     )
 
@@ -1155,21 +1149,9 @@ def test_swhid_eq():
     )
 
     assert SWHID(
-        object_type="directory",
-        object_id=object_id,
-        metadata={"foo": "bar", "baz": "qux"},
-    ) == SWHID(
-        object_type="directory",
-        object_id=object_id,
-        metadata={"foo": "bar", "baz": "qux"},
-    )
+        object_type="directory", object_id=object_id, metadata=dummy_qualifiers,
+    ) == SWHID(object_type="directory", object_id=object_id, metadata=dummy_qualifiers,)
 
     assert SWHID(
-        object_type="directory",
-        object_id=object_id,
-        metadata={"foo": "bar", "baz": "qux"},
-    ) == SWHID(
-        object_type="directory",
-        object_id=object_id,
-        metadata={"baz": "qux", "foo": "bar"},
-    )
+        object_type="directory", object_id=object_id, metadata=dummy_qualifiers,
+    ) == SWHID(object_type="directory", object_id=object_id, metadata=dummy_qualifiers,)
diff --git a/swh/model/tests/test_model.py b/swh/model/tests/test_model.py
index 902f0df6..393dcfd8 100644
--- a/swh/model/tests/test_model.py
+++ b/swh/model/tests/test_model.py
@@ -782,12 +782,13 @@ _metadata_authority = MetadataAuthority(
 _metadata_fetcher = MetadataFetcher(name="test-fetcher", version="0.0.1",)
 _content_swhid = parse_swhid("swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2")
 _origin_url = "https://forge.softwareheritage.org/source/swh-model.git"
+_dummy_qualifiers = {"origin": "https://example.com", "lines": "42"}
 _common_metadata_fields = dict(
     discovery_date=datetime.datetime.now(tz=datetime.timezone.utc),
     authority=_metadata_authority,
     fetcher=_metadata_fetcher,
     format="json",
-    metadata=b'{"foo": "bar"}',
+    metadata=b'{"origin": "https://example.com", "lines": "42"}',
 )
 
 
@@ -815,7 +816,7 @@ def test_metadata_to_dict():
         "fetcher": {"name": "test-fetcher", "version": "0.0.1",},
         "discovery_date": _common_metadata_fields["discovery_date"],
         "format": "json",
-        "metadata": b'{"foo": "bar"}',
+        "metadata": b'{"origin": "https://example.com", "lines": "42"}',
     }
 
     m = RawExtrinsicMetadata(
@@ -893,7 +894,7 @@ def test_metadata_invalid_target():
             target=SWHID(
                 object_type="content",
                 object_id="94a9ed024d3859793618152ea559a168bbcbb5e2",
-                metadata={"foo": "bar"},
+                metadata=_dummy_qualifiers,
             ),
             **_common_metadata_fields,
         )
@@ -1018,7 +1019,7 @@ def test_metadata_validate_context_snapshot():
             snapshot=SWHID(
                 object_type="snapshot",
                 object_id="94a9ed024d3859793618152ea559a168bbcbb5e2",
-                metadata={"foo": "bar"},
+                metadata=_dummy_qualifiers,
             ),
             **_common_metadata_fields,
         )
@@ -1073,7 +1074,7 @@ def test_metadata_validate_context_release():
             release=SWHID(
                 object_type="release",
                 object_id="94a9ed024d3859793618152ea559a168bbcbb5e2",
-                metadata={"foo": "bar"},
+                metadata=_dummy_qualifiers,
             ),
             **_common_metadata_fields,
         )
@@ -1128,7 +1129,7 @@ def test_metadata_validate_context_revision():
             revision=SWHID(
                 object_type="revision",
                 object_id="94a9ed024d3859793618152ea559a168bbcbb5e2",
-                metadata={"foo": "bar"},
+                metadata=_dummy_qualifiers,
             ),
             **_common_metadata_fields,
         )
@@ -1205,7 +1206,7 @@ def test_metadata_validate_context_directory():
             directory=SWHID(
                 object_type="directory",
                 object_id="94a9ed024d3859793618152ea559a168bbcbb5e2",
-                metadata={"foo": "bar"},
+                metadata=_dummy_qualifiers,
             ),
             **_common_metadata_fields,
         )
-- 
GitLab