From 6d1089cb25b62e521ee68af7369d908c89e70f4c Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <vlorentz@softwareheritage.org>
Date: Tue, 5 Apr 2022 15:40:52 +0200
Subject: [PATCH] Add support for HAL-ID as identifier

---
 swh/deposit/api/checks.py            | 38 ++++++++++++++++++----------
 swh/deposit/tests/api/test_checks.py | 32 +++++++++++++++++++++++
 swh/deposit/xsd/codemeta.xsd         | 14 +++++++++-
 3 files changed, 70 insertions(+), 14 deletions(-)

diff --git a/swh/deposit/api/checks.py b/swh/deposit/api/checks.py
index 3fb990c4..1d82a28e 100644
--- a/swh/deposit/api/checks.py
+++ b/swh/deposit/api/checks.py
@@ -16,6 +16,7 @@ Suggested fields:
 
 import dataclasses
 import functools
+import re
 from typing import Dict, Iterator, Optional, Tuple, cast
 import urllib
 from xml.etree import ElementTree
@@ -55,22 +56,39 @@ def extra_validator(
         #     </xsd:simpleType>
         # However, this would give an unreadable error, so we implement it here
         # in Python instead.
-        try:
-            url = urllib.parse.urlparse(element.text)
-        except ValueError:
+        yield from absolute_uri_validator(element, xsd_element)
+    elif type_name == "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}identifierType":
+        # Made-up type, that allows both absolute URIs and HAL-IDs
+        # fullmatch (not match), so the whole value must be a HAL-ID, like the
+        # implicitly anchored xsd:pattern of codemeta:halId; otherwise values
+        # such as "hal-123/foo" would skip the absolute URI check.
+        if not re.fullmatch(r"hal-[0-9]+", element.text or ""):
+            yield from absolute_uri_validator(element, xsd_element)
+
+
+def absolute_uri_validator(
+    element: ElementTree.Element,
+    xsd_element: xmlschema.validators.elements.Xsd11Element,
+) -> Iterator[xmlschema.XMLSchemaValidationError]:
+    """Yields a validation error if the text of ``element`` is not an absolute
+    URI, ie. if it cannot be parsed, lacks a scheme or a network location,
+    or has a space in its network location."""
+    try:
+        url = urllib.parse.urlparse(element.text)
+    except ValueError:
+        yield xmlschema.XMLSchemaValidationError(
+            xsd_element, element, f"{element.text!r} is not a valid URI",
+        )
+    else:
+        if not url.scheme or not url.netloc:
+            yield xmlschema.XMLSchemaValidationError(
+                xsd_element, element, f"{element.text!r} is not an absolute URI",
+            )
+        elif " " in url.netloc:
+            # urllib is a little too permissive...
             yield xmlschema.XMLSchemaValidationError(
                 xsd_element, element, f"{element.text!r} is not a valid URI",
             )
-        else:
-            if not url.scheme or not url.netloc:
-                yield xmlschema.XMLSchemaValidationError(
-                    xsd_element, element, f"{element.text!r} is not an absolute URI",
-                )
-            elif " " in url.netloc:
-                # urllib is a little too permissive...
-                yield xmlschema.XMLSchemaValidationError(
-                    xsd_element, element, f"{element.text!r} is not a valid URI",
-                )
 
 
 @dataclasses.dataclass
diff --git a/swh/deposit/tests/api/test_checks.py b/swh/deposit/tests/api/test_checks.py
index df6d8961..e3e55704 100644
--- a/swh/deposit/tests/api/test_checks.py
+++ b/swh/deposit/tests/api/test_checks.py
@@ -146,6 +146,38 @@ _parameters1 = [
             </entry>
             """,
         ),
+        (
+            "identifier-is-halid",
+            f"""\
+            <entry {XMLNS}>
+                <url>some url</url>
+                <codemeta:name>bar</codemeta:name>
+                <codemeta:author>
+                    <codemeta:name>The Author</codemeta:name>
+                </codemeta:author>
+                <codemeta:identifier>hal-12345</codemeta:identifier>
+                {PROVENANCE_XML}
+            </entry>
+            """,
+        ),
+        (
+            "identifier-is-propertyvalue",
+            f"""\
+            <entry {XMLNS}>
+                <url>some url</url>
+                <codemeta:name>bar</codemeta:name>
+                <codemeta:author>
+                    <codemeta:name>The Author</codemeta:name>
+                </codemeta:author>
+                <schema:identifier>
+                    <codemeta:type>schema:PropertyValue</codemeta:type>
+                    <schema:propertyID>HAL-ID</schema:propertyID>
+                    <schema:value>hal-02527911</schema:value>
+                </schema:identifier>
+                {PROVENANCE_XML}
+            </entry>
+            """,
+        ),
         (
             "codemeta-dates",
             f"""\
diff --git a/swh/deposit/xsd/codemeta.xsd b/swh/deposit/xsd/codemeta.xsd
index 64e5187c..474e4c9a 100644
--- a/swh/deposit/xsd/codemeta.xsd
+++ b/swh/deposit/xsd/codemeta.xsd
@@ -52,11 +52,23 @@
        Therefore, more custom checks are implemented in swh/deposit/api/checks.py
        in order to allow either. -->
 
+
+  <xsd:simpleType name="halId">
+    <xsd:restriction base="xsd:string">
+      <xsd:pattern value="hal-[0-9]+"/>
+    </xsd:restriction>
+  </xsd:simpleType>
+  <xsd:simpleType name="identifierType">
+    <!-- CodeMeta only allows URIs, but we make an exception for HAL-IDs,
+         in order not to break backward-compatibility. -->
+    <xsd:union memberTypes="xsd:anyURI codemeta:halId"/>
+  </xsd:simpleType>
+  <xsd:element name="identifier" type="codemeta:identifierType" />
+
   <xsd:element name="name" type="xsd:string" />
   <xsd:element name="givenName" type="xsd:string" />
   <xsd:element name="familyName" type="xsd:string" />
   <xsd:element name="email" type="xsd:string" />
-  <xsd:element name="identifier" type="xsd:anyURI" />
 
   <xsd:element name="applicationCategory" type="xsd:string" />
   <xsd:element name="applicationSubCategory" type="xsd:string" />
-- 
GitLab