From 6d1089cb25b62e521ee68af7369d908c89e70f4c Mon Sep 17 00:00:00 2001 From: Valentin Lorentz <vlorentz@softwareheritage.org> Date: Tue, 5 Apr 2022 15:40:52 +0200 Subject: [PATCH] Add support for HAL-ID as identifier --- swh/deposit/api/checks.py | 38 ++++++++++++++++++---------- swh/deposit/tests/api/test_checks.py | 32 +++++++++++++++++++++++ swh/deposit/xsd/codemeta.xsd | 14 +++++++++- 3 files changed, 70 insertions(+), 14 deletions(-) diff --git a/swh/deposit/api/checks.py b/swh/deposit/api/checks.py index 3fb990c4..1d82a28e 100644 --- a/swh/deposit/api/checks.py +++ b/swh/deposit/api/checks.py @@ -16,6 +16,7 @@ Suggested fields: import dataclasses import functools +import re from typing import Dict, Iterator, Optional, Tuple, cast import urllib from xml.etree import ElementTree @@ -55,22 +56,33 @@ def extra_validator( # </xsd:simpleType> # However, this would give an unreadable error, so we implement it here # in Python instead. - try: - url = urllib.parse.urlparse(element.text) - except ValueError: + yield from absolute_uri_validator(element, xsd_element) + elif type_name == "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}identifierType": + # Made-up type, that allows both absolute URIs and whole-string HAL-IDs + if not re.fullmatch("hal-[0-9]+", element.text or ""): + yield from absolute_uri_validator(element, xsd_element) + + +def absolute_uri_validator( + element: ElementTree.Element, + xsd_element: xmlschema.validators.elements.Xsd11Element, +) -> Iterator[xmlschema.XMLSchemaValidationError]: + try: + url = urllib.parse.urlparse(element.text) + except ValueError: + yield xmlschema.XMLSchemaValidationError( + xsd_element, element, f"{element.text!r} is not a valid URI", + ) + else: + if not url.scheme or not url.netloc: + yield xmlschema.XMLSchemaValidationError( + xsd_element, element, f"{element.text!r} is not an absolute URI", + ) + elif " " in url.netloc: + # urllib is a little too permissive...
yield xmlschema.XMLSchemaValidationError( xsd_element, element, f"{element.text!r} is not a valid URI", ) - else: - if not url.scheme or not url.netloc: - yield xmlschema.XMLSchemaValidationError( - xsd_element, element, f"{element.text!r} is not an absolute URI", - ) - elif " " in url.netloc: - # urllib is a little too permissive... - yield xmlschema.XMLSchemaValidationError( - xsd_element, element, f"{element.text!r} is not a valid URI", - ) @dataclasses.dataclass diff --git a/swh/deposit/tests/api/test_checks.py b/swh/deposit/tests/api/test_checks.py index df6d8961..e3e55704 100644 --- a/swh/deposit/tests/api/test_checks.py +++ b/swh/deposit/tests/api/test_checks.py @@ -146,6 +146,38 @@ _parameters1 = [ </entry> """, ), + ( + "identifier-is-halid", + f"""\ + <entry {XMLNS}> + <url>some url</url> + <codemeta:name>bar</codemeta:name> + <codemeta:author> + <codemeta:name>The Author</codemeta:name> + </codemeta:author> + <codemeta:identifier>hal-12345</codemeta:identifier> + {PROVENANCE_XML} + </entry> + """, + ), + ( + "identifier-is-propertyvalue", + f"""\ + <entry {XMLNS}> + <url>some url</url> + <codemeta:name>bar</codemeta:name> + <codemeta:author> + <codemeta:name>The Author</codemeta:name> + </codemeta:author> + <schema:identifier> + <codemeta:type>schema:PropertyValue</codemeta:type> + <schema:propertyID>HAL-ID</schema:propertyID> + <schema:value>hal-02527911</schema:value> + </schema:identifier> + {PROVENANCE_XML} + </entry> + """, + ), ( "codemeta-dates", f"""\ diff --git a/swh/deposit/xsd/codemeta.xsd b/swh/deposit/xsd/codemeta.xsd index 64e5187c..474e4c9a 100644 --- a/swh/deposit/xsd/codemeta.xsd +++ b/swh/deposit/xsd/codemeta.xsd @@ -52,11 +52,23 @@ Therefore, more custom checks are implemented in swh/deposit/api/checks.py in order to allow either. 
--> + + <xsd:simpleType name="halId"> + <xsd:restriction base="xsd:string"> + <xsd:pattern value="hal-[0-9]+"/> + </xsd:restriction> + </xsd:simpleType> + <xsd:simpleType name="identifierType"> + <!-- CodeMeta only allows URIs, but we make an exception for HAL-IDs, + in order not to break backward-compatibility. --> + <xsd:union memberTypes="xsd:anyURI codemeta:halId"/> + </xsd:simpleType> + <xsd:element name="identifier" type="codemeta:identifierType" /> + <xsd:element name="name" type="xsd:string" /> <xsd:element name="givenName" type="xsd:string" /> <xsd:element name="familyName" type="xsd:string" /> <xsd:element name="email" type="xsd:string" /> - <xsd:element name="identifier" type="xsd:anyURI" /> <xsd:element name="applicationCategory" type="xsd:string" /> <xsd:element name="applicationSubCategory" type="xsd:string" /> -- GitLab