diff --git a/mypy.ini b/mypy.ini
index d63e78953bd4973585a369650d034112c3e17408..28c26fbae6feb0024ef68459574c13312e3657de 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -11,6 +11,9 @@ ignore_missing_imports = True
 [mypy-confluent_kafka.*]
 ignore_missing_imports = True
 
+[mypy-iso8601.*]
+ignore_missing_imports = True
+
 [mypy-magic.*]
 ignore_missing_imports = True
 
diff --git a/requirements.txt b/requirements.txt
index 4dd61a2c280cd34b5ddefd2ae1204e7af8b9aa87..1cfc8ea75d4c3474fa3886ade2458c59753b7fc4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,6 +4,7 @@ click
 # the version 2.1.2 is causing segmentation faults
 # cf https://forge.softwareheritage.org/T3815
 frozendict != 2.1.2
+iso8601
 pyld
 rdflib
 sentry-sdk
diff --git a/swh/indexer/metadata_dictionary/codemeta.py b/swh/indexer/metadata_dictionary/codemeta.py
index f5c611aa5e99e836d3d085b82c23b49eab340c2c..7472123e0f45dde0e04112ce22d1aab7529a9d44 100644
--- a/swh/indexer/metadata_dictionary/codemeta.py
+++ b/swh/indexer/metadata_dictionary/codemeta.py
@@ -9,6 +9,7 @@ import re
 from typing import Any, Dict, List, Optional, Tuple, Union
 import xml.etree.ElementTree as ET
 
+import iso8601
 import xmltodict
 
 from swh.indexer.codemeta import CODEMETA_CONTEXT_URL, CODEMETA_TERMS, compact, expand
@@ -19,6 +20,7 @@ ATOM_URI = "http://www.w3.org/2005/Atom"
 
 _TAG_RE = re.compile(r"\{(?P<namespace>.*?)\}(?P<localname>.*)")
 _IGNORED_NAMESPACES = ("http://www.w3.org/2005/Atom",)
+_DATE_RE = re.compile("^[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}$")
 
 
 class CodemetaMapping(SingleFileIntrinsicMapping):
@@ -84,6 +86,20 @@ class SwordCodemetaMapping(BaseExtrinsicMapping):
                 # expansion will convert it to a full URI based on
                 # "@context": CODEMETA_CONTEXT_URL
                 jsonld_child = self.xml_to_jsonld(child)
+                if (
+                    localname
+                    in (
+                        "dateCreated",
+                        "dateModified",
+                        "datePublished",
+                    )
+                    and isinstance(jsonld_child, str)
+                    and _DATE_RE.match(jsonld_child)
+                ):
+                    # Dates missing a leading zero for their day/month, used
+                    # to be allowed by the deposit; so we need to reformat them
+                    # to be valid ISO8601.
+                    jsonld_child = iso8601.parse_date(jsonld_child).date().isoformat()
                 doc[localname].append(jsonld_child)
             else:
                 # Otherwise, we already know the URI
diff --git a/swh/indexer/tests/metadata_dictionary/test_codemeta.py b/swh/indexer/tests/metadata_dictionary/test_codemeta.py
index 8c354d68bc58a915af0e74814c8d5d8357b31106..bc08b251ef534a72850e783db1273c40e9e321fb 100644
--- a/swh/indexer/tests/metadata_dictionary/test_codemeta.py
+++ b/swh/indexer/tests/metadata_dictionary/test_codemeta.py
@@ -384,6 +384,28 @@ def test_sword_propertyvalue():
     }
 
 
+def test_sword_fix_date():
+    content = """<?xml version="1.0"?>
+    <entry xmlns="http://www.w3.org/2005/Atom"
+           xmlns:codemeta="https://doi.org/10.5063/schema/codemeta-2.0"
+           xmlns:schema="http://schema.org/">
+      <name>Name</name>
+      <codemeta:dateModified>2020-12-1</codemeta:dateModified>
+      <codemeta:dateCreated>2020-12-2</codemeta:dateCreated>
+      <codemeta:datePublished>2020-12-3</codemeta:datePublished>
+    </entry>
+    """
+
+    result = MAPPINGS["SwordCodemetaMapping"]().translate(content)
+    assert result == {
+        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+        "name": "Name",
+        "dateModified": "2020-12-01",
+        "dateCreated": "2020-12-02",
+        "datePublished": "2020-12-03",
+    }
+
+
 def test_json_sword():
     content = """{"id": "hal-01243573", "@xmlns": "http://www.w3.org/2005/Atom", "author": {"name": "Author 1", "email": "foo@example.org"}, "client": "hal", "codemeta:url": "http://example.org/", "codemeta:name": "The assignment problem", "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", "codemeta:author": {"codemeta:name": "Author 2"}, "codemeta:license": {"codemeta:name": "GNU General Public License v3.0 or later"}}"""  # noqa
     result = MAPPINGS["JsonSwordCodemetaMapping"]().translate(content)