From 3bad41489c4b5412fbf250d7dd53c3b188956f65 Mon Sep 17 00:00:00 2001 From: Valentin Lorentz <vlorentz@softwareheritage.org> Date: Wed, 26 Oct 2022 14:19:26 +0200 Subject: [PATCH] codemeta: Fix malformed dates that used to be allowed by the deposit --- mypy.ini | 3 +++ requirements.txt | 1 + swh/indexer/metadata_dictionary/codemeta.py | 16 ++++++++++++++ .../metadata_dictionary/test_codemeta.py | 22 +++++++++++++++++++ 4 files changed, 42 insertions(+) diff --git a/mypy.ini b/mypy.ini index d63e7895..28c26fba 100644 --- a/mypy.ini +++ b/mypy.ini @@ -11,6 +11,9 @@ ignore_missing_imports = True [mypy-confluent_kafka.*] ignore_missing_imports = True +[mypy-iso8601.*] +ignore_missing_imports = True + [mypy-magic.*] ignore_missing_imports = True diff --git a/requirements.txt b/requirements.txt index 4dd61a2c..1cfc8ea7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ click # the version 2.1.2 is causing segmentation faults # cf https://forge.softwareheritage.org/T3815 frozendict != 2.1.2 +iso8601 pyld rdflib sentry-sdk diff --git a/swh/indexer/metadata_dictionary/codemeta.py b/swh/indexer/metadata_dictionary/codemeta.py index f5c611aa..7472123e 100644 --- a/swh/indexer/metadata_dictionary/codemeta.py +++ b/swh/indexer/metadata_dictionary/codemeta.py @@ -9,6 +9,7 @@ import re from typing import Any, Dict, List, Optional, Tuple, Union import xml.etree.ElementTree as ET +import iso8601 import xmltodict from swh.indexer.codemeta import CODEMETA_CONTEXT_URL, CODEMETA_TERMS, compact, expand @@ -19,6 +20,7 @@ ATOM_URI = "http://www.w3.org/2005/Atom" _TAG_RE = re.compile(r"\{(?P<namespace>.*?)\}(?P<localname>.*)") _IGNORED_NAMESPACES = ("http://www.w3.org/2005/Atom",) +_DATE_RE = re.compile("^[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}$") class CodemetaMapping(SingleFileIntrinsicMapping): @@ -84,6 +86,20 @@ class SwordCodemetaMapping(BaseExtrinsicMapping): # expansion will convert it to a full URI based on # "@context": CODEMETA_CONTEXT_URL jsonld_child = 
self.xml_to_jsonld(child) + if ( + localname + in ( + "dateCreated", + "dateModified", + "datePublished", + ) + and isinstance(jsonld_child, str) + and _DATE_RE.match(jsonld_child) + ): + # Dates missing a leading zero in their day/month used + # to be allowed by the deposit, so we need to reformat them + # to be valid ISO 8601. + jsonld_child = iso8601.parse_date(jsonld_child).date().isoformat() doc[localname].append(jsonld_child) else: # Otherwise, we already know the URI diff --git a/swh/indexer/tests/metadata_dictionary/test_codemeta.py b/swh/indexer/tests/metadata_dictionary/test_codemeta.py index 8c354d68..bc08b251 100644 --- a/swh/indexer/tests/metadata_dictionary/test_codemeta.py +++ b/swh/indexer/tests/metadata_dictionary/test_codemeta.py @@ -384,6 +384,28 @@ def test_sword_propertyvalue(): } +def test_sword_fix_date(): + content = """<?xml version="1.0"?> + <entry xmlns="http://www.w3.org/2005/Atom" + xmlns:codemeta="https://doi.org/10.5063/schema/codemeta-2.0" + xmlns:schema="http://schema.org/"> + <name>Name</name> + <codemeta:dateModified>2020-12-1</codemeta:dateModified> + <codemeta:dateCreated>2020-12-2</codemeta:dateCreated> + <codemeta:datePublished>2020-12-3</codemeta:datePublished> + </entry> + """ + + result = MAPPINGS["SwordCodemetaMapping"]().translate(content) + assert result == { + "@context": "https://doi.org/10.5063/schema/codemeta-2.0", + "name": "Name", + "dateModified": "2020-12-01", + "dateCreated": "2020-12-02", + "datePublished": "2020-12-03", + } + + def test_json_sword(): content = """{"id": "hal-01243573", "@xmlns": "http://www.w3.org/2005/Atom", "author": {"name": "Author 1", "email": "foo@example.org"}, "client": "hal", "codemeta:url": "http://example.org/", "codemeta:name": "The assignment problem", "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", "codemeta:author": {"codemeta:name": "Author 2"}, "codemeta:license": {"codemeta:name": "GNU General Public License v3.0 or later"}}""" # noqa result =
MAPPINGS["JsonSwordCodemetaMapping"]().translate(content) -- GitLab