From 3bad41489c4b5412fbf250d7dd53c3b188956f65 Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <vlorentz@softwareheritage.org>
Date: Wed, 26 Oct 2022 14:19:26 +0200
Subject: [PATCH] codemeta: Fix malformed dates that used to be allowed by the
 deposit

---
 mypy.ini                                      |  3 +++
 requirements.txt                              |  1 +
 swh/indexer/metadata_dictionary/codemeta.py   | 16 ++++++++++++++
 .../metadata_dictionary/test_codemeta.py      | 22 +++++++++++++++++++
 4 files changed, 42 insertions(+)

diff --git a/mypy.ini b/mypy.ini
index d63e7895..28c26fba 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -11,6 +11,9 @@ ignore_missing_imports = True
 [mypy-confluent_kafka.*]
 ignore_missing_imports = True
 
+[mypy-iso8601.*]
+ignore_missing_imports = True
+
 [mypy-magic.*]
 ignore_missing_imports = True
 
diff --git a/requirements.txt b/requirements.txt
index 4dd61a2c..1cfc8ea7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,6 +4,7 @@ click
 # the version 2.1.2 is causing segmentation faults
 # cf https://forge.softwareheritage.org/T3815
 frozendict != 2.1.2
+iso8601
 pyld
 rdflib
 sentry-sdk
diff --git a/swh/indexer/metadata_dictionary/codemeta.py b/swh/indexer/metadata_dictionary/codemeta.py
index f5c611aa..7472123e 100644
--- a/swh/indexer/metadata_dictionary/codemeta.py
+++ b/swh/indexer/metadata_dictionary/codemeta.py
@@ -9,6 +9,7 @@ import re
 from typing import Any, Dict, List, Optional, Tuple, Union
 import xml.etree.ElementTree as ET
 
+import iso8601
 import xmltodict
 
 from swh.indexer.codemeta import CODEMETA_CONTEXT_URL, CODEMETA_TERMS, compact, expand
@@ -19,6 +20,7 @@ ATOM_URI = "http://www.w3.org/2005/Atom"
 
 _TAG_RE = re.compile(r"\{(?P<namespace>.*?)\}(?P<localname>.*)")
 _IGNORED_NAMESPACES = ("http://www.w3.org/2005/Atom",)
+_DATE_RE = re.compile(r"^[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}$")  # YYYY-M(M)-D(D)
 
 
 class CodemetaMapping(SingleFileIntrinsicMapping):
@@ -84,6 +86,20 @@ class SwordCodemetaMapping(BaseExtrinsicMapping):
                 # expansion will convert it to a full URI based on
                 # "@context": CODEMETA_CONTEXT_URL
                 jsonld_child = self.xml_to_jsonld(child)
+                if (
+                    localname
+                    in (
+                        "dateCreated",
+                        "dateModified",
+                        "datePublished",
+                    )
+                    and isinstance(jsonld_child, str)
+                    and _DATE_RE.match(jsonld_child)
+                ):
+                    # Dates missing a leading zero in their day/month used to
+                    # be allowed by the deposit, so we need to reformat them
+                    # to be valid ISO 8601.
+                    jsonld_child = iso8601.parse_date(jsonld_child).date().isoformat()
                 doc[localname].append(jsonld_child)
             else:
                 # Otherwise, we already know the URI
diff --git a/swh/indexer/tests/metadata_dictionary/test_codemeta.py b/swh/indexer/tests/metadata_dictionary/test_codemeta.py
index 8c354d68..bc08b251 100644
--- a/swh/indexer/tests/metadata_dictionary/test_codemeta.py
+++ b/swh/indexer/tests/metadata_dictionary/test_codemeta.py
@@ -384,6 +384,28 @@ def test_sword_propertyvalue():
     }
 
 
+def test_sword_fix_date():
+    content = """<?xml version="1.0"?>
+    <entry xmlns="http://www.w3.org/2005/Atom"
+           xmlns:codemeta="https://doi.org/10.5063/schema/codemeta-2.0"
+           xmlns:schema="http://schema.org/">
+      <name>Name</name>
+      <codemeta:dateModified>2020-12-1</codemeta:dateModified>
+      <codemeta:dateCreated>2020-12-2</codemeta:dateCreated>
+      <codemeta:datePublished>2020-12-3</codemeta:datePublished>
+    </entry>
+    """
+    # Single-digit days in the input must come back zero-padded (YYYY-MM-DD).
+    result = MAPPINGS["SwordCodemetaMapping"]().translate(content)
+    assert result == {
+        "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+        "name": "Name",
+        "dateModified": "2020-12-01",
+        "dateCreated": "2020-12-02",
+        "datePublished": "2020-12-03",
+    }
+
+
 def test_json_sword():
     content = """{"id": "hal-01243573", "@xmlns": "http://www.w3.org/2005/Atom", "author": {"name": "Author 1", "email": "foo@example.org"}, "client": "hal", "codemeta:url": "http://example.org/", "codemeta:name": "The assignment problem", "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", "codemeta:author": {"codemeta:name": "Author 2"}, "codemeta:license": {"codemeta:name": "GNU General Public License v3.0 or later"}}"""  # noqa
     result = MAPPINGS["JsonSwordCodemetaMapping"]().translate(content)
-- 
GitLab