From e42deb6d0086a57e88b65e6c6d7b087d807e78bb Mon Sep 17 00:00:00 2001
From: Antoine Lambert <anlambert@softwareheritage.org>
Date: Fri, 8 Nov 2024 12:46:40 +0100
Subject: [PATCH] bibtex: Robustify code extracting year and month from date

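The previous implementation split the date string on "-" and could raise
an error when the source CodeMeta contained an invalid date (e.g. "TBD").
The date is now parsed with iso8601; when parsing fails, the year and
month fields are omitted and the original date value is kept as is.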
---
 swh/indexer/bibtex.py            | 13 +++++++++----
 swh/indexer/tests/test_bibtex.py | 23 +++++++++++++++++++++++
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/swh/indexer/bibtex.py b/swh/indexer/bibtex.py
index 446ce4d..891cd11 100644
--- a/swh/indexer/bibtex.py
+++ b/swh/indexer/bibtex.py
@@ -11,6 +11,7 @@ import sys
 from typing import Any, Dict, List, Optional
 import uuid
 
+import iso8601
 from pybtex.database import Entry, Person
 from pybtex.database.output.bibtex import Writer
 from pybtex.plugin import register_plugin
@@ -130,10 +131,14 @@ def codemeta_to_bibtex(
                 fields["date"] = date
                 break
     if "date" in fields:
-        (fields["year"], month_number, _) = fields["date"].split("-")
-        fields["month"] = (
-            f"{MACRO_PREFIX}:{calendar.month_abbr[int(month_number)].lower()}"
-        )
+        try:
+            parsed_date = iso8601.parse_date(fields["date"])
+            fields["year"] = str(parsed_date.year)
+            fields["month"] = (
+                f"{MACRO_PREFIX}:{calendar.month_abbr[parsed_date.month].lower()}"
+            )
+        except iso8601.ParseError:
+            pass
 
     # identifier, doi, hal_id
     entry_key = None
diff --git a/swh/indexer/tests/test_bibtex.py b/swh/indexer/tests/test_bibtex.py
index fa6fbf1..a820861 100644
--- a/swh/indexer/tests/test_bibtex.py
+++ b/swh/indexer/tests/test_bibtex.py
@@ -276,6 +276,29 @@ def test_affiliation():
     )
 
 
+def test_invalid_date():
+    assert codemeta_to_bibtex(
+        {
+            "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
+            "author": {"name": "Jane Doe"},
+            "name": "Example Software",
+            "url": "http://example.org/",
+            "datePublished": "TBD",
+            "license": "https://spdx.org/licenses/Apache-2.0",
+        }
+    ) == textwrap.dedent(
+        """\
+        @software{REPLACEME,
+            author = "Doe, Jane",
+            license = "Apache-2.0",
+            date = "TBD",
+            title = "Example Software",
+            url = "http://example.org/"
+        }
+        """
+    )
+
+
 def test_cff_empty():
     assert cff_to_bibtex("") == textwrap.dedent(
         """\
-- 
GitLab