From ad92ca25c0ecbe155ddbb7153fded36e701fbd2b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A9l=C3=A8ne=20Jonin?= <helene.jonin@gmail.com>
Date: Fri, 2 Aug 2024 14:02:02 +0000
Subject: [PATCH] Add CFF to BibTeX converter

---
 swh/indexer/bibtex.py                         |  44 ++++++--
 swh/indexer/metadata_dictionary/cff.py        |   4 +-
 .../tests/metadata_dictionary/test_cff.py     |   1 +
 swh/indexer/tests/test_bibtex.py              | 101 +++++++++++++++++-
 4 files changed, 141 insertions(+), 9 deletions(-)

diff --git a/swh/indexer/bibtex.py b/swh/indexer/bibtex.py
index dba65dcb..a232ee49 100644
--- a/swh/indexer/bibtex.py
+++ b/swh/indexer/bibtex.py
@@ -6,14 +6,16 @@
 import collections
 import json
 import sys
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 import uuid
 
 from pybtex.database import Entry, Person
 import rdflib
 
 from swh.indexer.codemeta import compact, expand
+from swh.indexer.metadata_dictionary.cff import CffMapping
 from swh.indexer.namespaces import RDF, SCHEMA, SPDX_LICENSES
+from swh.model.swhids import ObjectType, QualifiedSWHID
 
 TMP_ROOT_URI_PREFIX = "https://www.softwareheritage.org/schema/2022/indexer/tmp-node/"
 """IRI used for `skolemization <https://www.w3.org/TR/rdf11-concepts/#section-skolemization>`_;
@@ -21,7 +23,9 @@ it is not used outside :func:`codemeta_to_bibtex`.
 """
 
 
-def codemeta_to_bibtex(doc: Dict[str, Any]) -> str:
+def codemeta_to_bibtex(
+    doc: Dict[str, Any], swhid: Optional[QualifiedSWHID] = None
+) -> str:
     doc = compact(doc, False)
 
     identifiers = []
@@ -54,15 +58,23 @@ def codemeta_to_bibtex(doc: Dict[str, Any]) -> str:
     fields: Dict[str, Any] = {}
 
     def add_person(persons: List[Person], person_id: rdflib.term.Node) -> None:
+        person = Person()
         for _, _, name in g.triples((person_id, SCHEMA.name, None)):
             if (person_id, RDF.type, SCHEMA.Organization) in g:
                 # prevent interpreting the name as "Firstname Lastname" and reformatting
                 # it to "Lastname, Firstname"
-                person = Person(last=name)
+                person.last_names.append(name)
             else:
                 person = Person(name)
-            if person not in persons:
-                persons.append(person)
+
+        for _, _, given_name in g.triples((person_id, SCHEMA.givenName, None)):
+            person.first_names.append(given_name)
+
+        for _, _, family_name in g.triples((person_id, SCHEMA.familyName, None)):
+            person.last_names.append(family_name)
+
+        if str(person) and person not in persons:
+            persons.append(person)
 
     def add_affiliations(person: rdflib.term.Node) -> None:
         for _, _, organization in g.triples((person, SCHEMA.affiliation, None)):
@@ -160,7 +172,20 @@ def codemeta_to_bibtex(doc: Dict[str, Any]) -> str:
         for _, _, version in g.triples((id_, SCHEMA.version, None)):
             fields["version"] = version
 
-    entry_type = "softwareversion" if "version" in fields else "software"
+    # entry_type
+    if swhid:
+        fields["swhid"] = str(swhid)
+        if swhid.object_type == ObjectType.SNAPSHOT:
+            entry_type = "software"
+        elif swhid.object_type == ObjectType.CONTENT:
+            entry_type = "codefragment"
+        else:
+            entry_type = "softwareversion"
+    elif "version" in fields:
+        entry_type = "softwareversion"
+    else:
+        entry_type = "software"
+
     entry = Entry(
         entry_type,
         persons=persons,
@@ -171,6 +196,13 @@ def codemeta_to_bibtex(doc: Dict[str, Any]) -> str:
     return entry.to_string(bib_format="bibtex")
 
 
+def cff_to_bibtex(content: str, swhid: Optional[QualifiedSWHID] = None) -> str:
+    """Convert CITATION.cff text to BibTeX, going through CodeMeta."""
+    raw_content = content.encode("utf-8")
+    translated = CffMapping().translate(raw_content=raw_content)
+    return codemeta_to_bibtex({} if translated is None else translated, swhid)
+
+
 if __name__ == "__main__":
     for filename in sys.argv[1:]:
         if filename == "-":
diff --git a/swh/indexer/metadata_dictionary/cff.py b/swh/indexer/metadata_dictionary/cff.py
index 0d730e88..93d32b21 100644
--- a/swh/indexer/metadata_dictionary/cff.py
+++ b/swh/indexer/metadata_dictionary/cff.py
@@ -25,9 +25,9 @@ class CffMapping(YamlMapping, SingleFileIntrinsicMapping):
     name = "cff"
     filename = b"CITATION.cff"
     mapping = CROSSWALK_TABLE["Citation File Format Core (CFF-Core) 1.0.2"]
-    string_fields = ["keywords", "license", "abstract", "version", "doi"]
+    string_fields = ["title", "keywords", "license", "abstract", "version", "doi"]
     date_fields = ["date-released"]
-    uri_fields = ["repository-code"]
+    uri_fields = ["url", "repository-code"]
 
     def _translate_author(self, graph: Graph, author: dict) -> rdflib.term.Node:
         node: rdflib.term.Node
diff --git a/swh/indexer/tests/metadata_dictionary/test_cff.py b/swh/indexer/tests/metadata_dictionary/test_cff.py
index fb50ba5d..35964faf 100644
--- a/swh/indexer/tests/metadata_dictionary/test_cff.py
+++ b/swh/indexer/tests/metadata_dictionary/test_cff.py
@@ -85,6 +85,7 @@ RIS, schema.org, CodeMeta, and .zenodo.json.""",
         "identifier": "https://doi.org/10.5281/zenodo.1162057",
         "license": "https://spdx.org/licenses/Apache-2.0",
         "version": "1.4.0-alpha0",
+        "name": "cffconvert",
     }
 
     assert expected == result
diff --git a/swh/indexer/tests/test_bibtex.py b/swh/indexer/tests/test_bibtex.py
index e0220d71..c2f4a861 100644
--- a/swh/indexer/tests/test_bibtex.py
+++ b/swh/indexer/tests/test_bibtex.py
@@ -7,7 +7,8 @@ import textwrap
 
 import pytest
 
-from swh.indexer.bibtex import codemeta_to_bibtex
+from swh.indexer.bibtex import cff_to_bibtex, codemeta_to_bibtex
+from swh.model.swhids import QualifiedSWHID
 
 
 def test_empty():
@@ -273,3 +274,101 @@ def test_affiliation():
         }
         """
     )
+
+
+def test_cff_empty():
+    """Empty CFF input yields a ``@software`` entry without any field."""
+    expected = """\
+        @software{REPLACEME
+        }
+        """
+    assert cff_to_bibtex("") == textwrap.dedent(expected)
+
+
+def test_cff_invalid():
+    """Invalid CFF input ("foo") yields a ``@software`` entry without any field."""
+    expected = """\
+        @software{REPLACEME
+        }
+        """
+    assert cff_to_bibtex("foo") == textwrap.dedent(expected)
+
+
+def test_cff_minimal():
+    """A minimal CITATION.cff is converted with all its citation fields."""
+    # No ``version`` key and no SWHID given: a plain ``@software`` entry is expected
+    cff = """
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+authors:
+  - family-names: Druskat
+    given-names: Stephan
+title: "My Research Software"
+date-released: 2021-08-11
+url: "http://example.org/"
+            """
+
+    expected = textwrap.dedent(
+        """\
+            @software{REPLACEME,
+                author = "Druskat, Stephan",
+                date = "2021-08-11",
+                year = "2021",
+                month = "08",
+                title = "My Research Software",
+                url = "http://example.org/"
+            }
+            """
+    )
+    assert cff_to_bibtex(cff) == expected
+
+
+def test_swhid_type_snp():
+    """A snapshot SWHID gives a ``@software`` entry carrying that SWHID."""
+    doc = {"@context": "https://doi.org/10.5063/schema/codemeta-2.0"}
+    swhid = QualifiedSWHID.from_string(
+        "swh:1:snp:da39a3ee5e6b4b0d3255bfef95601890afd80709"
+    )
+
+    expected = textwrap.dedent(
+        """\
+        @software{REPLACEME,
+            swhid = "swh:1:snp:da39a3ee5e6b4b0d3255bfef95601890afd80709"
+        }
+        """
+    )
+    assert codemeta_to_bibtex(doc, swhid) == expected
+
+
+def test_swhid_type_rev():
+    """A revision SWHID gives a ``@softwareversion`` entry carrying that SWHID."""
+    doc = {"@context": "https://doi.org/10.5063/schema/codemeta-2.0"}
+    swhid = QualifiedSWHID.from_string(
+        "swh:1:rev:5b909292bcfe6099d726c0b5194165c72f93b767"
+    )
+
+    expected = textwrap.dedent(
+        """\
+        @softwareversion{REPLACEME,
+            swhid = "swh:1:rev:5b909292bcfe6099d726c0b5194165c72f93b767"
+        }
+        """
+    )
+    assert codemeta_to_bibtex(doc, swhid) == expected
+
+
+def test_swhid_type_cnt():
+    """A content SWHID gives a ``@codefragment`` entry carrying that SWHID."""
+    doc = {"@context": "https://doi.org/10.5063/schema/codemeta-2.0"}
+    swhid = QualifiedSWHID.from_string(
+        "swh:1:cnt:5b909292bcfe6099d726c0b5194165c72f93b767;lines=5-10"
+    )
+
+    expected = textwrap.dedent(
+        """\
+        @codefragment{REPLACEME,
+            swhid = "swh:1:cnt:5b909292bcfe6099d726c0b5194165c72f93b767;lines=5-10"
+        }
+        """
+    )
+    assert codemeta_to_bibtex(doc, swhid) == expected
-- 
GitLab