diff --git a/swh/indexer/metadata_dictionary/codemeta.py b/swh/indexer/metadata_dictionary/codemeta.py index 1fc613f74a6c0345179f0aee72c0ecd0b7957ac8..0da51b8bb0d59650baadcc1c8ffb3c3c49fddcaf 100644 --- a/swh/indexer/metadata_dictionary/codemeta.py +++ b/swh/indexer/metadata_dictionary/codemeta.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2022 The Software Heritage developers +# Copyright (C) 2018-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -146,21 +146,25 @@ class SwordCodemetaMapping(BaseExtrinsicMapping): def translate(self, content: bytes) -> Optional[Dict[str, Any]]: # Parse XML - root = ET.fromstring(content) - - # Transform to JSON-LD document - doc = self.xml_to_jsonld(root) + try: + root = ET.fromstring(content) + except ET.ParseError: + logger.error("Failed to parse XML document: %s", content) + return None + else: + # Transform to JSON-LD document + doc = self.xml_to_jsonld(root) - assert isinstance(doc, dict), f"Root object is not a dict: {doc}" + assert isinstance(doc, dict), f"Root object is not a dict: {doc}" - # Add @context to JSON-LD expansion replaces the "codemeta:" prefix - # hash (which uses the context URL as namespace URI for historical - # reasons) into properties in `http://schema.org/` and - # `https://codemeta.github.io/terms/` namespaces - doc["@context"] = CODEMETA_CONTEXT_URL + # Add @context to JSON-LD expansion replaces the "codemeta:" prefix + # hash (which uses the context URL as namespace URI for historical + # reasons) into properties in `http://schema.org/` and + # `https://codemeta.github.io/terms/` namespaces + doc["@context"] = CODEMETA_CONTEXT_URL - # Normalize as a Codemeta document - return self.normalize_translation(expand(doc)) + # Normalize as a Codemeta document + return self.normalize_translation(expand(doc)) def normalize_translation(self, metadata: Dict[str, Any]) -> Dict[str, Any]: return compact(metadata, forgefed=False) @@ -183,14 +187,18 @@ class JsonSwordCodemetaMapping(SwordCodemetaMapping): # ``content`` was generated by calling ``xmltodict.parse()`` on a XML document, # so ``xmltodict.unparse()`` is guaranteed to return a document that is # semantically equivalent to the original and pass it to SwordCodemetaMapping. - json_doc = json.loads(content) - - if json_doc.get("@xmlns") != ATOM_URI: - # Technically, non-default XMLNS were allowed, but it does not seem like - # anyone used them, so they do not need to be implemented here. - raise NotImplementedError(f"Unexpected XMLNS set: {json_doc}") + try: + json_doc = json.loads(content) + except json.JSONDecodeError: + logger.error("Failed to parse JSON document: %s", content) + return None + else: + if json_doc.get("@xmlns") != ATOM_URI: + # Technically, non-default XMLNS were allowed, but it does not seem like + # anyone used them, so they do not need to be implemented here. + raise NotImplementedError(f"Unexpected XMLNS set: {json_doc}") - # Root tag was stripped by swh-deposit - json_doc = {"entry": json_doc} + # Root tag was stripped by swh-deposit + json_doc = {"entry": json_doc} - return super().translate(xmltodict.unparse(json_doc)) + return super().translate(xmltodict.unparse(json_doc)) diff --git a/swh/indexer/tests/metadata_dictionary/test_codemeta.py b/swh/indexer/tests/metadata_dictionary/test_codemeta.py index 6c9d6def061c33197d9d21f80b04e6cb24760dff..4240b50adbda81dcb58f54dcf6ab51459b75ba8b 100644 --- a/swh/indexer/tests/metadata_dictionary/test_codemeta.py +++ b/swh/indexer/tests/metadata_dictionary/test_codemeta.py @@ -1,9 +1,10 @@ -# Copyright (C) 2017-2022 The Software Heritage developers +# Copyright (C) 2017-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json +import logging from hypothesis import HealthCheck, given, settings import pytest @@ -518,6 +519,12 @@ def test_sword_fix_date(): } +def test_sword_codemeta_parsing_error(caplog): + caplog.set_level(logging.ERROR) + assert MAPPINGS["SwordCodemetaMapping"]().translate(b"123") is None + assert caplog.text.endswith("Failed to parse XML document: b'123'\n") + + def test_json_sword(): content = """{"id": "hal-01243573", "@xmlns": "http://www.w3.org/2005/Atom", "author": {"name": "Author 1", "email": "foo@example.org"}, "client": "hal", "codemeta:url": "http://example.org/", "codemeta:name": "The assignment problem", "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", "codemeta:author": {"codemeta:name": "Author 2"}, "codemeta:license": {"codemeta:name": "GNU General Public License v3.0 or later"}}""" # noqa result = MAPPINGS["JsonSwordCodemetaMapping"]().translate(content) @@ -532,3 +539,9 @@ def test_json_sword(): "url": "http://example.org/", "name": "The assignment problem", } + + +def test_json_sword_codemeta_parsing_error(caplog): + caplog.set_level(logging.ERROR) + assert MAPPINGS["JsonSwordCodemetaMapping"]().translate(b"{123}") is None + assert caplog.text.endswith("Failed to parse JSON document: b'{123}'\n")