diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py index cb827938878704be9e32c2e5f4fe7fad35fadafc..56c7f88a628fc120d18298ea0114634d8b6c40a9 100644 --- a/swh/indexer/cli.py +++ b/swh/indexer/cli.py @@ -10,7 +10,7 @@ from swh.scheduler import get_scheduler from swh.scheduler.utils import create_task_dict from swh.storage import get_storage -from swh.indexer.metadata_dictionary import MAPPINGS +from swh.indexer import metadata_dictionary from swh.indexer.storage import get_indexer_storage from swh.indexer.storage.api.server import load_and_check_config, app @@ -54,12 +54,34 @@ def mapping(): @mapping.command('list') def mapping_list(): """Prints the list of known mappings.""" - mapping_names = [mapping.name for mapping in MAPPINGS.values()] + mapping_names = [mapping.name + for mapping in metadata_dictionary.MAPPINGS.values()] mapping_names.sort() for mapping_name in mapping_names: click.echo(mapping_name) +@mapping.command('list-terms') +@click.option('--exclude-mapping', multiple=True, + help='Exclude the given mapping from the output') +@click.option('--concise', is_flag=True, + default=False, + help='Don\'t print the list of mappings supporting each term.') +def mapping_list_terms(concise, exclude_mapping): + """Prints the list of known CodeMeta terms, and which mappings + support them.""" + properties = metadata_dictionary.list_terms() + for (property_name, supported_mappings) in sorted(properties.items()): + supported_mappings = {m.name for m in supported_mappings} + supported_mappings -= set(exclude_mapping) + if supported_mappings: + if concise: + click.echo(property_name) + else: + click.echo('{}:'.format(property_name)) + click.echo('\t' + ', '.join(sorted(supported_mappings))) + + @cli.group('schedule') @click.option('--scheduler-url', '-s', default=None, help="URL of the scheduler API") diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py index b670b87dcd70452405cbac9b8248f047806f2ad2..7cc316f086e9e00491efa225784ae769f7c5c8bc 100644 --- a/swh/indexer/codemeta.py +++ b/swh/indexer/codemeta.py @@ -72,6 +72,7 @@ def _read_crosstable(fd): assert 'codemeta-V1' in data_sources codemeta_translation = {data_source: {} for data_source in data_sources} + terms = set() for line in reader: # For each canonical name local_name = dict(zip(header, line))['Property'] @@ -80,6 +81,7 @@ def _read_crosstable(fd): canonical_name = make_absolute_uri(local_name) if canonical_name in PROPERTY_BLACKLIST: continue + terms.add(canonical_name) for (col, value) in zip(header, line): # For each cell in the row if col in data_sources: # If that's not the parentType/property/type/description @@ -90,11 +92,11 @@ def _read_crosstable(fd): codemeta_translation[col][local_name.strip()] = \ canonical_name - return (header, codemeta_translation) + return (terms, codemeta_translation) with open(CROSSWALK_TABLE_PATH) as fd: - (CODEMETA_KEYS, CROSSWALK_TABLE) = _read_crosstable(fd) + (CODEMETA_TERMS, CROSSWALK_TABLE) = _read_crosstable(fd) def _document_loader(url): diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py index b579ab23e5b1003f2e62a82d6958eb35c564d791..47fb559d0028bb007c591ff844cbad00d9b8a66a 100644 --- a/swh/indexer/metadata_dictionary.py +++ b/swh/indexer/metadata_dictionary.py @@ -10,14 +10,15 @@ import ast import json import logging import itertools +import collections import email.parser -import xml.parsers.expat import email.policy +import xml.parsers.expat import click import xmltodict -from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI +from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI, CODEMETA_TERMS from swh.indexer.codemeta import compact, expand @@ -29,6 +30,16 @@ def register_mapping(cls): return cls +def list_terms(): + """Returns a dictionary with all supported CodeMeta terms as keys, + and the mappings that support each of them as values.""" + d = collections.defaultdict(set) + for mapping in MAPPINGS.values(): + for term in mapping.supported_terms(): + d[term].add(mapping) + return d + + def merge_values(v1, v2): """If v1 and v2 are of the form `{"@list": l1}` and `{"@list": l2}`, returns `{"@list": l1 + l2}`. @@ -137,6 +148,18 @@ class DictMapping(BaseMapping): """A translation dict to map dict keys into a canonical name.""" pass + @staticmethod + def _normalize_method_name(name): + return name.replace('-', '_') + + @classmethod + def supported_terms(cls): + return { + term for (key, term) in cls.mapping.items() + if key in cls.string_fields + or hasattr(cls, 'translate_' + cls._normalize_method_name(key)) + or hasattr(cls, 'normalize_' + cls._normalize_method_name(key))} + def _translate_dict(self, content_dict, *, normalize=True): """ Translates content by parsing content from a dict object @@ -155,7 +178,7 @@ class DictMapping(BaseMapping): # First, check if there is a specific translation # method for this key translation_method = getattr( - self, 'translate_' + k.replace('-', '_'), None) + self, 'translate_' + self._normalize_method_name(k), None) if translation_method: translation_method(translated_metadata, v) elif k in self.mapping: @@ -165,7 +188,7 @@ class DictMapping(BaseMapping): # if there is a normalization method, use it on the value normalization_method = getattr( - self, 'normalize_' + k.replace('-', '_'), None) + self, 'normalize_' + self._normalize_method_name(k), None) if normalization_method: v = normalization_method(v) elif k in self.string_fields and isinstance(v, str): @@ -374,7 +397,11 @@ class CodemetaMapping(SingleFileMapping): """ name = 'codemeta' filename = b'codemeta.json' - string_fields = ['name', 'version', 'url', 'description', 'email'] + string_fields = None + + @classmethod + def supported_terms(cls): + return [term for term in CODEMETA_TERMS if not term.startswith('@')] def translate(self, content): try: diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py index d14b186065a604488349d7ae0ec50fba4394446b..6a9a31e7f20d2c255b1ca4d91d03abd00d38b81e 100644 --- a/swh/indexer/tests/test_cli.py +++ b/swh/indexer/tests/test_cli.py @@ -4,6 +4,7 @@ # See top-level LICENSE file for more information from functools import reduce +import re import tempfile from unittest.mock import patch @@ -108,6 +109,31 @@ def test_mapping_list(indexer_scheduler): assert result.output == expected_output +def test_mapping_list_terms(indexer_scheduler): + result = invoke(indexer_scheduler, False, [ + 'mapping', 'list-terms', + ]) + assert result.exit_code == 0, result.output + assert re.search(r'http://schema.org/url:\n.*npm', result.output) + assert re.search(r'http://schema.org/url:\n.*codemeta', result.output) + assert re.search( + r'https://codemeta.github.io/terms/developmentStatus:\n\tcodemeta', + result.output) + + +def test_mapping_list_terms_exclude(indexer_scheduler): + result = invoke(indexer_scheduler, False, [ + 'mapping', 'list-terms', + '--exclude-mapping', 'codemeta' + ]) + assert result.exit_code == 0, result.output + assert re.search(r'http://schema.org/url:\n.*npm', result.output) + assert not re.search(r'http://schema.org/url:\n.*codemeta', result.output) + assert not re.search( + r'https://codemeta.github.io/terms/developmentStatus:\n\tcodemeta', + result.output) + + @patch('swh.indexer.cli.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_empty_db( indexer_scheduler, idx_storage, storage): diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py index cba46755733e439a7d26d74f96a5e1cafdd967c4..902a48391c0d1ebea29e2bd8c0f440ddcdff7663 100644 --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -11,7 +11,7 @@ import xmltodict from swh.model.hashutil import hash_to_bytes -from swh.indexer.codemeta import CODEMETA_KEYS +from swh.indexer.codemeta import CODEMETA_TERMS from swh.indexer.metadata_dictionary import ( CROSSWALK_TABLE, MAPPINGS, merge_values) from swh.indexer.metadata_detector import ( @@ -1066,7 +1066,7 @@ Gem::Specification.new { |s| self.npm_mapping.translate(raw) @settings(suppress_health_check=[HealthCheck.too_slow]) - @given(json_document_strategy(keys=CODEMETA_KEYS)) + @given(json_document_strategy(keys=CODEMETA_TERMS)) def test_codemeta_adversarial(self, doc): raw = json.dumps(doc).encode() self.codemeta_mapping.translate(raw)