diff --git a/docs/metadata-workflow.rst b/docs/metadata-workflow.rst --- a/docs/metadata-workflow.rst +++ b/docs/metadata-workflow.rst @@ -108,3 +108,14 @@ .. _package.json: https://docs.npmjs.com/files/package.json .. _PKG-INFO: https://www.python.org/dev/peps/pep-0314/ .. _.gemspec: https://guides.rubygems.org/specification-reference/ + + +Supported CodeMeta terms +------------------------ + +The following terms may be found in the output of the metadata translation +(other than the `codemeta` mapping, which is the identity function, and +therefore supports all terms): + +.. program-output:: python3 -m swh.indexer.cli mapping list-terms --exclude-mapping codemeta + :nostderr: diff --git a/swh/indexer/cli.py b/swh/indexer/cli.py --- a/swh/indexer/cli.py +++ b/swh/indexer/cli.py @@ -10,7 +10,7 @@ from swh.scheduler.utils import create_task_dict from swh.storage import get_storage -from swh.indexer.metadata_dictionary import MAPPINGS +from swh.indexer import metadata_dictionary from swh.indexer.storage import get_indexer_storage from swh.indexer.storage.api.server import load_and_check_config, app @@ -54,12 +54,34 @@ @mapping.command('list') def mapping_list(): """Prints the list of known mappings.""" - mapping_names = [mapping.name for mapping in MAPPINGS.values()] + mapping_names = [mapping.name + for mapping in metadata_dictionary.MAPPINGS.values()] mapping_names.sort() for mapping_name in mapping_names: click.echo(mapping_name) +@mapping.command('list-terms') +@click.option('--exclude-mapping', multiple=True, + help='Exclude the given mapping from the output') +@click.option('--concise', is_flag=True, + default=False, + help='Don\'t print the list of mappings supporting each term.') +def mapping_list_terms(concise, exclude_mapping): + """Prints the list of known CodeMeta terms, and which mappings + support them.""" + properties = metadata_dictionary.list_terms() + for (property_name, supported_mappings) in sorted(properties.items()): + supported_mappings = {m.name for m in supported_mappings} + supported_mappings -= set(exclude_mapping) + if supported_mappings: + if concise: + click.echo(property_name) + else: + click.echo('{}:'.format(property_name)) + click.echo('\t' + ', '.join(sorted(supported_mappings))) + + @cli.group('schedule') @click.option('--scheduler-url', '-s', default=None, help="URL of the scheduler API") diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py --- a/swh/indexer/codemeta.py +++ b/swh/indexer/codemeta.py @@ -72,6 +72,7 @@ assert 'codemeta-V1' in data_sources codemeta_translation = {data_source: {} for data_source in data_sources} + terms = set() for line in reader: # For each canonical name local_name = dict(zip(header, line))['Property'] @@ -80,6 +81,7 @@ canonical_name = make_absolute_uri(local_name) if canonical_name in PROPERTY_BLACKLIST: continue + terms.add(canonical_name) for (col, value) in zip(header, line): # For each cell in the row if col in data_sources: # If that's not the parentType/property/type/description @@ -90,11 +92,11 @@ codemeta_translation[col][local_name.strip()] = \ canonical_name - return (header, codemeta_translation) + return (terms, codemeta_translation) with open(CROSSWALK_TABLE_PATH) as fd: - (CODEMETA_KEYS, CROSSWALK_TABLE) = _read_crosstable(fd) + (CODEMETA_TERMS, CROSSWALK_TABLE) = _read_crosstable(fd) def _document_loader(url): diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py --- a/swh/indexer/metadata_dictionary.py +++ b/swh/indexer/metadata_dictionary.py @@ -10,14 +10,15 @@ import json import logging import itertools +import collections import email.parser -import xml.parsers.expat import email.policy +import xml.parsers.expat import click import xmltodict -from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI +from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI, CODEMETA_TERMS from swh.indexer.codemeta import compact, expand @@ -29,6 +30,16 @@ return cls +def list_terms(): + """Returns a dictionary with all supported CodeMeta terms as keys, + and the mappings that support each of them as values.""" + d = collections.defaultdict(set) + for mapping in MAPPINGS.values(): + for term in mapping.supported_terms(): + d[term].add(mapping) + return d + + def merge_values(v1, v2): """If v1 and v2 are of the form `{"@list": l1}` and `{"@list": l2}`, returns `{"@list": l1 + l2}`. @@ -137,6 +148,18 @@ """A translation dict to map dict keys into a canonical name.""" pass + @staticmethod + def _normalize_method_name(name): + return name.replace('-', '_') + + @classmethod + def supported_terms(cls): + return { + term for (key, term) in cls.mapping.items() + if key in cls.string_fields + or hasattr(cls, 'translate_' + cls._normalize_method_name(key)) + or hasattr(cls, 'normalize_' + cls._normalize_method_name(key))} + def _translate_dict(self, content_dict, *, normalize=True): """ Translates content by parsing content from a dict object @@ -155,7 +178,7 @@ # First, check if there is a specific translation # method for this key translation_method = getattr( - self, 'translate_' + k.replace('-', '_'), None) + self, 'translate_' + self._normalize_method_name(k), None) if translation_method: translation_method(translated_metadata, v) elif k in self.mapping: @@ -165,7 +188,7 @@ # if there is a normalization method, use it on the value normalization_method = getattr( - self, 'normalize_' + k.replace('-', '_'), None) + self, 'normalize_' + self._normalize_method_name(k), None) if normalization_method: v = normalization_method(v) elif k in self.string_fields and isinstance(v, str): @@ -374,7 +397,11 @@ """ name = 'codemeta' filename = b'codemeta.json' - string_fields = ['name', 'version', 'url', 'description', 'email'] + string_fields = None + + @classmethod + def supported_terms(cls): + return [term for term in CODEMETA_TERMS if not term.startswith('@')] def translate(self, content): try: diff --git a/swh/indexer/tests/test_cli.py b/swh/indexer/tests/test_cli.py --- a/swh/indexer/tests/test_cli.py +++ b/swh/indexer/tests/test_cli.py @@ -4,6 +4,7 @@ # See top-level LICENSE file for more information from functools import reduce +import re import tempfile from unittest.mock import patch @@ -108,6 +109,31 @@ assert result.output == expected_output +def test_mapping_list_terms(indexer_scheduler): + result = invoke(indexer_scheduler, False, [ + 'mapping', 'list-terms', + ]) + assert result.exit_code == 0, result.output + assert re.search(r'http://schema.org/url:\n.*npm', result.output) + assert re.search(r'http://schema.org/url:\n.*codemeta', result.output) + assert re.search( + r'https://codemeta.github.io/terms/developmentStatus:\n\tcodemeta', + result.output) + + +def test_mapping_list_terms_exclude(indexer_scheduler): + result = invoke(indexer_scheduler, False, [ + 'mapping', 'list-terms', + '--exclude-mapping', 'codemeta' + ]) + assert result.exit_code == 0, result.output + assert re.search(r'http://schema.org/url:\n.*npm', result.output) + assert not re.search(r'http://schema.org/url:\n.*codemeta', result.output) + assert not re.search( + r'https://codemeta.github.io/terms/developmentStatus:\n\tcodemeta', + result.output) + + @patch('swh.indexer.cli.TASK_BATCH_SIZE', 3) def test_origin_metadata_reindex_empty_db( indexer_scheduler, idx_storage, storage): diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -11,7 +11,7 @@ from swh.model.hashutil import hash_to_bytes -from swh.indexer.codemeta import CODEMETA_KEYS +from swh.indexer.codemeta import CODEMETA_TERMS from swh.indexer.metadata_dictionary import ( CROSSWALK_TABLE, MAPPINGS, merge_values) from swh.indexer.metadata_detector import ( @@ -1066,7 +1066,7 @@ self.npm_mapping.translate(raw) @settings(suppress_health_check=[HealthCheck.too_slow]) - @given(json_document_strategy(keys=CODEMETA_KEYS)) + @given(json_document_strategy(keys=CODEMETA_TERMS)) def test_codemeta_adversarial(self, doc): raw = json.dumps(doc).encode() self.codemeta_mapping.translate(raw)