diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py
index 90e863b..fe66a16 100644
--- a/swh/indexer/codemeta.py
+++ b/swh/indexer/codemeta.py
@@ -1,192 +1,218 @@
 # Copyright (C) 2018 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import collections
 import csv
 import itertools
 import json
 import os.path
 import re
 
 import swh.indexer
 from pyld import jsonld
 
 _DATA_DIR = os.path.join(os.path.dirname(swh.indexer.__file__), 'data')
 
 CROSSWALK_TABLE_PATH = os.path.join(_DATA_DIR, 'codemeta', 'crosswalk.csv')
 
 CODEMETA_CONTEXT_PATH = os.path.join(_DATA_DIR, 'codemeta', 'codemeta.jsonld')
 
 
 with open(CODEMETA_CONTEXT_PATH) as fd:
     CODEMETA_CONTEXT = json.load(fd)
 
 CODEMETA_CONTEXT_URL = 'https://doi.org/10.5063/schema/codemeta-2.0'
 CODEMETA_ALTERNATE_CONTEXT_URLS = {
     ('https://raw.githubusercontent.com/codemeta/codemeta/'
      'master/codemeta.jsonld')
 }
 CODEMETA_URI = 'https://codemeta.github.io/terms/'
 SCHEMA_URI = 'http://schema.org/'
 
 
 PROPERTY_BLACKLIST = {
     # CodeMeta properties that we cannot properly represent.
     SCHEMA_URI + 'softwareRequirements',
     CODEMETA_URI + 'softwareSuggestions',
 
     # Duplicate of 'author'
     SCHEMA_URI + 'creator',
     }
 
 _codemeta_field_separator = re.compile(r'\s*[,/]\s*')
 
 
 def make_absolute_uri(local_name):
     definition = CODEMETA_CONTEXT['@context'][local_name]
     if isinstance(definition, str):
         return definition
     elif isinstance(definition, dict):
         prefixed_name = definition['@id']
         (prefix, local_name) = prefixed_name.split(':')
         if prefix == 'schema':
             canonical_name = SCHEMA_URI + local_name
         elif prefix == 'codemeta':
             canonical_name = CODEMETA_URI + local_name
         else:
             assert False, prefix
         return canonical_name
     else:
         assert False, definition
 
 
 def _read_crosstable(fd):
     reader = csv.reader(fd)
     try:
         header = next(reader)
     except StopIteration:
         raise ValueError('empty file')
 
     data_sources = set(header) - {'Parent Type', 'Property',
                                   'Type', 'Description'}
     assert 'codemeta-V1' in data_sources
 
     codemeta_translation = {data_source: {} for data_source in data_sources}
     terms = set()
 
     for line in reader:  # For each canonical name
         local_name = dict(zip(header, line))['Property']
         if not local_name:
             continue
         canonical_name = make_absolute_uri(local_name)
         if canonical_name in PROPERTY_BLACKLIST:
             continue
         terms.add(canonical_name)
         for (col, value) in zip(header, line):  # For each cell in the row
             if col in data_sources:
                 # If that's not the parentType/property/type/description
                 for local_name in _codemeta_field_separator.split(value):
                     # For each of the data source's properties that maps
                     # to this canonical name
                     if local_name.strip():
                         codemeta_translation[col][local_name.strip()] = \
                                 canonical_name
 
     return (terms, codemeta_translation)
 
 
 with open(CROSSWALK_TABLE_PATH) as fd:
     (CODEMETA_TERMS, CROSSWALK_TABLE) = _read_crosstable(fd)
 
 
 def _document_loader(url):
     """Document loader for pyld.
 
     Reads the local codemeta.jsonld file instead of fetching it
     from the Internet every single time."""
     if url == CODEMETA_CONTEXT_URL or url in CODEMETA_ALTERNATE_CONTEXT_URLS:
         return {
             'contextUrl': None,
             'documentUrl': url,
             'document': CODEMETA_CONTEXT,
         }
     elif url == CODEMETA_URI:
         raise Exception('{} is CodeMeta\'s URI, use {} as context url'.format(
             CODEMETA_URI, CODEMETA_CONTEXT_URL))
     else:
         raise Exception(url)
 
 
 def compact(doc):
     """Same as `pyld.jsonld.compact`, but in the context of CodeMeta."""
     return jsonld.compact(doc, CODEMETA_CONTEXT_URL,
                           options={'documentLoader': _document_loader})
 
 
 def expand(doc):
     """Same as `pyld.jsonld.expand`, but in the context of CodeMeta."""
     return jsonld.expand(doc,
                          options={'documentLoader': _document_loader})
 
 
 def merge_values(v1, v2):
     """If v1 and v2 are of the form `{"@list": l1}` and `{"@list": l2}`,
     returns `{"@list": l1 + l2}`.
     Otherwise, make them lists (if they are not already) and concatenate
     them.
 
     >>> merge_values('a', 'b')
     ['a', 'b']
     >>> merge_values(['a', 'b'], 'c')
     ['a', 'b', 'c']
     >>> merge_values({'@list': ['a', 'b']}, {'@list': ['c']})
     {'@list': ['a', 'b', 'c']}
     """
     if v1 is None:
         return v2
     elif v2 is None:
         return v1
     elif isinstance(v1, dict) and set(v1) == {'@list'}:
         assert isinstance(v1['@list'], list)
         if isinstance(v2, dict) and set(v2) == {'@list'}:
             assert isinstance(v2['@list'], list)
             return {'@list': v1['@list'] + v2['@list']}
         else:
             raise ValueError('Cannot merge %r and %r' % (v1, v2))
     else:
         if isinstance(v2, dict) and '@list' in v2:
             raise ValueError('Cannot merge %r and %r' % (v1, v2))
         if not isinstance(v1, list):
             v1 = [v1]
         if not isinstance(v2, list):
             v2 = [v2]
         return v1 + v2
 
 
 def merge_documents(documents):
     """Takes a list of metadata dicts, each generated from a different
     metadata file, and merges them.
 
-    Removes duplicates, if any."""
+    Removes duplicates, if any. Values of the form ``{'@list': [...]}``
+    are merged item by item; when the same property is a plain value in
+    one document and an ``@list`` in another, the plain values are folded
+    into the ``@list``."""
     documents = list(itertools.chain.from_iterable(map(expand, documents)))
     merged_document = collections.defaultdict(list)
     for document in documents:
         for (key, values) in document.items():
             if key == '@id':
                 # @id does not get expanded to a list
                 value = values
 
                 # Only one @id is allowed, move it to sameAs
                 if '@id' not in merged_document:
                     merged_document['@id'] = value
                 elif value != merged_document['@id']:
                     if value not in merged_document[SCHEMA_URI + 'sameAs']:
                         merged_document[SCHEMA_URI + 'sameAs'].append(value)
             else:
                 for value in values:
-                    if value not in merged_document[key]:
-                        merged_document[key].append(value)
+                    if isinstance(value, dict) and set(value) == {'@list'}:
+                        # Value is of the form {'@list': [item1, item2]}
+                        # instead of the usual [item1, item2].
+                        # We need to merge the inner lists (and mostly
+                        # preserve order).
+                        merged_value = merged_document.setdefault(
+                            key, {'@list': []})
+                        if isinstance(merged_value, list):
+                            # An earlier document gave plain values for
+                            # this key: promote them to an @list, else
+                            # the ['@list'] lookups below raise TypeError.
+                            merged_value = {'@list': merged_value}
+                            merged_document[key] = merged_value
+                        for subvalue in value['@list']:
+                            if subvalue not in merged_value['@list']:
+                                merged_value['@list'].append(subvalue)
+                    else:
+                        merged_value = merged_document[key]
+                        if isinstance(merged_value, dict):
+                            # An earlier document gave an @list for this
+                            # key: merge into its inner list, as a dict
+                            # has no append().
+                            merged_value = merged_value['@list']
+                        if value not in merged_value:
+                            merged_value.append(value)
 
     return compact(merged_document)
diff --git a/swh/indexer/tests/test_codemeta.py b/swh/indexer/tests/test_codemeta.py
index e5ba00e..c73b23a 100644
--- a/swh/indexer/tests/test_codemeta.py
+++ b/swh/indexer/tests/test_codemeta.py
@@ -1,158 +1,283 @@
 # Copyright (C) 2018-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import pytest
 
 from swh.indexer.codemeta import merge_documents, merge_values
 from swh.indexer.codemeta import CROSSWALK_TABLE
 
 
 def test_crosstable():
     assert CROSSWALK_TABLE['NodeJS'] == {
         'repository': 'http://schema.org/codeRepository',
         'os': 'http://schema.org/operatingSystem',
         'cpu': 'http://schema.org/processorRequirements',
         'engines': 'http://schema.org/processorRequirements',
         'author': 'http://schema.org/author',
         'author.email': 'http://schema.org/email',
         'author.name': 'http://schema.org/name',
         'contributor': 'http://schema.org/contributor',
         'keywords': 'http://schema.org/keywords',
         'license': 'http://schema.org/license',
         'version': 'http://schema.org/version',
         'description': 'http://schema.org/description',
         'name': 'http://schema.org/name',
         'bugs': 'https://codemeta.github.io/terms/issueTracker',
         'homepage': 'http://schema.org/url'
     }
 
 
 def test_merge_values():
     assert merge_values('a', 'b') == ['a', 'b']
     assert merge_values(['a', 'b'], 'c') == ['a', 'b', 'c']
     assert merge_values('a', ['b', 'c']) == ['a', 'b', 'c']
 
     assert merge_values({'@list': ['a']}, {'@list': ['b']}) \
         == {'@list': ['a', 'b']}
     assert merge_values({'@list': ['a', 'b']}, {'@list': ['c']}) \
         == {'@list': ['a', 'b', 'c']}
 
     with pytest.raises(ValueError):
         merge_values({'@list': ['a']}, 'b')
 
     with pytest.raises(ValueError):
         merge_values('a', {'@list': ['b']})
 
     with pytest.raises(ValueError):
         merge_values({'@list': ['a']}, ['b'])
 
     with pytest.raises(ValueError):
         merge_values(['a'], {'@list': ['b']})
 
     assert merge_values('a', None) == 'a'
     assert merge_values(['a', 'b'], None) == ['a', 'b']
     assert merge_values(None, ['b', 'c']) == ['b', 'c']
     assert merge_values({'@list': ['a']}, None) == {'@list': ['a']}
     assert merge_values(None, {'@list': ['a']}) == {'@list': ['a']}
 
 
 def test_merge_documents():
     """
     Test the creation of a coherent minimal metadata set
     """
 
     # given
     metadata_list = [{
         '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
         'name': 'test_1',
         'version': '0.0.2',
         'description': 'Simple package.json test for indexer',
         'codeRepository': 'git+https://github.com/moranegg/metadata_test',
     }, {
         '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
         'name': 'test_0_1',
         'version': '0.0.2',
         'description': 'Simple package.json test for indexer',
         'codeRepository': 'git+https://github.com/moranegg/metadata_test'
     }, {
         '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
         'name': 'test_metadata',
         'version': '0.0.2',
         'author': {
             'type': 'Person',
             'name': 'moranegg',
         },
     }]
 
     # when
     results = merge_documents(metadata_list)
 
     # then
     expected_results = {
         '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
         "version": '0.0.2',
         "description": 'Simple package.json test for indexer',
         "name": ['test_1', 'test_0_1', 'test_metadata'],
         "author": [{
             'type': 'Person',
             'name': 'moranegg'
         }],
         "codeRepository": 'git+https://github.com/moranegg/metadata_test',
     }
     assert results == expected_results
 
 
 def test_merge_documents_ids():
     # given
     metadata_list = [{
         '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
         'id': 'http://example.org/test1',
         'name': 'test_1',
     }, {
         '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
         'id': 'http://example.org/test2',
         'name': 'test_2',
     }]
 
     # when
     results = merge_documents(metadata_list)
 
     # then
     expected_results = {
         '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
         'id': 'http://example.org/test1',
         'schema:sameAs': 'http://example.org/test2',
         "name": ['test_1', 'test_2']
     }
     assert results == expected_results
 
 
 def test_merge_documents_duplicate_ids():
     # given
     metadata_list = [{
         '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
         'id': 'http://example.org/test1',
         'name': 'test_1',
     }, {
         '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
         'id': 'http://example.org/test1',
         'name': 'test_1b',
     }, {
         '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
         'id': 'http://example.org/test2',
         'name': 'test_2',
     }]
 
     # when
     results = merge_documents(metadata_list)
 
     # then
     expected_results = {
         '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
         'id': 'http://example.org/test1',
         'schema:sameAs': 'http://example.org/test2',
         "name": ['test_1', 'test_1b', 'test_2']
     }
     assert results == expected_results
+
+
+def test_merge_documents_lists():
+    """Tests merging two @list elements."""
+    # given
+    metadata_list = [{
+        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+        'author': {
+            '@list': [
+                {'name': 'test_1'},
+            ]
+        },
+    }, {
+        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+        'author': {
+            '@list': [
+                {'name': 'test_2'},
+            ]
+        },
+    }]
+
+    # when
+    results = merge_documents(metadata_list)
+
+    # then
+    expected_results = {
+        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+        'author': [
+            {'name': 'test_1'},
+            {'name': 'test_2'},
+        ],
+    }
+    assert results == expected_results
+
+
+def test_merge_documents_lists_duplicates():
+    """Tests merging two @list elements with a duplicate subelement."""
+    # given
+    metadata_list = [{
+        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+        'author': {
+            '@list': [
+                {'name': 'test_1'},
+            ]
+        },
+    }, {
+        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+        'author': {
+            '@list': [
+                {'name': 'test_2'},
+                {'name': 'test_1'},
+            ]
+        },
+    }]
+
+    # when
+    results = merge_documents(metadata_list)
+
+    # then
+    expected_results = {
+        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+        'author': [
+            {'name': 'test_1'},
+            {'name': 'test_2'},
+        ],
+    }
+    assert results == expected_results
+
+
+def test_merge_documents_list_left():
+    """Tests merging a singleton with an @list."""
+    # given
+    metadata_list = [{
+        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+        'author': {'name': 'test_1'},
+    }, {
+        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+        'author': {
+            '@list': [
+                {'name': 'test_2'},
+            ]
+        },
+    }]
+
+    # when
+    results = merge_documents(metadata_list)
+
+    # then
+    expected_results = {
+        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+        'author': [
+            {'name': 'test_1'},
+            {'name': 'test_2'},
+        ],
+    }
+    assert results == expected_results
+
+
+def test_merge_documents_list_right():
+    """Tests merging an @list with a singleton."""
+    # given
+    metadata_list = [{
+        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+        'author': {
+            '@list': [
+                {'name': 'test_1'},
+            ]
+        },
+    }, {
+        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+        'author': {'name': 'test_2'},
+    }]
+
+    # when
+    results = merge_documents(metadata_list)
+
+    # then
+    expected_results = {
+        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+        'author': [
+            {'name': 'test_1'},
+            {'name': 'test_2'},
+        ],
+    }
+    assert results == expected_results