def merge_documents(documents):
    """Takes a list of metadata dicts, each generated from a different
    metadata file, and merges them into a single compacted document.

    Every input document is first passed through `expand`; the resulting
    nodes are then merged property by property. Duplicate values are
    removed and the first-seen order of values is preserved.

    A node can only carry one ``@id``: the first one encountered is kept
    as the identifier of the merged document, and any different ``@id``
    found later is recorded as a ``http://schema.org/sameAs`` reference
    so that it is not lost.

    Args:
        documents (list): list of metadata dicts accepted by `expand`

    Returns:
        the merged document, as returned by `compact`
    """
    same_as = 'http://schema.org/sameAs'
    merged_document = collections.defaultdict(list)

    # `expand` yields a list of nodes per document; flatten them lazily,
    # there is no need to materialize the intermediate list.
    for document in itertools.chain.from_iterable(map(expand, documents)):
        for (key, values) in document.items():
            if key == '@id':
                # Unlike regular properties, '@id' is not expanded to a
                # list: it is a single string. Iterating over it would
                # split the identifier into individual characters.
                if '@id' not in merged_document:
                    merged_document['@id'] = values
                elif values != merged_document['@id']:
                    alias = {'@id': values}
                    if alias not in merged_document[same_as]:
                        merged_document[same_as].append(alias)
                continue

            if not isinstance(values, list):
                # Defensive: any other keyword whose value is not a list
                # is treated as one value instead of being iterated
                # (which would yield characters or dict keys).
                values = [values]

            # Values are typically dicts (unhashable), hence the linear
            # membership test rather than a set.
            seen = merged_document[key]
            for value in values:
                if value not in seen:
                    seen.append(value)

    return compact(merged_document)
diff --git a/swh/indexer/metadata_detector.py b/swh/indexer/metadata_detector.py --- a/swh/indexer/metadata_detector.py +++ b/swh/indexer/metadata_detector.py @@ -3,8 +3,6 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from swh.indexer.codemeta import compact, expand -from swh.indexer.codemeta import make_absolute_uri from swh.indexer.metadata_dictionary import MAPPINGS @@ -24,39 +22,3 @@ if matches: results[mapping_name] = matches return results - - -_MINIMAL_PROPERTY_SET = { - "developmentStatus", "version", "operatingSystem", "description", - "keywords", "issueTracker", "name", "author", "relatedLink", - "url", "license", "maintainer", "email", "identifier", - "codeRepository"} - -MINIMAL_METADATA_SET = {make_absolute_uri(prop) - for prop in _MINIMAL_PROPERTY_SET} - - -def extract_minimal_metadata_dict(metadata_list): - """ - Every item in the metadata_list is a dict of translated_metadata in the - CodeMeta vocabulary. - - We wish to extract a minimal set of terms and keep all values corresponding - to this term without duplication. 
- - Args: - metadata_list (list): list of dicts of translated_metadata - - Returns: - dict: minimal_dict; dict with selected values of metadata - """ - minimal_dict = {} - for document in metadata_list: - for metadata_item in expand(document): - for (term, value) in metadata_item.items(): - if term in MINIMAL_METADATA_SET: - if term not in minimal_dict: - minimal_dict[term] = [value] - elif value not in minimal_dict[term]: - minimal_dict[term].append(value) - return compact(minimal_dict) diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -11,10 +11,11 @@ from swh.model.hashutil import hash_to_bytes from swh.indexer.codemeta import CODEMETA_TERMS, CROSSWALK_TABLE +from swh.indexer.codemeta import merge_documents from swh.indexer.metadata_dictionary import MAPPINGS from swh.indexer.metadata_dictionary.base import merge_values from swh.indexer.metadata_detector import ( - detect_metadata, extract_minimal_metadata_dict + detect_metadata ) from swh.indexer.metadata import ( ContentMetadataIndexer, RevisionMetadataIndexer @@ -184,7 +185,7 @@ # then self.assertEqual(declared_metadata, result) - def test_extract_minimal_metadata_dict(self): + def test_merge_documents(self): """ Test the creation of a coherent minimal metadata set """ @@ -211,7 +212,7 @@ }] # when - results = extract_minimal_metadata_dict(metadata_list) + results = merge_documents(metadata_list) # then expected_results = { diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py --- a/swh/indexer/tests/utils.py +++ b/swh/indexer/tests/utils.py @@ -407,6 +407,7 @@ 'https://github.com/librariesio/yarn-parser/issues', 'name': 'yarn-parser', 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], + 'type': 'SoftwareSourceCode', }