Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/metadata_detector.py
# Copyright (C) 2017 The Software Heritage developers | # Copyright (C) 2017 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from swh.indexer.codemeta import compact, expand | |||||
from swh.indexer.codemeta import make_absolute_uri | |||||
from swh.indexer.metadata_dictionary import MAPPINGS | from swh.indexer.metadata_dictionary import MAPPINGS | ||||
def detect_metadata(files):
    """
    Detects files potentially containing metadata

    Every mapping registered in MAPPINGS is asked, through its
    `detect_metadata_files` method, which of the given files it recognizes.

    Args:
        - files (list): list of files, handed unchanged to each mapping

    Returns:
        - empty dict if nothing was found
        - otherwise a dict {mapping_name: matches}, holding only the
          mappings whose `detect_metadata_files` result is truthy
          (presumably the sha1 identifiers of the matching files, as the
          previous docstring suggested -- verify against the mappings)
    """
    # Lazily pair each mapping name with what that mapping detected, then
    # keep only the pairs where something was actually found.
    detected = (
        (name, mapping.detect_metadata_files(files))
        for (name, mapping) in MAPPINGS.items()
    )
    return {name: found for (name, found) in detected if found}
# Short names of the properties kept by extract_minimal_metadata_dict.
_MINIMAL_PROPERTY_SET = {
    "developmentStatus",
    "version",
    "operatingSystem",
    "description",
    "keywords",
    "issueTracker",
    "name",
    "author",
    "relatedLink",
    "url",
    "license",
    "maintainer",
    "email",
    "identifier",
    "codeRepository",
}

# Same properties, each passed through make_absolute_uri so they can be
# compared with the terms of expanded documents.
MINIMAL_METADATA_SET = set(map(make_absolute_uri, _MINIMAL_PROPERTY_SET))
def extract_minimal_metadata_dict(metadata_list):
    """
    Every item in the metadata_list is a dict of translated_metadata in the
    CodeMeta vocabulary

    we wish to extract a minimal set of terms and keep all values corresponding
    to this term without duplication

    Each document is run through `expand`; only the terms listed in
    MINIMAL_METADATA_SET are retained. When several distinct values are
    found for the same term they are all kept, in a list (reviewers note
    this is covered by `test_extract_minimal_metadata_dict`). The merged
    result is run through `compact` before being returned.

    Args:
        - metadata_list (list): list of dicts of translated_metadata

    Returns:
        - minimal_dict (dict): one dict with selected values of metadata

    """
    merged = {}
    for document in metadata_list:
        for expanded_item in expand(document):
            for (term, value) in expanded_item.items():
                if term not in MINIMAL_METADATA_SET:
                    continue
                # First sighting of a term creates its list; later
                # sightings only add values that are not already there.
                known_values = merged.setdefault(term, [])
                if value not in known_values:
                    known_values.append(value)
    return compact(merged)
I see it does become a list when multiple values are given for the same term.
Would love to see it tested :-)