Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/metadata_detector.py
# Copyright (C) 2017 The Software Heritage developers | # Copyright (C) 2017 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from swh.indexer.codemeta import compact, expand | from swh.indexer.codemeta import compact, expand | ||||
from swh.indexer.codemeta import make_absolute_uri | from swh.indexer.codemeta import make_absolute_uri | ||||
from swh.indexer.metadata_dictionary import MAPPINGS | from swh.indexer.metadata_dictionary import MAPPINGS | ||||
def detect_metadata(files):
    """
    Ask every registered mapping which of the given files may hold metadata.

    Args:
        files (list): list of files, handed unchanged to the
            detect_metadata_files() method of each mapping in MAPPINGS

    Returns:
        dict: maps a mapping name to whatever that mapping's
        detect_metadata_files() returned, for the mappings that returned a
        non-empty result (may be empty). The values are presumably the
        sha1s of the matching files -- verify against the mapping classes.
    """
    detected = {}
    for name, mapper in MAPPINGS.items():
        found = mapper.detect_metadata_files(files)
        if not found:
            # Mappings without any match are left out of the result.
            continue
        detected[name] = found
    return detected
# Short property names that make up the minimal metadata set.
_MINIMAL_PROPERTY_SET = {
    "author",
    "codeRepository",
    "description",
    "developmentStatus",
    "email",
    "identifier",
    "issueTracker",
    "keywords",
    "license",
    "maintainer",
    "name",
    "operatingSystem",
    "relatedLink",
    "url",
    "version",
}

# The same properties run through make_absolute_uri(), so that they can be
# compared against the keys of the documents produced by expand().
MINIMAL_METADATA_SET = set(map(make_absolute_uri, _MINIMAL_PROPERTY_SET))
def extract_minimal_metadata_dict(metadata_list):
    """
    Merge several translated metadata documents into one minimal document.

    Every item in metadata_list is a dict of translated_metadata in the
    CodeMeta vocabulary. Each one is passed through expand(); only the terms
    found in MINIMAL_METADATA_SET are kept, and all distinct values seen for
    a term are collected in order of first appearance. The merged result is
    then passed through compact().

    Args:
        metadata_list (list): list of dicts of translated_metadata

    Returns:
        dict: minimal_dict; dict with selected values of metadata
    """
    merged = {}
    for doc in metadata_list:
        for expanded_item in expand(doc):
            for term, value in expanded_item.items():
                if term not in MINIMAL_METADATA_SET:
                    continue
                # Create the value list on first sight of the term, then
                # append only values that have not been recorded yet.
                known_values = merged.setdefault(term, [])
                if value not in known_values:
                    known_values.append(value)
    return compact(merged)