Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/metadata_dictionary.py
# Copyright (C) 2017 The Software Heritage developers | # Copyright (C) 2017 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import os | import os | ||||
import re | import re | ||||
import abc | import abc | ||||
import json | import json | ||||
import logging | import logging | ||||
import email.parser | |||||
import xmltodict | import xmltodict | ||||
from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI | from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI | ||||
from swh.indexer.codemeta import compact, expand | from swh.indexer.codemeta import compact, expand | ||||
MAPPINGS = {} | MAPPINGS = {} | ||||
▲ Show 20 Lines • Show All 75 Lines • ▼ Show 20 Lines | def translate_dict(self, content_dict, *, normalize=True): | ||||
dict: translated metadata in json-friendly form needed for | dict: translated metadata in json-friendly form needed for | ||||
the indexer | the indexer | ||||
""" | """ | ||||
translated_metadata = {'@type': SCHEMA_URI + 'SoftwareSourceCode'} | translated_metadata = {'@type': SCHEMA_URI + 'SoftwareSourceCode'} | ||||
for k, v in content_dict.items(): | for k, v in content_dict.items(): | ||||
# First, check if there is a specific translation | # First, check if there is a specific translation | ||||
# method for this key | # method for this key | ||||
translation_method = getattr(self, 'translate_' + k, None) | translation_method = getattr( | ||||
self, 'translate_' + k.replace('-', '_'), None) | |||||
if translation_method: | if translation_method: | ||||
translation_method(translated_metadata, v) | translation_method(translated_metadata, v) | ||||
elif k in self.mapping: | elif k in self.mapping: | ||||
# if there is no method, but the key is known from the | # if there is no method, but the key is known from the | ||||
# crosswalk table | # crosswalk table | ||||
# if there is a normalization method, use it on the value | # if there is a normalization method, use it on the value | ||||
normalization_method = getattr(self, 'normalize_' + k, None) | normalization_method = getattr( | ||||
self, 'normalize_' + k.replace('-', '_'), None) | |||||
if normalization_method: | if normalization_method: | ||||
v = normalization_method(v) | v = normalization_method(v) | ||||
# set the translation metadata with the normalized value | # set the translation metadata with the normalized value | ||||
translated_metadata[self.mapping[k]] = v | translated_metadata[self.mapping[k]] = v | ||||
if normalize: | if normalize: | ||||
return self.normalize_translation(translated_metadata) | return self.normalize_translation(translated_metadata) | ||||
else: | else: | ||||
▲ Show 20 Lines • Show All 212 Lines • ▼ Show 20 Lines | def parse_licenses(self, d): | ||||
""" | """ | ||||
licenses = d.get('licenses', {}).get('license', []) | licenses = d.get('licenses', {}).get('license', []) | ||||
if isinstance(licenses, dict): | if isinstance(licenses, dict): | ||||
licenses = [licenses] | licenses = [licenses] | ||||
return [{"@id": license['url']} for license in licenses] | return [{"@id": license['url']} for license in licenses] | ||||
# Header names are compared in lower case: both the crosswalk keys and the
# header names read from a PKG-INFO file go through this normalizer.
_normalize_pkginfo_key = str.lower


@register_mapping
class PythonPkginfoMapping(DictMapping, SingleFileMapping):
    """Mapping and translation of Python's PKG-INFO metadata files.

    https://www.python.org/dev/peps/pep-0314/
    """
    filename = b'PKG-INFO'

    # Crosswalk table re-keyed by normalized (lower-cased) header name.
    mapping = {
        _normalize_pkginfo_key(header): term
        for header, term in CROSSWALK_TABLE['Python PKG-INFO'].items()
    }

    _parser = email.parser.BytesHeaderParser()

    def translate(self, content):
        """Translate the raw content of a PKG-INFO file.

        Args:
            content (bytes): raw PKG-INFO content, parsed as a block of
                email-style headers.

        Returns:
            the result of ``normalize_translation`` applied to the
            translated metadata.
        """
        # Group header values by normalized name; a header may be repeated,
        # so every value is kept in a list. 'UNKNOWN' values are skipped.
        headers = {}
        for name, value in self._parser.parsebytes(content).items():
            name = _normalize_pkginfo_key(name)
            if value == 'UNKNOWN':
                continue
            headers.setdefault(name, []).append(value)

        metadata = self.translate_dict(headers, normalize=False)

        author_key = SCHEMA_URI + 'author'
        email_key = SCHEMA_URI + 'email'
        if author_key in metadata or email_key in metadata:
            # Fold the separate author and email entries into one Person;
            # whichever of the two is missing becomes None.
            author_name = metadata.pop(author_key, [None])[0]
            author_email = metadata.pop(email_key, [None])[0]
            metadata[author_key] = {
                '@list': [{
                    '@type': SCHEMA_URI + 'Person',
                    SCHEMA_URI + 'name': author_name,
                    SCHEMA_URI + 'email': author_email,
                }],
            }

        return self.normalize_translation(metadata)

    def translate_summary(self, translated_metadata, v):
        """Append ``v`` to the entry that the 'summary' header maps to."""
        translated_metadata.setdefault(self.mapping['summary'], []).append(v)

    def translate_description(self, translated_metadata, v):
        """Append ``v`` to the entry that the 'description' header maps to."""
        translated_metadata.setdefault(
            self.mapping['description'], []).append(v)

    def normalize_home_page(self, urls):
        """Wrap every home page URL in an ``{'@id': ...}`` node."""
        return [{'@id': home_page} for home_page in urls]

    def normalize_license(self, licenses):
        """Wrap every license value in an ``{'@id': ...}`` node."""
        return [{'@id': entry} for entry in licenses]
def main():
    """Print the translations of two hard-coded sample inputs.

    A str sample is handed to the mapping registered as "NpmMapping" and a
    bytes sample to the one registered as "MavenMapping"; both results are
    then printed, in that order.
    """
    npm_sample = """{"name": "test_name", "unknown_term": "ut"}"""
    maven_sample = b"""{"name": "test_name",
                        "unknown_term": "ut",
                        "prerequisites" :"packageXYZ"}"""
    translations = [
        MAPPINGS["NpmMapping"].translate(npm_sample),
        MAPPINGS["MavenMapping"].translate(maven_sample),
    ]
    for translated in translations:
        print(translated)
# Run the demo only when this module is executed as a script.
if __name__ == "__main__":
    main()
I'm surprised flake8 is happy about those `+`es