Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/metadata_dictionary.py
# Copyright (C) 2017 The Software Heritage developers | # Copyright (C) 2017 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import abc | import abc | ||||
import csv | |||||
import json | import json | ||||
import os.path | |||||
import logging | import logging | ||||
import swh.indexer | from swh.indexer.codemeta import CROSSWALK_TABLE, compact | ||||
CROSSWALK_TABLE_PATH = os.path.join(os.path.dirname(swh.indexer.__file__), | |||||
'data', 'codemeta', 'crosswalk.csv') | |||||
def read_crosstable(fd): | |||||
reader = csv.reader(fd) | |||||
try: | |||||
header = next(reader) | |||||
except StopIteration: | |||||
raise ValueError('empty file') | |||||
data_sources = set(header) - {'Parent Type', 'Property', | |||||
'Type', 'Description'} | |||||
assert 'codemeta-V1' in data_sources | |||||
codemeta_translation = {data_source: {} for data_source in data_sources} | |||||
for line in reader: # For each canonical name | |||||
canonical_name = dict(zip(header, line))['Property'] | |||||
for (col, value) in zip(header, line): # For each cell in the row | |||||
if col in data_sources: | |||||
# If that's not the parentType/property/type/description | |||||
for local_name in value.split('/'): | |||||
# For each of the data source's properties that maps | |||||
# to this canonical name | |||||
if local_name.strip(): | |||||
codemeta_translation[col][local_name.strip()] = \ | |||||
canonical_name | |||||
return codemeta_translation | |||||
with open(CROSSWALK_TABLE_PATH) as fd: | |||||
CROSSWALK_TABLE = read_crosstable(fd) | |||||
MAPPINGS = {} | MAPPINGS = {} | ||||
def register_mapping(cls): | def register_mapping(cls): | ||||
MAPPINGS[cls.__name__] = cls() | MAPPINGS[cls.__name__] = cls() | ||||
return cls | return cls | ||||
Show All 24 Lines | def detect_metadata_files(self, files): | ||||
- list of sha1 otherwise | - list of sha1 otherwise | ||||
""" | """ | ||||
pass | pass | ||||
@abc.abstractmethod | @abc.abstractmethod | ||||
def translate(self, file_content): | def translate(self, file_content): | ||||
pass | pass | ||||
def normalize_translation(self, metadata): | |||||
return compact(metadata) | |||||
class DictMapping(BaseMapping): | class DictMapping(BaseMapping): | ||||
"""Base class for mappings that take as input a file that is mostly | """Base class for mappings that take as input a file that is mostly | ||||
a key-value store (eg. a shallow JSON dict).""" | a key-value store (eg. a shallow JSON dict).""" | ||||
@property | @property | ||||
@abc.abstractmethod | @abc.abstractmethod | ||||
def mapping(self): | def mapping(self): | ||||
Show All 9 Lines | def translate_dict(self, content_dict): | ||||
content_dict (dict) | content_dict (dict) | ||||
Returns: | Returns: | ||||
dict: translated metadata in json-friendly form needed for | dict: translated metadata in json-friendly form needed for | ||||
the indexer | the indexer | ||||
""" | """ | ||||
translated_metadata = {} | translated_metadata = {} | ||||
default = 'other' | |||||
translated_metadata['other'] = {} | |||||
try: | |||||
for k, v in content_dict.items(): | for k, v in content_dict.items(): | ||||
try: | # First, check if there is a specific translation | ||||
term = self.mapping.get(k, default) | # method for this key | ||||
if term not in translated_metadata: | translation_method = getattr(self, 'translate_' + k, None) | ||||
translated_metadata[term] = v | if translation_method: | ||||
continue | translation_method(translated_metadata, v) | ||||
if isinstance(translated_metadata[term], str): | elif k in self.mapping: | ||||
in_value = translated_metadata[term] | # if there is no method, but the key is known from the | ||||
translated_metadata[term] = [in_value, v] | # crosswalk table | ||||
continue | |||||
if isinstance(translated_metadata[term], list): | # if there is a normalization method, use it on the value | ||||
translated_metadata[term].append(v) | normalization_method = getattr(self, 'normalize_' + k, None) | ||||
continue | if normalization_method: | ||||
if isinstance(translated_metadata[term], dict): | v = normalization_method(v) | ||||
translated_metadata[term][k] = v | |||||
continue | # set the translation metadata with the normalized value | ||||
except KeyError: | translated_metadata[self.mapping[k]] = v | ||||
self.log.exception( | return self.normalize_translation(translated_metadata) | ||||
"Problem during item mapping") | |||||
continue | |||||
except Exception: | |||||
raise | |||||
return None | |||||
return translated_metadata | |||||
class JsonMapping(DictMapping): | class JsonMapping(DictMapping): | ||||
"""Base class for all mappings that use a JSON file as input.""" | """Base class for all mappings that use a JSON file as input.""" | ||||
@property | @property | ||||
@abc.abstractmethod | @abc.abstractmethod | ||||
def filename(self): | def filename(self): | ||||
Show All 35 Lines | |||||
@register_mapping | @register_mapping | ||||
class NpmMapping(JsonMapping): | class NpmMapping(JsonMapping): | ||||
""" | """ | ||||
dedicated class for NPM (package.json) mapping and translation | dedicated class for NPM (package.json) mapping and translation | ||||
""" | """ | ||||
mapping = CROSSWALK_TABLE['NodeJS'] | mapping = CROSSWALK_TABLE['NodeJS'] | ||||
filename = b'package.json' | filename = b'package.json' | ||||
def normalize_repository(self, d): | |||||
return '{type}+{url}'.format(**d) | |||||
def normalize_bugs(self, d): | |||||
return '{url}'.format(**d) | |||||
@register_mapping | @register_mapping | ||||
class CodemetaMapping(JsonMapping): | class CodemetaMapping(JsonMapping): | ||||
""" | """ | ||||
dedicated class for CodeMeta (codemeta.json) mapping and translation | dedicated class for CodeMeta (codemeta.json) mapping and translation | ||||
""" | """ | ||||
mapping = CROSSWALK_TABLE['codemeta-V1'] | mapping = CROSSWALK_TABLE['codemeta-V1'] | ||||
filename = b'codemeta.json' | filename = b'codemeta.json' | ||||
Show All 16 Lines |