Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/metadata_dictionary.py
# Copyright (C) 2017 The Software Heritage developers | # Copyright (C) 2017 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import abc | import abc | ||||
import csv | |||||
import json | import json | ||||
import os.path | |||||
import logging | import logging | ||||
import swh.indexer | from swh.indexer.codemeta import CROSSWALK_TABLE, compact | ||||
CROSSWALK_TABLE_PATH = os.path.join(os.path.dirname(swh.indexer.__file__), | |||||
'data', 'codemeta', 'crosswalk.csv') | |||||
def read_crosstable(fd): | |||||
reader = csv.reader(fd) | |||||
try: | |||||
header = next(reader) | |||||
except StopIteration: | |||||
raise ValueError('empty file') | |||||
data_sources = set(header) - {'Parent Type', 'Property', | |||||
'Type', 'Description'} | |||||
assert 'codemeta-V1' in data_sources | |||||
codemeta_translation = {data_source: {} for data_source in data_sources} | |||||
for line in reader: # For each canonical name | |||||
canonical_name = dict(zip(header, line))['Property'] | |||||
for (col, value) in zip(header, line): # For each cell in the row | |||||
if col in data_sources: | |||||
# If that's not the parentType/property/type/description | |||||
for local_name in value.split('/'): | |||||
# For each of the data source's properties that maps | |||||
# to this canonical name | |||||
if local_name.strip(): | |||||
codemeta_translation[col][local_name.strip()] = \ | |||||
canonical_name | |||||
return codemeta_translation | |||||
with open(CROSSWALK_TABLE_PATH) as fd: | |||||
CROSSWALK_TABLE = read_crosstable(fd) | |||||
MAPPINGS = {} | MAPPINGS = {} | ||||
def register_mapping(cls): | def register_mapping(cls): | ||||
MAPPINGS[cls.__name__] = cls() | MAPPINGS[cls.__name__] = cls() | ||||
return cls | return cls | ||||
Show All 24 Lines | def detect_metadata_files(self, files): | ||||
- list of sha1 otherwise | - list of sha1 otherwise | ||||
""" | """ | ||||
pass | pass | ||||
@abc.abstractmethod | @abc.abstractmethod | ||||
def translate(self, file_content): | def translate(self, file_content): | ||||
pass | pass | ||||
def normalize_translation(self, metadata): | |||||
return compact(metadata) | |||||
class DictMapping(BaseMapping): | class DictMapping(BaseMapping): | ||||
"""Base class for mappings that take as input a file that is mostly | """Base class for mappings that take as input a file that is mostly | ||||
a key-value store (eg. a shallow JSON dict).""" | a key-value store (eg. a shallow JSON dict).""" | ||||
value_format = {} | |||||
def normalize_value(self, key, term, value): | |||||
if isinstance(value, list): | |||||
return [self.normalize_value(key, term, v) for v in value] | |||||
elif isinstance(value, dict): | |||||
return self.value_format[term].format(**value) | |||||
else: | |||||
return value | |||||
@property | @property | ||||
@abc.abstractmethod | @abc.abstractmethod | ||||
def mapping(self): | def mapping(self): | ||||
"""A translation dict to map dict keys into a canonical name.""" | """A translation dict to map dict keys into a canonical name.""" | ||||
pass | pass | ||||
def translate_dict(self, content_dict): | def translate_dict(self, content_dict): | ||||
""" | """ | ||||
Show All 10 Lines | def translate_dict(self, content_dict): | ||||
""" | """ | ||||
translated_metadata = {} | translated_metadata = {} | ||||
default = 'other' | default = 'other' | ||||
translated_metadata['other'] = {} | translated_metadata['other'] = {} | ||||
try: | try: | ||||
for k, v in content_dict.items(): | for k, v in content_dict.items(): | ||||
try: | try: | ||||
term = self.mapping.get(k, default) | term = self.mapping.get(k, default) | ||||
v = self.normalize_value(k, term, v) | |||||
if term not in translated_metadata: | if term not in translated_metadata: | ||||
translated_metadata[term] = v | translated_metadata[term] = v | ||||
continue | continue | ||||
if isinstance(translated_metadata[term], str): | if isinstance(translated_metadata[term], str): | ||||
in_value = translated_metadata[term] | in_value = translated_metadata[term] | ||||
translated_metadata[term] = [in_value, v] | translated_metadata[term] = [in_value, v] | ||||
continue | continue | ||||
if isinstance(translated_metadata[term], list): | if isinstance(translated_metadata[term], list): | ||||
translated_metadata[term].append(v) | translated_metadata[term].append(v) | ||||
continue | continue | ||||
if isinstance(translated_metadata[term], dict): | if isinstance(translated_metadata[term], dict): | ||||
translated_metadata[term][k] = v | translated_metadata[term][k] = v | ||||
continue | continue | ||||
except KeyError: | except KeyError: | ||||
self.log.exception( | self.log.exception( | ||||
"Problem during item mapping") | "Problem during item mapping") | ||||
continue | continue | ||||
except Exception: | except Exception: | ||||
raise | raise | ||||
return None | return None | ||||
return translated_metadata | return self.normalize_translation(translated_metadata) | ||||
class JsonMapping(DictMapping): | class JsonMapping(DictMapping): | ||||
"""Base class for all mappings that use a JSON file as input.""" | """Base class for all mappings that use a JSON file as input.""" | ||||
@property | @property | ||||
@abc.abstractmethod | @abc.abstractmethod | ||||
def filename(self): | def filename(self): | ||||
Show All 35 Lines | |||||
@register_mapping | @register_mapping | ||||
class NpmMapping(JsonMapping): | class NpmMapping(JsonMapping): | ||||
""" | """ | ||||
dedicated class for NPM (package.json) mapping and translation | dedicated class for NPM (package.json) mapping and translation | ||||
""" | """ | ||||
mapping = CROSSWALK_TABLE['NodeJS'] | mapping = CROSSWALK_TABLE['NodeJS'] | ||||
filename = b'package.json' | filename = b'package.json' | ||||
value_format = { | |||||
'https://codemeta.github.io/terms/codeRepository': | |||||
'{type}+{url}', | |||||
'https://codemeta.github.io/terms/issueTracker': | |||||
'{url}', | |||||
} | |||||
@register_mapping | @register_mapping | ||||
class CodemetaMapping(JsonMapping): | class CodemetaMapping(JsonMapping): | ||||
""" | """ | ||||
dedicated class for CodeMeta (codemeta.json) mapping and translation | dedicated class for CodeMeta (codemeta.json) mapping and translation | ||||
""" | """ | ||||
mapping = CROSSWALK_TABLE['codemeta-V1'] | mapping = CROSSWALK_TABLE['codemeta-V1'] | ||||
filename = b'codemeta.json' | filename = b'codemeta.json' | ||||
Show All 16 Lines |