Changeset View
Standalone View
swh/indexer/metadata_dictionary.py
- This file was added.
# Copyright (C) 2017 The Software Heritage developers
ardumont: 2017 | |||||
Not Done Inline Actionsack moranegg: ack | |||||
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json | |||||
def convert(raw_content):
    """
    Normalise raw file content into a Python object.

    Bytes are decoded to text, then text is parsed as JSON.

    Args:
        - raw_content (bytes / string / dict)

    Returns:
        - the parsed JSON value when the text is valid JSON and that value
          is truthy
        - the (decoded) text itself when it is not valid JSON, or when it
          parses to a falsy value (e.g. "{}")
        - the dict unchanged when a dict is given
        - None for any other input type
    """
    if isinstance(raw_content, dict):
        return raw_content
    if isinstance(raw_content, bytes):
        # decode with the default codec, then go through the text branch
        return convert(raw_content.decode())
    if not isinstance(raw_content, str):
        return None

    try:
        parsed = json.loads(raw_content)
    except json.decoder.JSONDecodeError:
        return raw_content
    return parsed if parsed else raw_content
class BaseMapping:
    """Base class for mappings to inherit from

    To implement a new mapping:
    - inherit this class
    - define a class attribute ``mapping`` (source term -> translated term)
    - override translate() to turn the raw content into a dict, then
      delegate to this class' translate()
    """

    # Default vocabulary: with no entries, every source key is filed under
    # 'other'. Subclasses replace this with their own dict.
    mapping = {}

    def translate(self, content_dict):
        """
        Translates a dict of source terms into a dict of mapped terms,
        using self.mapping.

        Keys absent from self.mapping are collected, untranslated, in the
        'other' sub-dict. When several source keys map to the same term,
        values are merged according to the type of the value stored first:
        - str: becomes a list [first, new]
        - list: the new value is appended
        - dict: the new value is stored under its source key
        - anything else: the first value is kept, later ones are dropped

        Args:
            - content_dict (dict): parsed content, keyed by source terms

        Returns:
            - translated_metadata (dict): jsonb form needed for the indexer,
              or None when content_dict is not dict-like (e.g. the raw
              content could not be parsed)
        """
        default = 'other'
        translated_metadata = {default: {}}

        try:
            items = content_dict.items()
        except AttributeError:
            # Not dict-like: nothing to translate (best effort, no crash).
            return None

        for k, v in items:
            term = self.mapping.get(k, default)

            if term not in translated_metadata:
                # Shallow-copy containers so that merging later values into
                # them never mutates the caller's input.
                if isinstance(v, (list, dict)):
                    v = v.copy()
                translated_metadata[term] = v
                continue

            current = translated_metadata[term]
            if isinstance(current, str):
                translated_metadata[term] = [current, v]
            elif isinstance(current, list):
                current.append(v)
            elif isinstance(current, dict):
                current[k] = v
            # any other type: keep the first value

        return translated_metadata
class NpmMapping(BaseMapping):
    """
    dedicated class for NPM (package.json) mapping and translation

    ``mapping`` associates top-level package.json keys with the terms used
    in the translated metadata.
    """
    mapping = {
        'repository': 'codeRepository',
        'os': 'operatingSystem',
        'cpu': 'processorRequirements',
        'engines': 'processorRequirements',
        'dependencies': 'softwareRequirements',
        'bundleDependencies': 'softwareRequirements',
        # alternative spelling accepted by npm for bundleDependencies
        'bundledDependencies': 'softwareRequirements',
        'peerDependencies': 'softwareRequirements',
        'author': 'author',
        'contributor': 'contributor',
        # package.json spells this field in the plural
        'contributors': 'contributor',
        'keywords': 'keywords',
        'license': 'license',
        'version': 'version',
        'description': 'description',
        'name': 'name',
        'devDependencies': 'softwareSuggestions',
        'optionalDependencies': 'softwareSuggestions',
        'bugs': 'issueTracker',
        'homepage': 'url',
    }

    def translate(self, raw_content):
        """
        Parses the raw package.json content and translates its terms.

        Args:
            - raw_content (bytes / string / dict): content of package.json

        Returns:
            - translated_metadata (dict), as returned by
              BaseMapping.translate
        """
        content_dict = convert(raw_content)
        return super().translate(content_dict)
Not Done Inline ActionsIf that code is the same on all derived classes, this could prove it belongs to the BaseMapping's translate definition. ardumont: If that code is the same on all derived classes, this could prove it belongs to the… | |||||
Not Done Inline Actions: now it's the same because I'm lazy and I haven't started working on other files, but in cases where the file doesn't contain JSON it should be decoded and/or parsed differently, which is why I didn't keep it in the translate method. Also, I wanted to keep only one task in the translate method, which is the semantic translation from a dict with original terms to a dict with CodeMeta terms. moranegg: now it's the same because I'm lazy and I haven't started working on other files, but in cases…
Not Done Inline Actions
sure, aren't we all in some form or another? :) ardumont: > now it's the same because I'm lazy...
sure, aren't we all in some form or another? :)
| |||||
class MavenMapping(BaseMapping):
    """
    dedicated class for Maven (pom.xml) mapping and translation

    ``mapping`` associates top-level POM element names with the terms used
    in the translated metadata.
    """
    mapping = {
        'license': 'license',
        # the POM element is spelled in the plural
        'licenses': 'license',
        'version': 'version',
        'description': 'description',
        'name': 'name',
        'prerequisites': 'softwareRequirements',
        'repositories': 'codeRepository',
        'groupId': 'identifier',
        'ciManagement': 'contIntegration',
        # actual POM element name
        'issueManagement': 'issueTracker',
        # historical misspelling, kept so existing inputs still translate
        'issuesManagement': 'issueTracker',
    }

    def translate(self, raw_content):
        """
        Converts the raw content and translates its terms.

        Args:
            - raw_content (bytes / string / dict): content of pom.xml

        Returns:
            - translated_metadata (dict), as returned by
              BaseMapping.translate
        """
        content = convert(raw_content)
        # TODO: parse content from xml to dict; until then only JSON or
        # dict input yields a translation
        return super().translate(content)
class DoapMapping(BaseMapping):
    """
    dedicated class for DOAP (doap.xml) mapping and translation

    The mapping is still empty, so every source key ends up under 'other'.
    """
    mapping = {
    }

    def translate(self, raw_content):
        """
        Converts the raw content and translates its terms.

        Args:
            - raw_content (bytes / string / dict): content of the DOAP file

        Returns:
            - translated_metadata (dict), as returned by
              BaseMapping.translate
        """
        content = convert(raw_content)
        # TODO: parse content from xml to dict
        return super().translate(content)
def parse_xml(content):
    """
    Placeholder for the xml -> python dict conversion.

    No parsing is implemented yet: the argument is handed back untouched.

    Args:
        - content (text): the string form of the raw_content (in xml)

    Returns:
        - parsed_xml: currently the very same object that was passed in
    """
    # TODO: check that content is xml, then run an xml parser to build a dict
    parsed_xml = content
    return parsed_xml
# Registry used by compute_metadata: context name -> mapping instance.
mapping_tool_fn = {
    context_name: mapping_class()
    for context_name, mapping_class in (
        ("hard_mapping_npm", NpmMapping),
        ("pom_xml", MavenMapping),
        ("doap_xml", DoapMapping),
    )
}
def compute_metadata(context, raw_content):
    """
    first landing method: a dispatcher that sends content
    to the right mapping to carry out the real parsing of syntax
    and translation of terms

    Args:
        - context (text): key of mapping_tool_fn selecting the mapping
          the content is sent to
        - raw_content (bytes / text): the raw content to translate

    Returns:
        - translated_metadata (dict): jsonb form needed for the indexer
          to store in storage; None when raw_content is None or empty

    Raises:
        - KeyError: when context is not a key of mapping_tool_fn
    """
    if raw_content is None:
        return None
    # Compare by value, not identity: `is b""` only works by accident of
    # the interpreter's object caching.
    if isinstance(raw_content, (bytes, str)) and not raw_content:
        return None

    # TODO: keep mapping not in code (maybe fetch crosswalk from storage?)
    # if fetched from storage should be done once for batch of sha1s
    dictionary = mapping_tool_fn[context]
    return dictionary.translate(raw_content)
def main():
    """Demo run: translate two sample contents and print each result."""
    samples = (
        ("hard_mapping_npm",
         """{"name": "test_name", "unknown_term": "ut"}"""),
        ("pom_xml",
         b"""{"name": "test_name",
                "unknown_term": "ut",
                "prerequisites" :"packageXYZ"}"""),
    )
    # compute everything first, then print, in sample order
    results = [compute_metadata(context, raw) for context, raw in samples]
    for translated in results:
        print(translated)


if __name__ == "__main__":
    main()
2017