Changeset View
Standalone View
swh/indexer/metadata_dictionary.py
# Copyright (C) 2017 The Software Heritage developers | # Copyright (C) 2017 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import os | import os | ||||
import re | import re | ||||
import abc | import abc | ||||
import ast | |||||
import json | import json | ||||
import logging | import logging | ||||
import itertools | |||||
import email.parser | import email.parser | ||||
import xmltodict | import xmltodict | ||||
from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI | from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI | ||||
from swh.indexer.codemeta import compact, expand | from swh.indexer.codemeta import compact, expand | ||||
▲ Show 20 Lines • Show All 381 Lines • ▼ Show 20 Lines | class PythonPkginfoMapping(DictMapping, SingleFileMapping): | ||||
def normalize_home_page(self, urls):
    """Wrap each homepage URL into a JSON-LD ``@id`` node."""
    id_nodes = []
    for url in urls:
        id_nodes.append({'@id': url})
    return id_nodes
def normalize_license(self, licenses):
    """Wrap each license identifier into a JSON-LD ``@id`` node."""
    return list(map(lambda name: {'@id': name}, licenses))
def eval_ruby_expression(expr):
    """Best-effort evaluation of a simple Ruby literal expression.

    Only literals that happen to also be valid Python literals are
    supported: strings delimited with ``"`` or ``'``, and lists of such
    strings. Ruby's ``.freeze`` suffix is tolerated and discarded.
    Anything else yields ``None``.

    >>> eval_ruby_expression('"Foo bar"')
    'Foo bar'
    >>> eval_ruby_expression("'Foo bar'")
    'Foo bar'
    >>> eval_ruby_expression("['Foo', 'bar']")
    ['Foo', 'bar']
    >>> eval_ruby_expression("'Foo bar'.freeze")
    'Foo bar'
    >>> eval_ruby_expression("['Foo'.freeze, 'bar'.freeze]")
    ['Foo', 'bar']
    """
    # Drop Ruby's `.freeze` so the remainder can look like a Python literal.
    stripped = expr.replace('.freeze', '')
    try:
        # ast.literal_eval is rather good at parsing simple Ruby
        # expressions (mainly strings delimited with " or ', and lists
        # of such strings) and, crucially, never executes code.
        return ast.literal_eval(stripped)
    except (SyntaxError, ValueError):
        # Anything beyond Python-compatible literals is real Ruby code
        # that we cannot evaluate; signal failure with None.
        return None
Done Inline Actions
Unless I'm mistaken, the `match` var is not used, so why not:

    for line in lines:
        if self._re_spec_new.match(line):
            break

Another solution is using the `dropwhile` iterator (itertools), something like:

    lines = dropwhile(lambda x: not self._re_spec_new.match(x), lines.split('\n'))

and move the 'emptiness' detection to the end of the function. — douardda
@register_mapping
class GemspecMapping(DictMapping):
    """Extracts metadata from Ruby ``.gemspec`` files.

    A gemspec is a Ruby script; no Ruby interpreter is involved, so this
    mapping only recognizes simple ``spec.key = <literal>`` assignments
    inside the ``Gem::Specification.new do |...|`` block (see
    ``eval_ruby_expression`` for what counts as a literal).
    """
    # Matches the opening 'Gem::Specification.new do |spec|' line.
    _re_spec_new = re.compile(r'.*Gem::Specification.new do \|.*\|.*')
    # Matches a 'spec.key = expression' assignment line.
    _re_spec_entry = re.compile(r'\s*\w+\.(?P<key>\w+)\s*=\s*(?P<expr>.*)')

    mapping = CROSSWALK_TABLE['Ruby Gem']

    def detect_metadata_files(self, file_entries):
        """Return (at most) the sha1 of the first ``*.gemspec`` entry."""
        gemspec = next(
            (entry for entry in file_entries
             if entry['name'].endswith(b'.gemspec')),
            None)
        return [] if gemspec is None else [gemspec['sha1']]

    def translate(self, raw_content):
        """Parse ``spec.key = value`` assignments out of a gemspec and
        feed the resulting dict to ``self.translate_dict``.

        Returns ``None`` if the content is not valid UTF-8 or contains
        no ``Gem::Specification.new`` line.
        """
        try:
            content = raw_content.decode()
        except UnicodeDecodeError:
            self.log.warning('Error unidecoding %r', raw_content)
            return None

        # Skip everything up to and including the
        # 'Gem::Specification.new do |...|' opening line.
        remaining = itertools.dropwhile(
            lambda line: not self._re_spec_new.match(line),
            content.split('\n'))
        if next(remaining, None) is None:
            self.log.warning('Could not find Gem::Specification in %r',
                             content)
            return None

        parsed = {}
        for line in remaining:
            assignment = self._re_spec_entry.match(line)
            if assignment is None:
                continue
            value = eval_ruby_expression(assignment.group('expr'))
            # Falsy values (None, '', []) carry no metadata; drop them.
            if value:
                parsed[assignment.group('key')] = value
        return self.translate_dict(parsed)

    def normalize_homepage(self, s):
        """A homepage is a URL: wrap it as a JSON-LD ``@id`` node."""
        return {"@id": s}

    def normalize_license(self, s):
        """Map a single license name to an SPDX URI; non-strings are
        ignored (gemspec values are arbitrary untrusted input)."""
        if not isinstance(s, str):
            return None
        return [{"@id": "https://spdx.org/licenses/" + s}]

    def normalize_licenses(self, licenses):
        """Map a list of license names to SPDX URIs; non-list values and
        non-string elements are ignored."""
        if not isinstance(licenses, list):
            return None
        return [{"@id": "https://spdx.org/licenses/" + name}
                for name in licenses
                if isinstance(name, str)]

    def translate_author(self, translated_metadata, v):
        """Append a single author to the mapped property's ``@list``."""
        field = self.mapping['author']
        translated_metadata.setdefault(field, {"@list": []})["@list"].append(v)

    def translate_authors(self, translated_metadata, v):
        """Extend the mapped property's ``@list`` with several authors."""
        field = self.mapping['authors']
        translated_metadata.setdefault(field, {"@list": []})["@list"].extend(v)

    def translate_summary(self, translated_metadata, v):
        """Append the summary to the mapped property's value list."""
        field = self.mapping['summary']
        translated_metadata.setdefault(field, []).append(v)

    def translate_description(self, translated_metadata, v):
        """Append the description to the mapped property's value list."""
        field = self.mapping['description']
        translated_metadata.setdefault(field, []).append(v)
def main():
    """Ad-hoc smoke test: run two mappings on sample content and print
    the translated results."""
    raw_content = """{"name": "test_name", "unknown_term": "ut"}"""
    raw_content1 = b"""{"name": "test_name",
    "unknown_term": "ut",
    "prerequisites" :"packageXYZ"}"""
    # Translate both samples first, then print, preserving order.
    outputs = [
        MAPPINGS["NpmMapping"].translate(raw_content),
        MAPPINGS["MavenMapping"].translate(raw_content1),
    ]
    for output in outputs:
        print(output)


if __name__ == "__main__":
    main()
For all logging statements, shouldn't it make sense to have the id of the object being indexed? I've not checked how indexers work, so I'm not sure whether this id makes sense and is available at this level, but IMHO having warnings and error messages in the logs which we cannot easily link to a known entity (an origin, a revision, etc.) is pretty frustrating.