Changeset View
Standalone View
swh/indexer/metadata_dictionary.py
# Copyright (C) 2017 The Software Heritage developers | # Copyright (C) 2017 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import os | import os | ||||
import re | import re | ||||
import abc | import abc | ||||
import ast | |||||
import json | import json | ||||
import logging | import logging | ||||
import itertools | |||||
import email.parser | import email.parser | ||||
import xmltodict | import xmltodict | ||||
from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI | from swh.indexer.codemeta import CROSSWALK_TABLE, SCHEMA_URI | ||||
from swh.indexer.codemeta import compact, expand | from swh.indexer.codemeta import compact, expand | ||||
▲ Show 20 Lines • Show All 381 Lines • ▼ Show 20 Lines | class PythonPkginfoMapping(DictMapping, SingleFileMapping): | ||||
def normalize_home_page(self, urls):
    """Wrap each homepage URL into a JSON-LD ``@id`` node."""
    id_nodes = []
    for url in urls:
        id_nodes.append({'@id': url})
    return id_nodes
def normalize_license(self, licenses):
    """Wrap each license identifier into a JSON-LD ``@id`` node."""
    return list(map(lambda name: {'@id': name}, licenses))
def eval_ruby_expression(expr):
    """Best-effort evaluation of a simple Ruby literal expression.

    Only literals that happen to also be valid Python literals are
    supported: strings delimited with ``"`` or ``'``, and lists of such
    strings. Ruby's ``.freeze`` suffix is tolerated and discarded.
    Anything else yields ``None``.

    >>> eval_ruby_expression('"Foo bar"')
    'Foo bar'
    >>> eval_ruby_expression("'Foo bar'")
    'Foo bar'
    >>> eval_ruby_expression("['Foo', 'bar']")
    ['Foo', 'bar']
    >>> eval_ruby_expression("'Foo bar'.freeze")
    'Foo bar'
    >>> eval_ruby_expression("['Foo'.freeze, 'bar'.freeze]")
    ['Foo', 'bar']
    """
    # Drop Ruby's `.freeze` so the remainder can look like a Python literal.
    stripped = expr.replace('.freeze', '')
    try:
        # ast.literal_eval is rather good at parsing simple Ruby
        # expressions (mainly strings delimited with " or ', and lists
        # of such strings) and, crucially, never executes code.
        return ast.literal_eval(stripped)
    except (SyntaxError, ValueError):
        # Anything beyond Python-compatible literals is real Ruby code
        # that we cannot evaluate; signal failure with None.
        return None
Done Inline Actions
Unless I'm mistaken, the `match` var is not used, so why not:

    for line in lines:
        if self._re_spec_new.match(line):
            break

Another solution is using the `dropwhile` iterator (itertools), something like:

    lines = dropwhile(lambda x: not self._re_spec_new.match(x), lines.split('\n'))

and move the 'emptiness' detection to the end of the function. — douardda
@register_mapping
class GemspecMapping(DictMapping):
    """Extracts metadata from Ruby ``.gemspec`` files.

    A gemspec is a Ruby script; no Ruby interpreter is involved, so this
    mapping only recognizes simple ``spec.key = <literal>`` assignments
    inside the ``Gem::Specification.new do |...|`` block (see
    ``eval_ruby_expression`` for what counts as a literal).
    """
    # Matches the opening 'Gem::Specification.new do |spec|' line.
    _re_spec_new = re.compile(r'.*Gem::Specification.new do \|.*\|.*')
    # Matches a 'spec.key = expression' assignment line.
    _re_spec_entry = re.compile(r'\s*\w+\.(?P<key>\w+)\s*=\s*(?P<expr>.*)')

    mapping = CROSSWALK_TABLE['Ruby Gem']

    def detect_metadata_files(self, file_entries):
        """Return (at most) the sha1 of the first ``*.gemspec`` entry."""
        gemspec = next(
            (entry for entry in file_entries
             if entry['name'].endswith(b'.gemspec')),
            None)
        return [] if gemspec is None else [gemspec['sha1']]

    def translate(self, raw_content):
        """Parse ``spec.key = value`` assignments out of a gemspec and
        feed the resulting dict to ``self.translate_dict``.

        Returns ``None`` if the content is not valid UTF-8 or contains
        no ``Gem::Specification.new`` line.
        """
        try:
            content = raw_content.decode()
        except UnicodeDecodeError:
            self.log.warning('Error unidecoding %r', raw_content)
            return None

        # Skip everything up to and including the
        # 'Gem::Specification.new do |...|' opening line.
        remaining = itertools.dropwhile(
            lambda line: not self._re_spec_new.match(line),
            content.split('\n'))
        if next(remaining, None) is None:
            self.log.warning('Could not find Gem::Specification in %r',
                             content)
            return None

        parsed = {}
        for line in remaining:
            assignment = self._re_spec_entry.match(line)
            if assignment is None:
                continue
            value = eval_ruby_expression(assignment.group('expr'))
            # Falsy values (None, '', []) carry no metadata; drop them.
            if value:
                parsed[assignment.group('key')] = value
        return self.translate_dict(parsed)

    def normalize_homepage(self, s):
        """A homepage is a URL: wrap it as a JSON-LD ``@id`` node."""
        return {"@id": s}

    def normalize_license(self, s):
        """Map a single license name to an SPDX URI; non-strings are
        ignored (gemspec values are arbitrary untrusted input)."""
        if not isinstance(s, str):
            return None
        return [{"@id": "https://spdx.org/licenses/" + s}]

    def normalize_licenses(self, licenses):
        """Map a list of license names to SPDX URIs; non-list values and
        non-string elements are ignored."""
        if not isinstance(licenses, list):
            return None
        return [{"@id": "https://spdx.org/licenses/" + name}
                for name in licenses
                if isinstance(name, str)]

    def translate_author(self, translated_metadata, v):
        """Append a single author to the mapped property's ``@list``."""
        field = self.mapping['author']
        translated_metadata.setdefault(field, {"@list": []})["@list"].append(v)

    def translate_authors(self, translated_metadata, v):
        """Extend the mapped property's ``@list`` with several authors."""
        field = self.mapping['authors']
        translated_metadata.setdefault(field, {"@list": []})["@list"].extend(v)

    def translate_summary(self, translated_metadata, v):
        """Append the summary to the mapped property's value list."""
        field = self.mapping['summary']
        translated_metadata.setdefault(field, []).append(v)

    def translate_description(self, translated_metadata, v):
        """Append the description to the mapped property's value list."""
        field = self.mapping['description']
        translated_metadata.setdefault(field, []).append(v)
def main():
    """Ad-hoc smoke test: run two mappings on sample content and print
    the translated results."""
    raw_content = """{"name": "test_name", "unknown_term": "ut"}"""
    raw_content1 = b"""{"name": "test_name",
    "unknown_term": "ut",
    "prerequisites" :"packageXYZ"}"""
    # Translate both samples first, then print, preserving order.
    outputs = [
        MAPPINGS["NpmMapping"].translate(raw_content),
        MAPPINGS["MavenMapping"].translate(raw_content1),
    ]
    for output in outputs:
        print(output)


if __name__ == "__main__":
    main()
For all logging statements, shouldn't it make sense to have the id of the object being indexed? I've not checked how indexers work, so I'm not sure whether this id makes sense and is available at this level, but IMHO having warnings and error messages in the logs which we cannot easily link to a known entity (an origin, a revision, etc.) is pretty frustrating.