diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py
--- a/swh/indexer/codemeta.py
+++ b/swh/indexer/codemeta.py
@@ -6,6 +6,7 @@
 import csv
 import json
 import os.path
+import re
 
 import swh.indexer
 from pyld import jsonld
@@ -34,6 +35,10 @@
     SCHEMA_URI + 'creator',
     }
 
+# Separator between several local names listed in one crosswalk cell:
+# a comma or a slash, optionally surrounded by whitespace.
+_codemeta_field_separator = re.compile(r'\s*[,/]\s*')
+
 
 def make_absolute_uri(local_name):
     definition = CODEMETA_CONTEXT['@context'][local_name]
@@ -76,7 +81,7 @@
         for (col, value) in zip(header, line):  # For each cell in the row
             if col in data_sources:
                 # If that's not the parentType/property/type/description
-                for local_name in value.split('/'):
+                for local_name in _codemeta_field_separator.split(value):
                     # For each of the data source's properties that maps
                     # to this canonical name
                     if local_name.strip():
diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py
--- a/swh/indexer/metadata_dictionary.py
+++ b/swh/indexer/metadata_dictionary.py
@@ -6,8 +6,10 @@
 import os
 import re
 import abc
+import ast
 import json
 import logging
+import itertools
 import email.parser
 import xml.parsers.expat
 
@@ -471,6 +473,147 @@
         return [{'@id': license} for license in licenses]
 
 
+@register_mapping
+class GemspecMapping(DictMapping):
+    """Mapping for Ruby ``.gemspec`` files, using the 'Ruby Gem' crosswalk.
+
+    The gemspec is never executed. Only lines that look like
+    ``VAR.KEY = EXPR`` and that follow the
+    ``Gem::Specification.new do |...|`` line are read, and an entry is kept
+    only when ``EXPR`` is a string literal or a list of string literals.
+    """
+    # Opening line of the specification block.
+    _re_spec_new = re.compile(r'.*Gem::Specification.new do \|.*\|.*')
+    # Assignment line, e.g. "s.name = 'example'": 'key' is the attribute
+    # name and 'expr' the right-hand side; translate() reads both groups.
+    _re_spec_entry = re.compile(r'\s*\w+\.(?P<key>\w+)\s*=\s*(?P<expr>.*)')
+
+    mapping = CROSSWALK_TABLE['Ruby Gem']
+
+    def detect_metadata_files(self, file_entries):
+        """Return ``[sha1]`` for the first entry whose name ends with
+        ``.gemspec``, or an empty list if there is none."""
+        for entry in file_entries:
+            if entry['name'].endswith(b'.gemspec'):
+                return [entry['sha1']]
+        return []
+
+    def translate(self, raw_content):
+        """Translate the bytes of a gemspec file.
+
+        Returns the result of ``translate_dict`` on the entries that could
+        be evaluated, or None (after logging a warning) if the content
+        cannot be decoded or contains no specification line.
+        """
+        try:
+            raw_content = raw_content.decode()
+        except UnicodeDecodeError:
+            self.log.warning('Error unidecoding %r', raw_content)
+            return
+
+        # Skip lines before 'Gem::Specification.new'
+        lines = itertools.dropwhile(
+            lambda x: not self._re_spec_new.match(x),
+            raw_content.split('\n'))
+
+        try:
+            next(lines)  # Consume 'Gem::Specification.new'
+        except StopIteration:
+            self.log.warning('Could not find Gem::Specification in %r',
+                             raw_content)
+            return
+
+        content_dict = {}
+        for line in lines:
+            match = self._re_spec_entry.match(line)
+            if match:
+                value = self.eval_ruby_expression(match.group('expr'))
+                if value:
+                    content_dict[match.group('key')] = value
+        return self.translate_dict(content_dict)
+
+    def eval_ruby_expression(self, expr):
+        """Very simple evaluator of Ruby expressions.
+
+        Returns the string, or list of strings, written in ``expr``; None if
+        ``expr`` is not supported.
+
+        >>> GemspecMapping().eval_ruby_expression('"Foo bar"')
+        'Foo bar'
+        >>> GemspecMapping().eval_ruby_expression("'Foo bar'")
+        'Foo bar'
+        >>> GemspecMapping().eval_ruby_expression("['Foo', 'bar']")
+        ['Foo', 'bar']
+        >>> GemspecMapping().eval_ruby_expression("'Foo bar'.freeze")
+        'Foo bar'
+        >>> GemspecMapping().eval_ruby_expression( \
+                "['Foo'.freeze, 'bar'.freeze]")
+        ['Foo', 'bar']
+        """
+        def evaluator(node):
+            # NOTE(review): ast.Str is deprecated since Python 3.8 in favour
+            # of ast.Constant; revisit once older interpreters are dropped.
+            if isinstance(node, ast.Str):
+                return node.s
+            elif isinstance(node, ast.List):
+                res = []
+                for element in node.elts:
+                    val = evaluator(element)
+                    if not val:
+                        # An empty or unsupported element rejects the list.
+                        return
+                    res.append(val)
+                return res
+
+        expr = expr.replace('.freeze', '')
+        try:
+            # We're parsing Ruby expressions here, but Python's
+            # ast.parse works for very simple Ruby expressions
+            # (mainly strings delimited with " or ', and lists
+            # of such strings).
+            tree = ast.parse(expr, mode='eval')
+        except (SyntaxError, ValueError):
+            return
+        if isinstance(tree, ast.Expression):
+            return evaluator(tree.body)
+
+    def normalize_homepage(self, s):
+        """Wrap the homepage value in an ``@id`` dict."""
+        return {"@id": s}
+
+    def normalize_license(self, s):
+        """For a string, return its SPDX URL as ``[{"@id": url}]``."""
+        if isinstance(s, str):
+            return [{"@id": "https://spdx.org/licenses/" + s}]
+
+    def normalize_licenses(self, licenses):
+        """For a list, return one SPDX ``@id`` dict per string element."""
+        if isinstance(licenses, list):
+            return [{"@id": "https://spdx.org/licenses/" + license}
+                    for license in licenses
+                    if isinstance(license, str)]
+
+    def translate_author(self, translated_metadata, v):
+        """Append ``v`` to the ``@list`` stored under the mapped key."""
+        k = self.mapping['author']
+        translated_metadata.setdefault(k, {"@list": []})["@list"].append(v)
+
+    def translate_authors(self, translated_metadata, v):
+        """Extend the ``@list`` stored under the mapped key with ``v``."""
+        k = self.mapping['authors']
+        translated_metadata.setdefault(k, {"@list": []})["@list"].extend(v)
+
+    def translate_summary(self, translated_metadata, v):
+        """Append ``v`` to the list stored under the mapped key."""
+        k = self.mapping['summary']
+        translated_metadata.setdefault(k, []).append(v)
+
+    def translate_description(self, translated_metadata, v):
+        """Append ``v`` to the list stored under the mapped key."""
+        k = self.mapping['description']
+        translated_metadata.setdefault(k, []).append(v)
+
+
 def main():
     raw_content = """{"name": "test_name", "unknown_term": "ut"}"""
     raw_content1 = b"""{"name": "test_name",
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -727,6 +727,61 @@
             'license': 'MIT',
         })
 
+    def test_gemspec_base(self):
+        raw_content = b"""
+Gem::Specification.new do |s|
+  s.name = 'example'
+  s.version = '0.1.0'
+  s.licenses = ['MIT']
+  s.summary = "This is an example!"
+  s.description = "Much longer explanation of the example!"
+  s.authors = ["Ruby Coder"]
+  s.email = 'rubycoder@example.com'
+  s.files = ["lib/example.rb"]
+  s.homepage = 'https://rubygems.org/gems/example'
+  s.metadata = { "source_code_uri" => "https://github.com/example/example" }
+end"""
+        result = MAPPINGS['GemspecMapping'].translate(raw_content)
+        self.assertCountEqual(result.pop('description'), [
+            "This is an example!",
+            "Much longer explanation of the example!"
+        ])
+        self.assertEqual(result, {
+            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+            'type': 'SoftwareSourceCode',
+            'author': ['Ruby Coder'],
+            'name': 'example',
+            'license': 'https://spdx.org/licenses/MIT',
+            'codeRepository': 'https://rubygems.org/gems/example',
+            'email': 'rubycoder@example.com',
+            'version': '0.1.0',
+        })
+
+    def test_gemspec_two_author_fields(self):
+        raw_content = b"""
+Gem::Specification.new do |s|
+  s.authors = ["Ruby Coder1"]
+  s.author = "Ruby Coder2"
+end"""
+        result = MAPPINGS['GemspecMapping'].translate(raw_content)
+        self.assertCountEqual(result.pop('author'), [
+            'Ruby Coder1', 'Ruby Coder2'])
+        self.assertEqual(result, {
+            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+            'type': 'SoftwareSourceCode',
+        })
+
+    def test_gemspec_invalid_author(self):
+        raw_content = b"""
+Gem::Specification.new do |s|
+  s.author = "Ruby Coder1",
+end"""
+        result = MAPPINGS['GemspecMapping'].translate(raw_content)
+        self.assertEqual(result, {
+            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+            'type': 'SoftwareSourceCode',
+        })
+
     def test_revision_metadata_indexer(self):
         metadata_indexer = RevisionMetadataTestIndexer()
         fill_obj_storage(metadata_indexer.objstorage)