diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py --- a/swh/indexer/codemeta.py +++ b/swh/indexer/codemeta.py @@ -6,6 +6,7 @@ import csv import json import os.path +import re import swh.indexer from pyld import jsonld @@ -34,6 +35,8 @@ SCHEMA_URI + 'creator', } +_codemeta_field_separator = re.compile(r'\s*[,/]\s*') + def make_absolute_uri(local_name): definition = CODEMETA_CONTEXT['@context'][local_name] @@ -76,7 +79,7 @@ for (col, value) in zip(header, line): # For each cell in the row if col in data_sources: # If that's not the parentType/property/type/description - for local_name in value.split('/'): + for local_name in _codemeta_field_separator.split(value): # For each of the data source's properties that maps # to this canonical name if local_name.strip(): diff --git a/swh/indexer/data/codemeta/crosswalk.csv b/swh/indexer/data/codemeta/crosswalk.csv --- a/swh/indexer/data/codemeta/crosswalk.csv +++ b/swh/indexer/data/codemeta/crosswalk.csv @@ -18,7 +18,7 @@ schema:SoftwareApplication,softwareVersion,Text,Version of the software instance.,,,,,,,,,,,,,,,,,,,,,release,software version, schema:SoftwareApplication,storageRequirements,Text or URL,Storage requirements (free space required).,,,,,,,,,,,,,,,,,,,,,,, schema:SoftwareApplication,supportingData,DataFeed,Supporting data for a SoftwareApplication.,,,,,,,,,,,,,,,,,,,,,,, -schema:CreativeWork,author,Organization or Person,The author of this content or rating. Please note that author is special in that HTML 5 provides a special mechanism for indicating authorship via the rel tag. That is equivalent to this and may be used interchangeably.,agents,creators,,creators,login,,,,,[aut] in Author,,author,Author,,,author,,,author,,developer,,authors +schema:CreativeWork,author,Organization or Person,The author of this content or rating. Please note that author is special in that HTML 5 provides a special mechanism for indicating authorship via the rel tag. 
class _RubyFreezeStripper(ast.NodeTransformer):
    """Rewrite every ``<expr>.freeze`` attribute access into ``<expr>``."""

    def visit_Attribute(self, node):
        # Visit children first so nested/chained `.freeze` are removed too.
        self.generic_visit(node)
        if node.attr == 'freeze':
            return node.value
        return node


def eval_ruby_expression(expr):
    """Very simple evaluator of Ruby expressions.

    Only literals that Ruby and Python spell the same way are supported
    (mainly strings delimited with " or ', and lists of such strings),
    optionally followed by Ruby's ``.freeze``.

    Args:
        expr (str): source text of the Ruby expression.

    Returns:
        The evaluated Python value, or None if the expression is not a
        literal that could be evaluated.

    >>> eval_ruby_expression('"Foo bar"')
    'Foo bar'
    >>> eval_ruby_expression("'Foo bar'")
    'Foo bar'
    >>> eval_ruby_expression("['Foo', 'bar']")
    ['Foo', 'bar']
    >>> eval_ruby_expression("'Foo bar'.freeze")
    'Foo bar'
    >>> eval_ruby_expression("['Foo'.freeze, 'bar'.freeze]")
    ['Foo', 'bar']
    >>> eval_ruby_expression("'v1.freeze-rc'")
    'v1.freeze-rc'
    >>> eval_ruby_expression("File.read('VERSION')") is None
    True
    """
    try:
        # We're parsing Ruby expressions here, but Python's parser is
        # rather good at parsing simple Ruby literals.
        tree = ast.parse(expr.strip(), mode='eval')
        # Drop `.freeze` on the syntax tree rather than on the text, so
        # that a '.freeze' occurring *inside* a string literal is kept.
        tree = _RubyFreezeStripper().visit(tree)
        return ast.literal_eval(tree)
    except (SyntaxError, ValueError, TypeError,
            RecursionError, MemoryError):
        # Obviously, this won't work on arbitrary Ruby code.
        # TypeError: e.g. an unhashable dict key or set member;
        # RecursionError/MemoryError: pathologically nested input.
        return None
@register_mapping
class GemspecMapping(DictMapping):
    """Mapping for Ruby ``.gemspec`` files.

    The file is not executed: only lines of the form
    ``<receiver>.<key> = <literal>`` found after the
    ``Gem::Specification.new do |<receiver>|`` line are read, and only
    when the right-hand side is a literal that ``eval_ruby_expression``
    can evaluate.
    """
    # Line opening the specification block.
    _re_spec_new = re.compile(r'.*Gem::Specification.new do \|.*\|.*')
    # `<receiver>.<key> = <expr>`; translate() reads the two named groups.
    _re_spec_entry = re.compile(r'\s*\w+\.(?P<key>\w+)\s*=\s*(?P<expr>.*)')

    mapping = CROSSWALK_TABLE['Ruby Gem']

    def detect_metadata_files(self, file_entries):
        """Return the sha1 of the first entry whose name ends with
        ``.gemspec`` as a one-element list, or an empty list."""
        for entry in file_entries:
            if entry['name'].endswith(b'.gemspec'):
                return [entry['sha1']]
        return []

    def translate(self, raw_content):
        """Parse the bytes of a gemspec and translate the fields found.

        Returns None (after logging a warning) if the content cannot be
        decoded or contains no ``Gem::Specification.new`` line; otherwise
        returns whatever the inherited ``translate_dict`` returns for the
        collected ``{key: value}`` dict.
        """
        try:
            raw_content = raw_content.decode()
        except UnicodeDecodeError:
            self.log.warning('Error unidecoding %r', raw_content)
            return

        # Skip lines before 'Gem::Specification.new'
        lines = itertools.dropwhile(
            lambda x: not self._re_spec_new.match(x),
            raw_content.split('\n'))

        try:
            next(lines)  # Consume 'Gem::Specification.new'
        except StopIteration:
            self.log.warning('Could not find Gem::Specification in %r',
                             raw_content)
            return

        content_dict = {}
        for line in lines:
            match = self._re_spec_entry.match(line)
            if match:
                value = eval_ruby_expression(match.group('expr'))
                # Unparseable expressions (None) and empty values are
                # dropped.
                if value:
                    content_dict[match.group('key')] = value
        return self.translate_dict(content_dict)

    def normalize_homepage(self, s):
        # Same guard as normalize_license: a non-string value (e.g. a
        # list) cannot be used as an @id.
        if isinstance(s, str):
            return {"@id": s}

    def normalize_license(self, s):
        if isinstance(s, str):
            return [{"@id": "https://spdx.org/licenses/" + s}]

    def normalize_licenses(self, licenses):
        if isinstance(licenses, list):
            return [{"@id": "https://spdx.org/licenses/" + license}
                    for license in licenses
                    if isinstance(license, str)]

    def _add_authors(self, translated_metadata, k, v):
        """Append the author name(s) in `v` (a string or a list of
        strings) to the ``@list`` stored under key `k`; values of any
        other type are ignored."""
        if isinstance(v, str):
            names = [v]
        elif isinstance(v, list):
            names = [name for name in v if isinstance(name, str)]
        else:
            names = []
        if names:
            translated_metadata.setdefault(
                k, {"@list": []})["@list"].extend(names)

    def translate_author(self, translated_metadata, v):
        k = self.mapping['author']
        self._add_authors(translated_metadata, k, v)

    def translate_authors(self, translated_metadata, v):
        # `v` may be a bare string (`s.authors = "Name"`); extending a
        # list with it directly would add one entry per character.
        k = self.mapping['authors']
        self._add_authors(translated_metadata, k, v)

    def translate_summary(self, translated_metadata, v):
        k = self.mapping['summary']
        translated_metadata.setdefault(k, []).append(v)

    def translate_description(self, translated_metadata, v):
        k = self.mapping['description']
        translated_metadata.setdefault(k, []).append(v)
def test_gemspec_base(self):
    """A typical gemspec is translated into the expected document."""
    gemspec = b"""
Gem::Specification.new do |s|
  s.name = 'example'
  s.version = '0.1.0'
  s.licenses = ['MIT']
  s.summary = "This is an example!"
  s.description = "Much longer explanation of the example!"
  s.authors = ["Ruby Coder"]
  s.email = 'rubycoder@example.com'
  s.files = ["lib/example.rb"]
  s.homepage = 'https://rubygems.org/gems/example'
  s.metadata = { "source_code_uri" => "https://github.com/example/example" }
end"""
    translated = MAPPINGS['GemspecMapping'].translate(gemspec)

    # Summary and description end up under the same key, in no
    # guaranteed order, so they are compared separately.
    expected_descriptions = [
        "This is an example!",
        "Much longer explanation of the example!"
    ]
    self.assertCountEqual(translated.pop('description'),
                          expected_descriptions)

    expected_remainder = {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'type': 'SoftwareSourceCode',
        'author': ['Ruby Coder'],
        'name': 'example',
        'license': 'https://spdx.org/licenses/MIT',
        'codeRepository': 'https://rubygems.org/gems/example',
        'email': 'rubycoder@example.com',
        'version': '0.1.0',
    }
    self.assertEqual(translated, expected_remainder)


def test_gemspec_two_author_fields(self):
    """Both `authors` and `author` contribute to the author list."""
    gemspec = b"""
Gem::Specification.new do |s|
  s.authors = ["Ruby Coder1"]
  s.author = "Ruby Coder2"
end"""
    translated = MAPPINGS['GemspecMapping'].translate(gemspec)

    expected_authors = ['Ruby Coder1', 'Ruby Coder2']
    self.assertCountEqual(translated.pop('author'), expected_authors)

    expected_remainder = {
        '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
        'type': 'SoftwareSourceCode',
    }
    self.assertEqual(translated, expected_remainder)