diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py --- a/swh/indexer/codemeta.py +++ b/swh/indexer/codemeta.py @@ -6,6 +6,7 @@ import csv import json import os.path +import re import swh.indexer from pyld import jsonld @@ -34,6 +35,8 @@ SCHEMA_URI + 'creator', } +_codemeta_field_separator = re.compile(r'\s*[,/]\s*') + def make_absolute_uri(local_name): definition = CODEMETA_CONTEXT['@context'][local_name] @@ -76,7 +79,7 @@ for (col, value) in zip(header, line): # For each cell in the row if col in data_sources: # If that's not the parentType/property/type/description - for local_name in value.split('/'): + for local_name in _codemeta_field_separator.split(value): # For each of the data source's properties that maps # to this canonical name if local_name.strip(): diff --git a/swh/indexer/data/codemeta/crosswalk.csv b/swh/indexer/data/codemeta/crosswalk.csv --- a/swh/indexer/data/codemeta/crosswalk.csv +++ b/swh/indexer/data/codemeta/crosswalk.csv @@ -18,7 +18,7 @@ schema:SoftwareApplication,softwareVersion,Text,Version of the software instance.,,,,,,,,,,,,,,,,,,,,,release,software version, schema:SoftwareApplication,storageRequirements,Text or URL,Storage requirements (free space required).,,,,,,,,,,,,,,,,,,,,,,, schema:SoftwareApplication,supportingData,DataFeed,Supporting data for a SoftwareApplication.,,,,,,,,,,,,,,,,,,,,,,, -schema:CreativeWork,author,Organization or Person,The author of this content or rating. Please note that author is special in that HTML 5 provides a special mechanism for indicating authorship via the rel tag. That is equivalent to this and may be used interchangeably.,agents,creators,,creators,login,,,,,[aut] in Author,,author,Author,,,author,,,author,,developer,,authors +schema:CreativeWork,author,Organization or Person,The author of this content or rating. Please note that author is special in that HTML 5 provides a special mechanism for indicating authorship via the rel tag. 
def merge_values(v1, v2):
    """Merge two values that were translated to the same metadata key.

    ``None`` stands for "no value": the other argument is returned as is.
    If v1 and v2 are of the form `{"@list": l1}` and `{"@list": l2}`,
    returns `{"@list": l1 + l2}`.
    Otherwise, make them lists (if they are not already) and concatenate
    them.

    Args:
        v1: a scalar, a list, a `{"@list": [...]}` dict, or None
        v2: a scalar, a list, a `{"@list": [...]}` dict, or None

    Returns:
        The merged value. Concatenation builds a new list, so the
        arguments are not modified.

    Raises:
        ValueError: if exactly one of the values is an `@list` container,
            or if an `@list` container is malformed (extra keys, or a
            payload that is not a list).

    >>> merge_values('a', 'b')
    ['a', 'b']
    >>> merge_values(['a', 'b'], 'c')
    ['a', 'b', 'c']
    >>> merge_values({'@list': ['a', 'b']}, {'@list': ['c']})
    {'@list': ['a', 'b', 'c']}
    >>> merge_values('a', None)
    'a'
    """
    def has_list_key(value):
        return isinstance(value, dict) and '@list' in value

    def is_list_container(value):
        return (isinstance(value, dict)
                and set(value) == {'@list'}
                and isinstance(value['@list'], list))

    if v1 is None:
        return v2
    if v2 is None:
        return v1

    if has_list_key(v1) or has_list_key(v2):
        # An ordered '@list' can only be merged with another well-formed
        # '@list'. The check is explicit (not an assert, which disappears
        # under -O) and is applied identically to both operands.
        if not (is_list_container(v1) and is_list_container(v2)):
            raise ValueError('Cannot merge %r and %r' % (v1, v2))
        return {'@list': v1['@list'] + v2['@list']}

    if not isinstance(v1, list):
        v1 = [v1]
    if not isinstance(v2, list):
        v2 = [v2]
    return v1 + v2
@register_mapping
class GemspecMapping(DictMapping):
    """Mapping for Ruby ``.gemspec`` files, based on the 'Ruby Gem' column
    of the crosswalk table.

    The gemspec is not executed: only lines of the form
    ``<var>.<key> = <expr>`` that follow the ``Gem::Specification.new``
    line are considered, and only when ``<expr>`` is a string literal or
    a list of string literals (see :meth:`eval_ruby_expression`).
    """
    # Line that opens the specification block,
    # eg. `Gem::Specification.new do |s|`
    _re_spec_new = re.compile(r'.*Gem::Specification.new do \|.*\|.*')
    # Assignment of a specification attribute, eg. `s.name = 'example'`.
    # The group names are the ones read back in translate().
    _re_spec_entry = re.compile(r'\s*\w+\.(?P<key>\w+)\s*=\s*(?P<expr>.*)')
    # A `.freeze` call applied to a literal, ie. directly after a closing
    # quote or bracket. Anchoring on that character keeps occurrences of
    # '.freeze' inside the text of a string untouched.
    _re_freeze = re.compile(r'''(?<=["'\]])\.freeze\b''')

    mapping = CROSSWALK_TABLE['Ruby Gem']

    def detect_metadata_files(self, file_entries):
        """Return a list with the sha1 of the first entry whose name ends
        with ``.gemspec``, or an empty list if there is none."""
        for entry in file_entries:
            if entry['name'].endswith(b'.gemspec'):
                return [entry['sha1']]
        return []

    def translate(self, raw_content):
        """Translate the content of a gemspec file.

        Args:
            raw_content (bytes): content of the gemspec file

        Returns:
            the result of ``translate_dict`` on the attributes that could
            be read, or None if the content cannot be decoded or holds
            no ``Gem::Specification.new`` line.
        """
        try:
            raw_content = raw_content.decode()
        except UnicodeDecodeError:
            self.log.warning('Error unidecoding %r', raw_content)
            return

        # Skip lines before 'Gem::Specification.new'
        lines = itertools.dropwhile(
            lambda x: not self._re_spec_new.match(x),
            raw_content.split('\n'))

        try:
            next(lines)  # Consume 'Gem::Specification.new'
        except StopIteration:
            self.log.warning('Could not find Gem::Specification in %r',
                             raw_content)
            return

        content_dict = {}
        for line in lines:
            match = self._re_spec_entry.match(line)
            if match:
                value = self.eval_ruby_expression(match.group('expr'))
                # Expressions that are unsupported (None) or empty are
                # ignored.
                if value:
                    content_dict[match.group('key')] = value
        return self.translate_dict(content_dict)

    def eval_ruby_expression(self, expr):
        """Very simple evaluator of Ruby expressions.

        Only string literals and lists of them are supported, optionally
        followed by ``.freeze``. Returns None for anything else.

        >>> GemspecMapping().eval_ruby_expression('"Foo bar"')
        'Foo bar'
        >>> GemspecMapping().eval_ruby_expression("'Foo bar'")
        'Foo bar'
        >>> GemspecMapping().eval_ruby_expression("['Foo', 'bar']")
        ['Foo', 'bar']
        >>> GemspecMapping().eval_ruby_expression("'Foo bar'.freeze")
        'Foo bar'
        >>> GemspecMapping().eval_ruby_expression(
        ...     "['Foo'.freeze, 'bar'.freeze]")
        ['Foo', 'bar']
        >>> GemspecMapping().eval_ruby_expression("'lib.freezer'")
        'lib.freezer'
        """
        def evaluator(node):
            # String literals are ast.Constant nodes holding a str
            # (ast.Str is a deprecated alias of ast.Constant).
            if isinstance(node, ast.Constant) and isinstance(node.value, str):
                return node.value
            if isinstance(node, ast.List):
                res = []
                for element in node.elts:
                    val = evaluator(element)
                    if not val:
                        # One unsupported or empty element invalidates
                        # the whole list.
                        return None
                    res.append(val)
                return res
            return None

        expr = self._re_freeze.sub('', expr)
        try:
            # We're parsing Ruby expressions here, but Python's
            # ast.parse works for very simple Ruby expressions
            # (mainly strings delimited with " or ', and lists
            # of such strings). The expression is only parsed,
            # never executed.
            tree = ast.parse(expr, mode='eval')
        except (SyntaxError, ValueError):
            return None
        if isinstance(tree, ast.Expression):
            return evaluator(tree.body)
        return None

    def normalize_homepage(self, s):
        """Wrap a homepage string as ``{"@id": s}``; returns None if `s`
        is not a string."""
        if isinstance(s, str):
            return {"@id": s}

    def normalize_license(self, s):
        """Turn a license name into a one-element list of SPDX
        identifiers; returns None if `s` is not a string."""
        if isinstance(s, str):
            return [{"@id": "https://spdx.org/licenses/" + s}]

    def normalize_licenses(self, licenses):
        """Turn a list of license names into a list of SPDX identifiers,
        skipping items that are not strings; returns None if `licenses`
        is not a list."""
        if isinstance(licenses, list):
            return [{"@id": "https://spdx.org/licenses/" + license}
                    for license in licenses
                    if isinstance(license, str)]

    def normalize_author(self, author):
        """Wrap a single author name in an ``@list`` container; returns
        None if `author` is not a string."""
        if isinstance(author, str):
            return {"@list": [author]}

    def normalize_authors(self, authors):
        """Wrap a list of author names in an ``@list`` container,
        skipping items that are not strings; returns None if `authors`
        is not a list."""
        if isinstance(authors, list):
            return {"@list": [author for author in authors
                              if isinstance(author, str)]}
def test_merge_values(self):
    """merge_values concatenates plain values and '@list' containers,
    refuses to mix the two kinds, and treats None as 'no value'."""
    # (left operand, right operand, expected result)
    mergeable = [
        ('a', 'b', ['a', 'b']),
        (['a', 'b'], 'c', ['a', 'b', 'c']),
        ('a', ['b', 'c'], ['a', 'b', 'c']),
        ({'@list': ['a']}, {'@list': ['b']}, {'@list': ['a', 'b']}),
        ({'@list': ['a', 'b']}, {'@list': ['c']},
         {'@list': ['a', 'b', 'c']}),
    ]
    # An '@list' container cannot be merged with a plain value or list
    incompatible = [
        ({'@list': ['a']}, 'b'),
        ('a', {'@list': ['b']}),
        ({'@list': ['a']}, ['b']),
        (['a'], {'@list': ['b']}),
    ]
    # When one operand is None, the other one comes back unchanged
    with_none = [
        ('a', None, 'a'),
        (['a', 'b'], None, ['a', 'b']),
        (None, ['b', 'c'], ['b', 'c']),
        ({'@list': ['a']}, None, {'@list': ['a']}),
        (None, {'@list': ['a']}, {'@list': ['a']}),
    ]

    for (left, right, expected) in mergeable:
        self.assertEqual(merge_values(left, right), expected)

    for (left, right) in incompatible:
        with self.assertRaises(ValueError):
            merge_values(left, right)

    for (left, right, expected) in with_none:
        self.assertEqual(merge_values(left, right), expected)
+ s.authors = ["Ruby Coder"] + s.email = 'rubycoder@example.com' + s.files = ["lib/example.rb"] + s.homepage = 'https://rubygems.org/gems/example' + s.metadata = { "source_code_uri" => "https://github.com/example/example" } +end""" + result = MAPPINGS['GemspecMapping'].translate(raw_content) + self.assertCountEqual(result.pop('description'), [ + "This is an example!", + "Much longer explanation of the example!" + ]) + self.assertEqual(result, { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'type': 'SoftwareSourceCode', + 'author': ['Ruby Coder'], + 'name': 'example', + 'license': 'https://spdx.org/licenses/MIT', + 'codeRepository': 'https://rubygems.org/gems/example', + 'email': 'rubycoder@example.com', + 'version': '0.1.0', + }) + + def test_gemspec_two_author_fields(self): + raw_content = b""" +Gem::Specification.new do |s| + s.authors = ["Ruby Coder1"] + s.author = "Ruby Coder2" +end""" + result = MAPPINGS['GemspecMapping'].translate(raw_content) + self.assertCountEqual(result.pop('author'), [ + 'Ruby Coder1', 'Ruby Coder2']) + self.assertEqual(result, { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'type': 'SoftwareSourceCode', + }) + + def test_gemspec_invalid_author(self): + raw_content = b""" +Gem::Specification.new do |s| + s.author = ["Ruby Coder"] +end""" + result = MAPPINGS['GemspecMapping'].translate(raw_content) + self.assertEqual(result, { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'type': 'SoftwareSourceCode', + }) + raw_content = b""" +Gem::Specification.new do |s| + s.author = "Ruby Coder1", +end""" + result = MAPPINGS['GemspecMapping'].translate(raw_content) + self.assertEqual(result, { + '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', + 'type': 'SoftwareSourceCode', + }) + raw_content = b""" +Gem::Specification.new do |s| + s.authors = ["Ruby Coder1", ["Ruby Coder2"]] +end""" + result = MAPPINGS['GemspecMapping'].translate(raw_content) + self.assertEqual(result, { + '@context': 
'https://doi.org/10.5063/schema/codemeta-2.0', + 'type': 'SoftwareSourceCode', + 'author': ['Ruby Coder1'], + }) + def test_revision_metadata_indexer(self): metadata_indexer = RevisionMetadataTestIndexer() fill_obj_storage(metadata_indexer.objstorage)