Summary of this patch (free text before the first "diff --git" header):
  * codemeta.py: crosswalk cells are split on ',' as well as '/'.
  * crosswalk.csv: the 'Ruby Gem' column maps both 'author' and 'authors'.
  * metadata_dictionary.py: add merge_values(); translate_dict() merges
    values when several source keys map to the same CodeMeta key; add
    GemspecMapping, which reads simple `x.key = value` gemspec entries.
  * tests/test_metadata.py: tests for merge_values() and GemspecMapping.

diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py
--- a/swh/indexer/codemeta.py
+++ b/swh/indexer/codemeta.py
@@ -6,6 +6,7 @@
 import csv
 import json
 import os.path
+import re
 
 import swh.indexer
 from pyld import jsonld
@@ -34,6 +35,8 @@
     SCHEMA_URI + 'creator',
     }
 
+_codemeta_field_separator = re.compile(r'\s*[,/]\s*')
+
 
 def make_absolute_uri(local_name):
     definition = CODEMETA_CONTEXT['@context'][local_name]
@@ -76,7 +79,7 @@
         for (col, value) in zip(header, line):  # For each cell in the row
             if col in data_sources:
                 # If that's not the parentType/property/type/description
-                for local_name in value.split('/'):
+                for local_name in _codemeta_field_separator.split(value):
                     # For each of the data source's properties that maps
                     # to this canonical name
                     if local_name.strip():
diff --git a/swh/indexer/data/codemeta/crosswalk.csv b/swh/indexer/data/codemeta/crosswalk.csv
--- a/swh/indexer/data/codemeta/crosswalk.csv
+++ b/swh/indexer/data/codemeta/crosswalk.csv
@@ -18,7 +18,7 @@
 schema:SoftwareApplication,softwareVersion,Text,Version of the software instance.,,,,,,,,,,,,,,,,,,,,,release,software version,
 schema:SoftwareApplication,storageRequirements,Text or URL,Storage requirements (free space required).,,,,,,,,,,,,,,,,,,,,,,,
 schema:SoftwareApplication,supportingData,DataFeed,Supporting data for a SoftwareApplication.,,,,,,,,,,,,,,,,,,,,,,,
-schema:CreativeWork,author,Organization or Person,The author of this content or rating. Please note that author is special in that HTML 5 provides a special mechanism for indicating authorship via the rel tag. That is equivalent to this and may be used interchangeably.,agents,creators,,creators,login,,,,,[aut] in Author,,author,Author,,,author,,,author,,developer,,authors
+schema:CreativeWork,author,Organization or Person,The author of this content or rating. Please note that author is special in that HTML 5 provides a special mechanism for indicating authorship via the rel tag. That is equivalent to this and may be used interchangeably.,agents,creators,,creators,login,,,,,[aut] in Author,,author,Author,,,author,,,author/authors,,developer,,authors
 schema:CreativeWork,citation,CreativeWork or URL,"A citation or reference to another creative work, such as another publication, web page, scholarly article, etc.",relatedLink,,,,,,,,,,,,,,,,,,,,,,
 schema:CreativeWork,contributor,Organization or Person,A secondary contributor to the CreativeWork or Event.,,,,,,,,,,[ctb] in Author,,,,,,contributor,,,,,developer,,
 schema:CreativeWork,copyrightHolder,Organization or Person,The party holding the legal copyright to the CreativeWork.,agents [role=copyrightHolder],,,,,,,,,,,,,,,,,,,,,,
diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py
--- a/swh/indexer/metadata_dictionary.py
+++ b/swh/indexer/metadata_dictionary.py
@@ -6,6 +6,7 @@
 import os
 import re
 import abc
+import ast
 import json
 import logging
 import email.parser
@@ -24,6 +25,36 @@
     return cls
 
 
+def merge_values(v1, v2):
+    """If v1 and v2 are of the form `{"@list": l1}` and `{"@list": l2}`,
+    returns `{"@list": l1 + l2}`.
+    Otherwise, make them lists (if they are not already) and concatenate
+    them.
+
+    >>> merge_values('a', 'b')
+    ['a', 'b']
+    >>> merge_values(['a', 'b'], 'c')
+    ['a', 'b', 'c']
+    >>> merge_values({'@list': ['a', 'b']}, {'@list': ['c']})
+    {'@list': ['a', 'b', 'c']}
+    """
+    if isinstance(v1, dict) and set(v1) == {'@list'}:
+        assert isinstance(v1['@list'], list)
+        if isinstance(v2, dict) and set(v2) == {'@list'}:
+            assert isinstance(v2['@list'], list)
+            return {'@list': v1['@list'] + v2['@list']}
+        else:
+            raise ValueError('Cannot merge %r and %r' % (v1, v2))
+    else:
+        if isinstance(v2, dict) and '@list' in v2:
+            raise ValueError('Cannot merge %r and %r' % (v1, v2))
+        if not isinstance(v1, list):
+            v1 = [v1]
+        if not isinstance(v2, list):
+            v2 = [v2]
+        return v1 + v2
+
+
 class BaseMapping(metaclass=abc.ABCMeta):
     """Base class for mappings to inherit from
 
@@ -108,6 +139,7 @@
             elif k in self.mapping:
                 # if there is no method, but the key is known from the
                 # crosswalk table
+                codemeta_key = self.mapping[k]
 
                 # if there is a normalization method, use it on the value
                 normalization_method = getattr(
@@ -116,7 +148,11 @@
                     v = normalization_method(v)
 
                 # set the translation metadata with the normalized value
-                translated_metadata[self.mapping[k]] = v
+                if codemeta_key in translated_metadata:
+                    translated_metadata[codemeta_key] = merge_values(
+                        translated_metadata[codemeta_key], v)
+                else:
+                    translated_metadata[codemeta_key] = v
         if normalize:
             return self.normalize_translation(translated_metadata)
         else:
@@ -390,14 +426,6 @@
         }
         return self.normalize_translation(metadata)
 
-    def translate_summary(self, translated_metadata, v):
-        k = self.mapping['summary']
-        translated_metadata.setdefault(k, []).append(v)
-
-    def translate_description(self, translated_metadata, v):
-        k = self.mapping['description']
-        translated_metadata.setdefault(k, []).append(v)
-
     def normalize_home_page(self, urls):
         return [{'@id': url} for url in urls]
 
@@ -405,6 +433,75 @@
         return [{'@id': license} for license in licenses]
 
 
+@register_mapping
+class GemspecMapping(DictMapping):
+    _re_spec_new = re.compile(r'.*Gem::Specification.new do \|.*\|.*')
+    _re_spec_entry = re.compile(r'\s*\w+\.(?P<key>\w+)\s*=\s*(?P<value>.*)')
+
+    mapping = CROSSWALK_TABLE['Ruby Gem']
+
+    def detect_metadata_files(self, file_entries):
+        for entry in file_entries:
+            if entry['name'].endswith(b'.gemspec'):
+                return [entry['sha1']]
+        return []
+
+    def translate(self, raw_content):
+        try:
+            raw_content = raw_content.decode()
+        except UnicodeDecodeError:
+            self.log.warning('Error unidecoding %r', raw_content)
+            return
+
+        # Skip lines before 'Gem::Specification.new'
+        lines = iter(raw_content.split('\n'))
+        for line in lines:
+            match = self._re_spec_new.match(line)
+            if match:
+                break
+        else:
+            self.log.warning('Could not find Gem::Specification in %r',
+                             raw_content)
+            return
+
+        content_dict = {}
+        for line in lines:
+            match = self._re_spec_entry.match(line)
+            if match:
+                raw_value = match.group('value')
+                raw_value = raw_value.replace('.freeze', '')
+                try:
+                    # We're parsing Ruby expressions here, but Python's
+                    # ast.literal_eval is rather good at parsing simple
+                    # Ruby expressions (mainly strings delimited with " or ',
+                    # and lists of such strings).
+                    value = ast.literal_eval(raw_value)
+                except (SyntaxError, ValueError):
+                    # Obviously, ast.literal_eval won't work on any Ruby code
+                    continue
+                content_dict[match.group('key')] = value
+        return self.translate_dict(content_dict)
+
+    def normalize_homepage(self, s):
+        return {"@id": s}
+
+    def normalize_license(self, s):
+        if isinstance(s, str):
+            return [{"@id": "https://spdx.org/licenses/" + s}]
+
+    def normalize_licenses(self, licenses):
+        if isinstance(licenses, list):
+            return [{"@id": "https://spdx.org/licenses/" + license}
+                    for license in licenses
+                    if isinstance(license, str)]
+
+    def normalize_author(self, author):
+        return {"@list": [author]}
+
+    def normalize_authors(self, authors):
+        return {"@list": authors}
+
+
 def main():
     raw_content = """{"name": "test_name", "unknown_term": "ut"}"""
     raw_content1 = b"""{"name": "test_name",
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -7,7 +7,8 @@
 
 from swh.model.hashutil import hash_to_bytes
 
-from swh.indexer.metadata_dictionary import CROSSWALK_TABLE, MAPPINGS
+from swh.indexer.metadata_dictionary import (
+    CROSSWALK_TABLE, MAPPINGS, merge_values)
 from swh.indexer.metadata_detector import (
     detect_metadata, extract_minimal_metadata_dict
 )
@@ -82,6 +83,31 @@
             'homepage': 'http://schema.org/url'
         })
 
+    def test_merge_values(self):
+        self.assertEqual(
+            merge_values('a', 'b'),
+            ['a', 'b'])
+        self.assertEqual(
+            merge_values(['a', 'b'], 'c'),
+            ['a', 'b', 'c'])
+        self.assertEqual(
+            merge_values('a', ['b', 'c']),
+            ['a', 'b', 'c'])
+        self.assertEqual(
+            merge_values({'@list': ['a']}, {'@list': ['b']}),
+            {'@list': ['a', 'b']})
+        self.assertEqual(
+            merge_values({'@list': ['a', 'b']}, {'@list': ['c']}),
+            {'@list': ['a', 'b', 'c']})
+        with self.assertRaises(ValueError):
+            merge_values({'@list': ['a']}, 'b')
+        with self.assertRaises(ValueError):
+            merge_values('a', {'@list': ['b']})
+        with self.assertRaises(ValueError):
+            merge_values({'@list': ['a']}, ['b'])
+        with self.assertRaises(ValueError):
+            merge_values(['a'], {'@list': ['b']})
+
     def test_compute_metadata_none(self):
         """
         testing content empty content is empty
@@ -696,6 +722,50 @@
             'license': 'MIT',
         })
 
+    def test_gemspec_base(self):
+        raw_content = b"""
+Gem::Specification.new do |s|
+  s.name = 'example'
+  s.version = '0.1.0'
+  s.licenses = ['MIT']
+  s.summary = "This is an example!"
+  s.description = "Much longer explanation of the example!"
+  s.authors = ["Ruby Coder"]
+  s.email = 'rubycoder@example.com'
+  s.files = ["lib/example.rb"]
+  s.homepage = 'https://rubygems.org/gems/example'
+  s.metadata = { "source_code_uri" => "https://github.com/example/example" }
+end"""
+        result = MAPPINGS['GemspecMapping'].translate(raw_content)
+        self.assertCountEqual(result.pop('description'), [
+            "This is an example!",
+            "Much longer explanation of the example!"
+        ])
+        self.assertEqual(result, {
+            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+            'type': 'SoftwareSourceCode',
+            'author': ['Ruby Coder'],
+            'name': 'example',
+            'license': 'https://spdx.org/licenses/MIT',
+            'codeRepository': 'https://rubygems.org/gems/example',
+            'email': 'rubycoder@example.com',
+            'version': '0.1.0',
+        })
+
+    def test_gemspec_two_author_fields(self):
+        raw_content = b"""
+Gem::Specification.new do |s|
+  s.authors = ["Ruby Coder1"]
+  s.author = "Ruby Coder2"
+end"""
+        result = MAPPINGS['GemspecMapping'].translate(raw_content)
+        self.assertCountEqual(result.pop('author'), [
+            'Ruby Coder1', 'Ruby Coder2'])
+        self.assertEqual(result, {
+            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+            'type': 'SoftwareSourceCode',
+        })
+
     def test_revision_metadata_indexer(self):
         metadata_indexer = RevisionMetadataTestIndexer()
         fill_obj_storage(metadata_indexer.objstorage)