Page MenuHomeSoftware Heritage

D957.id3050.diff
No OneTemporary

D957.id3050.diff

diff --git a/swh/indexer/codemeta.py b/swh/indexer/codemeta.py
--- a/swh/indexer/codemeta.py
+++ b/swh/indexer/codemeta.py
@@ -6,6 +6,7 @@
import csv
import json
import os.path
+import re
import swh.indexer
from pyld import jsonld
@@ -34,6 +35,8 @@
SCHEMA_URI + 'creator',
}
+_codemeta_field_separator = re.compile(r'\s*[,/]\s*')
+
def make_absolute_uri(local_name):
definition = CODEMETA_CONTEXT['@context'][local_name]
@@ -76,7 +79,7 @@
for (col, value) in zip(header, line): # For each cell in the row
if col in data_sources:
# If that's not the parentType/property/type/description
- for local_name in value.split('/'):
+ for local_name in _codemeta_field_separator.split(value):
# For each of the data source's properties that maps
# to this canonical name
if local_name.strip():
diff --git a/swh/indexer/data/codemeta/crosswalk.csv b/swh/indexer/data/codemeta/crosswalk.csv
--- a/swh/indexer/data/codemeta/crosswalk.csv
+++ b/swh/indexer/data/codemeta/crosswalk.csv
@@ -18,7 +18,7 @@
schema:SoftwareApplication,softwareVersion,Text,Version of the software instance.,,,,,,,,,,,,,,,,,,,,,release,software version,
schema:SoftwareApplication,storageRequirements,Text or URL,Storage requirements (free space required).,,,,,,,,,,,,,,,,,,,,,,,
schema:SoftwareApplication,supportingData,DataFeed,Supporting data for a SoftwareApplication.,,,,,,,,,,,,,,,,,,,,,,,
-schema:CreativeWork,author,Organization or Person,The author of this content or rating. Please note that author is special in that HTML 5 provides a special mechanism for indicating authorship via the rel tag. That is equivalent to this and may be used interchangeably.,agents,creators,,creators,login,,,,,[aut] in Author,,author,Author,,,author,,,author,,developer,,authors
+schema:CreativeWork,author,Organization or Person,The author of this content or rating. Please note that author is special in that HTML 5 provides a special mechanism for indicating authorship via the rel tag. That is equivalent to this and may be used interchangeably.,agents,creators,,creators,login,,,,,[aut] in Author,,author,Author,,,author,,,author/authors,,developer,,authors
schema:CreativeWork,citation,CreativeWork or URL,"A citation or reference to another creative work, such as another publication, web page, scholarly article, etc.",relatedLink,,,,,,,,,,,,,,,,,,,,,,
schema:CreativeWork,contributor,Organization or Person,A secondary contributor to the CreativeWork or Event.,,,,,,,,,,[ctb] in Author,,,,,,contributor,,,,,developer,,
schema:CreativeWork,copyrightHolder,Organization or Person,The party holding the legal copyright to the CreativeWork.,agents [role=copyrightHolder],,,,,,,,,,,,,,,,,,,,,,
diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py
--- a/swh/indexer/metadata_dictionary.py
+++ b/swh/indexer/metadata_dictionary.py
@@ -6,6 +6,7 @@
import os
import re
import abc
+import ast
import json
import logging
import email.parser
@@ -24,6 +25,36 @@
return cls
def merge_values(v1, v2):
    """Merge two translated metadata values into a single value.

    If v1 and v2 are of the form `{"@list": l1}` and `{"@list": l2}`,
    returns `{"@list": l1 + l2}`.
    Otherwise, make them lists (if they are not already) and concatenate
    them.

    Neither argument is modified; a new list (or dict) is returned.

    Raises:
        ValueError: if only one of the two values is an `@list` object,
            or if the value stored under `@list` is not a list.

    >>> merge_values('a', 'b')
    ['a', 'b']
    >>> merge_values(['a', 'b'], 'c')
    ['a', 'b', 'c']
    >>> merge_values({'@list': ['a', 'b']}, {'@list': ['c']})
    {'@list': ['a', 'b', 'c']}
    """
    def is_list_object(value):
        # A dict whose one and only key is '@list'.
        return isinstance(value, dict) and set(value) == {'@list'}

    if is_list_object(v1):
        if not is_list_object(v2):
            raise ValueError('Cannot merge %r and %r' % (v1, v2))
        # Explicit checks instead of `assert`: assertions disappear under
        # `python -O`, and `+` would then happily concatenate e.g. two
        # strings into a bogus '@list' value.
        for value in (v1, v2):
            if not isinstance(value['@list'], list):
                raise ValueError(
                    'Cannot merge %r and %r: %r is not a list'
                    % (v1, v2, value['@list']))
        return {'@list': v1['@list'] + v2['@list']}

    if isinstance(v2, dict) and '@list' in v2:
        raise ValueError('Cannot merge %r and %r' % (v1, v2))
    left = v1 if isinstance(v1, list) else [v1]
    right = v2 if isinstance(v2, list) else [v2]
    return left + right
+
+
class BaseMapping(metaclass=abc.ABCMeta):
"""Base class for mappings to inherit from
@@ -108,6 +139,7 @@
elif k in self.mapping:
# if there is no method, but the key is known from the
# crosswalk table
+ codemeta_key = self.mapping[k]
# if there is a normalization method, use it on the value
normalization_method = getattr(
@@ -116,7 +148,11 @@
v = normalization_method(v)
# set the translation metadata with the normalized value
- translated_metadata[self.mapping[k]] = v
+ if codemeta_key in translated_metadata:
+ translated_metadata[codemeta_key] = merge_values(
+ translated_metadata[codemeta_key], v)
+ else:
+ translated_metadata[codemeta_key] = v
if normalize:
return self.normalize_translation(translated_metadata)
else:
@@ -390,14 +426,6 @@
}
return self.normalize_translation(metadata)
- def translate_summary(self, translated_metadata, v):
- k = self.mapping['summary']
- translated_metadata.setdefault(k, []).append(v)
-
- def translate_description(self, translated_metadata, v):
- k = self.mapping['description']
- translated_metadata.setdefault(k, []).append(v)
-
def normalize_home_page(self, urls):
return [{'@id': url} for url in urls]
@@ -405,6 +433,75 @@
return [{'@id': license} for license in licenses]
@register_mapping
class GemspecMapping(DictMapping):
    """Mapping for Ruby ``.gemspec`` files.

    The file is not evaluated as Ruby. Only lines of the form
    ``<receiver>.<key> = <value>`` that follow the
    ``Gem::Specification.new do |...|`` line are looked at, and an entry
    is kept only when ``<value>`` can be parsed as a Python literal.
    """
    _re_spec_new = re.compile(r'.*Gem::Specification.new do \|.*\|.*')
    _re_spec_entry = re.compile(r'\s*\w+\.(?P<key>\w+)\s*=\s*(?P<value>.*)')
    # A `.freeze` call placed right after a closing quote or bracket.
    # Anchoring on that character avoids stripping '.freeze' from the
    # middle of a string literal's text.
    _re_freeze = re.compile(r'''(?<=["'\]])\.freeze\b''')

    mapping = CROSSWALK_TABLE['Ruby Gem']

    def detect_metadata_files(self, file_entries):
        """Return the sha1 of the first entry whose name ends with
        ``.gemspec``, in a one-element list; an empty list if none does."""
        for entry in file_entries:
            if entry['name'].endswith(b'.gemspec'):
                return [entry['sha1']]
        return []

    def translate(self, raw_content):
        """Parse the bytes of a gemspec and translate the entries found.

        Returns the result of ``translate_dict`` on the parsed entries, or
        None (after logging a warning) when the content cannot be decoded
        or contains no ``Gem::Specification.new`` line.
        """
        try:
            raw_content = raw_content.decode()
        except UnicodeDecodeError:
            self.log.warning('Error unidecoding %r', raw_content)
            return

        # Skip lines before 'Gem::Specification.new'
        lines = iter(raw_content.split('\n'))
        for line in lines:
            if self._re_spec_new.match(line):
                break
        else:
            self.log.warning('Could not find Gem::Specification in %r',
                             raw_content)
            return

        # The iterator is shared with the loop above, so this resumes on
        # the line following 'Gem::Specification.new'.
        content_dict = {}
        for line in lines:
            match = self._re_spec_entry.match(line)
            if not match:
                continue
            raw_value = self._re_freeze.sub('', match.group('value'))
            try:
                # We're parsing Ruby expressions here, but Python's
                # ast.literal_eval is rather good at parsing simple
                # Ruby expressions (mainly strings delimited with " or ',
                # and lists of such strings).
                value = ast.literal_eval(raw_value)
            except (SyntaxError, ValueError, TypeError, RecursionError):
                # Obviously, ast.literal_eval won't work on any Ruby code.
                # Besides SyntaxError/ValueError it is documented to raise
                # TypeError and RecursionError on some malformed input;
                # such lines are skipped too rather than aborting.
                continue
            content_dict[match.group('key')] = value
        return self.translate_dict(content_dict)

    def normalize_homepage(self, s):
        """Wrap the homepage value in an ``@id`` object."""
        return {"@id": s}

    def normalize_license(self, s):
        """Turn a license name into a one-element list of SPDX ``@id``
        objects; returns None when the value is not a string."""
        if isinstance(s, str):
            return [{"@id": "https://spdx.org/licenses/" + s}]
        return None

    def normalize_licenses(self, licenses):
        """Turn a list of license names into a list of SPDX ``@id``
        objects, ignoring non-string items; returns None when the value
        is not a list."""
        if isinstance(licenses, list):
            return [{"@id": "https://spdx.org/licenses/" + license}
                    for license in licenses
                    if isinstance(license, str)]
        return None

    def normalize_author(self, author):
        """Wrap a single author in an ``@list`` object."""
        return {"@list": [author]}

    def normalize_authors(self, authors):
        """Wrap the authors in an ``@list`` object.

        ``authors`` comes straight out of ``ast.literal_eval``, so it may
        be a bare string (``s.authors = "A"``) or a tuple
        (``s.authors = "A", "B"``). Both are converted to a list so that
        the ``@list`` value is a list whenever possible.
        """
        if isinstance(authors, str):
            authors = [authors]
        elif isinstance(authors, tuple):
            authors = list(authors)
        return {"@list": authors}
+
+
def main():
raw_content = """{"name": "test_name", "unknown_term": "ut"}"""
raw_content1 = b"""{"name": "test_name",
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -7,7 +7,8 @@
from swh.model.hashutil import hash_to_bytes
-from swh.indexer.metadata_dictionary import CROSSWALK_TABLE, MAPPINGS
+from swh.indexer.metadata_dictionary import (
+ CROSSWALK_TABLE, MAPPINGS, merge_values)
from swh.indexer.metadata_detector import (
detect_metadata, extract_minimal_metadata_dict
)
@@ -82,6 +83,31 @@
'homepage': 'http://schema.org/url'
})
+ def test_merge_values(self):
+ self.assertEqual(
+ merge_values('a', 'b'),
+ ['a', 'b'])
+ self.assertEqual(
+ merge_values(['a', 'b'], 'c'),
+ ['a', 'b', 'c'])
+ self.assertEqual(
+ merge_values('a', ['b', 'c']),
+ ['a', 'b', 'c'])
+ self.assertEqual(
+ merge_values({'@list': ['a']}, {'@list': ['b']}),
+ {'@list': ['a', 'b']})
+ self.assertEqual(
+ merge_values({'@list': ['a', 'b']}, {'@list': ['c']}),
+ {'@list': ['a', 'b', 'c']})
+ with self.assertRaises(ValueError):
+ merge_values({'@list': ['a']}, 'b')
+ with self.assertRaises(ValueError):
+ merge_values('a', {'@list': ['b']})
+ with self.assertRaises(ValueError):
+ merge_values({'@list': ['a']}, ['b'])
+ with self.assertRaises(ValueError):
+ merge_values(['a'], {'@list': ['b']})
+
def test_compute_metadata_none(self):
"""
testing content empty content is empty
@@ -696,6 +722,50 @@
'license': 'MIT',
})
+ def test_gemspec_base(self):
+ raw_content = b"""
+Gem::Specification.new do |s|
+ s.name = 'example'
+ s.version = '0.1.0'
+ s.licenses = ['MIT']
+ s.summary = "This is an example!"
+ s.description = "Much longer explanation of the example!"
+ s.authors = ["Ruby Coder"]
+ s.email = 'rubycoder@example.com'
+ s.files = ["lib/example.rb"]
+ s.homepage = 'https://rubygems.org/gems/example'
+ s.metadata = { "source_code_uri" => "https://github.com/example/example" }
+end"""
+ result = MAPPINGS['GemspecMapping'].translate(raw_content)
+ self.assertCountEqual(result.pop('description'), [
+ "This is an example!",
+ "Much longer explanation of the example!"
+ ])
+ self.assertEqual(result, {
+ '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+ 'type': 'SoftwareSourceCode',
+ 'author': ['Ruby Coder'],
+ 'name': 'example',
+ 'license': 'https://spdx.org/licenses/MIT',
+ 'codeRepository': 'https://rubygems.org/gems/example',
+ 'email': 'rubycoder@example.com',
+ 'version': '0.1.0',
+ })
+
+ def test_gemspec_two_author_fields(self):
+ raw_content = b"""
+Gem::Specification.new do |s|
+ s.authors = ["Ruby Coder1"]
+ s.author = "Ruby Coder2"
+end"""
+ result = MAPPINGS['GemspecMapping'].translate(raw_content)
+ self.assertCountEqual(result.pop('author'), [
+ 'Ruby Coder1', 'Ruby Coder2'])
+ self.assertEqual(result, {
+ '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+ 'type': 'SoftwareSourceCode',
+ })
+
def test_revision_metadata_indexer(self):
metadata_indexer = RevisionMetadataTestIndexer()
fill_obj_storage(metadata_indexer.objstorage)

File Metadata

Mime Type
text/plain
Expires
Mar 17 2025, 6:21 PM (7 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3225737

Event Timeline