diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -43,7 +43,7 @@
} for sha1 in ids
))
- def index(self, id, data):
+ def index(self, id, data, log_suffix='unknown revision'):
"""Index sha1s' content and store result.
Args:
@@ -63,8 +63,9 @@
}
try:
mapping_name = self.tool['tool_configuration']['context']
- result['translated_metadata'] = MAPPINGS[mapping_name] \
- .translate(data)
+ log_suffix += ', content_id=%s' % hashutil.hash_to_hex(id)
+ result['translated_metadata'] = \
+ MAPPINGS[mapping_name](log_suffix).translate(data)
except Exception:
self.log.exception(
"Problem during metadata translation "
@@ -111,7 +112,7 @@
'version': '0.0.2',
'configuration': {
'type': 'local',
- 'context': ['NpmMapping', 'CodemetaMapping']
+ 'context': list(MAPPINGS),
},
}),
}
@@ -158,7 +159,9 @@
files = [entry for entry in dir_ls if entry['type'] == 'file']
detected_files = detect_metadata(files)
result['translated_metadata'] = self.translate_revision_metadata(
- detected_files)
+ detected_files,
+ log_suffix='revision=%s' % hashutil.hash_to_hex(rev['id'])
+ )
except Exception as e:
self.log.exception(
'Problem when indexing rev: %r', e)
@@ -181,7 +184,7 @@
self.idx_storage.revision_metadata_add(
results, conflict_update=(policy_update == 'update-dups'))
- def translate_revision_metadata(self, detected_files):
+ def translate_revision_metadata(self, detected_files, log_suffix):
"""
Determine plan of action to translate metadata when containing
one or multiple detected files:
@@ -236,7 +239,8 @@
# content indexing
try:
c_metadata_indexer.run(sha1s_filtered,
- policy_update='ignore-dups')
+ policy_update='ignore-dups',
+ log_suffix=log_suffix)
# on the fly possibility:
for result in c_metadata_indexer.results:
local_metadata = result['translated_metadata']
diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py
--- a/swh/indexer/metadata_dictionary.py
+++ b/swh/indexer/metadata_dictionary.py
@@ -23,7 +23,7 @@
def register_mapping(cls):
- MAPPINGS[cls.__name__] = cls()
+ MAPPINGS[cls.__name__] = cls
return cls
@@ -69,13 +69,15 @@
- inherit this class
- override translate function
"""
- def __init__(self):
+ def __init__(self, log_suffix=''):
+ self.log_suffix = log_suffix
self.log = logging.getLogger('%s.%s' % (
self.__class__.__module__,
self.__class__.__name__))
+ @classmethod
@abc.abstractmethod
- def detect_metadata_files(self, files):
+ def detect_metadata_files(cls, files):
"""
Detects files potentially containing metadata
@@ -104,9 +106,10 @@
"""The .json file to extract metadata from."""
pass
- def detect_metadata_files(self, file_entries):
+ @classmethod
+ def detect_metadata_files(cls, file_entries):
for entry in file_entries:
- if entry['name'] == self.filename:
+ if entry['name'] == cls.filename:
return [entry['sha1']]
return []
@@ -184,12 +187,12 @@
try:
raw_content = raw_content.decode()
except UnicodeDecodeError:
- self.log.warning('Error unidecoding %r', raw_content)
+ self.log.warning('Error unidecoding from %s', self.log_suffix)
return
try:
content_dict = json.loads(raw_content)
except json.JSONDecodeError:
- self.log.warning('Error unjsoning %r' % raw_content)
+ self.log.warning('Error unjsoning from %s', self.log_suffix)
return
return self.translate_dict(content_dict)
@@ -355,7 +358,7 @@
try:
d = xmltodict.parse(content).get('project') or {}
except xml.parsers.expat.ExpatError:
- self.log.warning('Error parsing XML of %r', content)
+ self.log.warning('Error parsing XML from %s', self.log_suffix)
return None
metadata = self.translate_dict(d, normalize=False)
metadata[SCHEMA_URI+'codeRepository'] = self.parse_repositories(d)
@@ -512,7 +515,8 @@
mapping = CROSSWALK_TABLE['Ruby Gem']
- def detect_metadata_files(self, file_entries):
+ @classmethod
+ def detect_metadata_files(cls, file_entries):
for entry in file_entries:
if entry['name'].endswith(b'.gemspec'):
return [entry['sha1']]
@@ -522,7 +526,7 @@
try:
raw_content = raw_content.decode()
except UnicodeDecodeError:
- self.log.warning('Error unidecoding %r', raw_content)
+ self.log.warning('Error unidecoding from %s', self.log_suffix)
return
# Skip lines before 'Gem::Specification.new'
@@ -533,8 +537,8 @@
try:
next(lines) # Consume 'Gem::Specification.new'
except StopIteration:
- self.log.warning('Could not find Gem::Specification in %r',
- raw_content)
+ self.log.warning('Could not find Gem::Specification in %s',
+ self.log_suffix)
return
content_dict = {}
@@ -607,19 +611,3 @@
if isinstance(authors, list):
return {"@list": [author for author in authors
if isinstance(author, str)]}
-
-
-def main():
- raw_content = """{"name": "test_name", "unknown_term": "ut"}"""
- raw_content1 = b"""{"name": "test_name",
- "unknown_term": "ut",
- "prerequisites" :"packageXYZ"}"""
- result = MAPPINGS["NpmMapping"].translate(raw_content)
- result1 = MAPPINGS["MavenMapping"].translate(raw_content1)
-
- print(result)
- print(result1)
-
-
-if __name__ == "__main__":
- main()
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -62,6 +62,11 @@
shows the entire diff in the results
"""
self.maxDiff = None
+ self.npm_mapping = MAPPINGS['NpmMapping']()
+ self.codemeta_mapping = MAPPINGS['CodemetaMapping']()
+ self.maven_mapping = MAPPINGS['MavenMapping']()
+ self.pkginfo_mapping = MAPPINGS['PythonPkginfoMapping']()
+ self.gemspec_mapping = MAPPINGS['GemspecMapping']()
def test_crosstable(self):
self.assertEqual(CROSSWALK_TABLE['NodeJS'], {
@@ -137,7 +142,7 @@
# None if no metadata was found or an error occurred
declared_metadata = None
# when
- result = MAPPINGS["NpmMapping"].translate(content)
+ result = self.npm_mapping.translate(content)
# then
self.assertEqual(declared_metadata, result)
@@ -177,7 +182,7 @@
}
# when
- result = MAPPINGS["NpmMapping"].translate(content)
+ result = self.npm_mapping.translate(content)
# then
self.assertEqual(declared_metadata, result)
@@ -302,7 +307,7 @@
"email": "foo@example.com"
}
}"""
- result = MAPPINGS["NpmMapping"].translate(package_json)
+ result = self.npm_mapping.translate(package_json)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'name': 'foo',
@@ -317,7 +322,7 @@
"email": "foo@example.com"
}
}"""
- result = MAPPINGS["NpmMapping"].translate(package_json)
+ result = self.npm_mapping.translate(package_json)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'name': 'foo',
@@ -329,7 +334,7 @@
"name": "foo",
"bugs": "https://github.com/owner/project/issues"
}"""
- result = MAPPINGS["NpmMapping"].translate(package_json)
+ result = self.npm_mapping.translate(package_json)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'name': 'foo',
@@ -346,7 +351,7 @@
"url" : "https://github.com/npm/cli.git"
}
}"""
- result = MAPPINGS["NpmMapping"].translate(package_json)
+ result = self.npm_mapping.translate(package_json)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'name': 'foo',
@@ -361,7 +366,7 @@
"type" : "git"
}
}"""
- result = MAPPINGS["NpmMapping"].translate(package_json)
+ result = self.npm_mapping.translate(package_json)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'name': 'foo',
@@ -373,7 +378,7 @@
"name": "foo",
"repository": "github:npm/cli"
}"""
- result = MAPPINGS["NpmMapping"].translate(package_json)
+ result = self.npm_mapping.translate(package_json)
expected_result = {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'name': 'foo',
@@ -387,7 +392,7 @@
"name": "foo",
"repository": "npm/cli"
}"""
- result = MAPPINGS["NpmMapping"].translate(package_json)
+ result = self.npm_mapping.translate(package_json)
self.assertEqual(result, expected_result)
# gitlab shortcut
@@ -395,7 +400,7 @@
"name": "foo",
"repository": "gitlab:user/repo"
}"""
- result = MAPPINGS["NpmMapping"].translate(package_json)
+ result = self.npm_mapping.translate(package_json)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'name': 'foo',
@@ -549,7 +554,7 @@
"datePublished": "2017-06-05",
"programmingLanguage": "JSON-LD"
}
- result = MAPPINGS["CodemetaMapping"].translate(raw_content)
+ result = self.codemeta_mapping.translate(raw_content)
self.assertEqual(result, expected_result)
def test_compute_metadata_maven(self):
@@ -580,7 +585,7 @@
"""
- result = MAPPINGS["MavenMapping"].translate(raw_content)
+ result = self.maven_mapping.translate(raw_content)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
@@ -596,7 +601,7 @@
raw_content = b"""
"""
- result = MAPPINGS["MavenMapping"].translate(raw_content)
+ result = self.maven_mapping.translate(raw_content)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
@@ -607,20 +612,29 @@
"""
- result = MAPPINGS["MavenMapping"].translate(raw_content)
+ result = self.maven_mapping.translate(raw_content)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
})
def test_compute_metadata_maven_invalid_xml(self):
+ expected_warning = (
+ 'WARNING:swh.indexer.metadata_dictionary.MavenMapping:'
+ 'Error parsing XML from foo')
raw_content = b"""
"""
- result = MAPPINGS["MavenMapping"].translate(raw_content)
+ with self.assertLogs('swh.indexer.metadata_dictionary',
+ level='WARNING') as cm:
+ result = MAPPINGS["MavenMapping"]('foo').translate(raw_content)
+ self.assertEqual(cm.output, [expected_warning])
self.assertEqual(result, None)
raw_content = b"""
"""
- result = MAPPINGS["MavenMapping"].translate(raw_content)
+ with self.assertLogs('swh.indexer.metadata_dictionary',
+ level='WARNING') as cm:
+ result = MAPPINGS["MavenMapping"]('foo').translate(raw_content)
+ self.assertEqual(cm.output, [expected_warning])
self.assertEqual(result, None)
def test_compute_metadata_maven_minimal(self):
@@ -632,7 +646,7 @@
my-app
1.2.3
"""
- result = MAPPINGS["MavenMapping"].translate(raw_content)
+ result = self.maven_mapping.translate(raw_content)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
@@ -682,7 +696,7 @@
"""
- result = MAPPINGS["MavenMapping"].translate(raw_content)
+ result = self.maven_mapping.translate(raw_content)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
@@ -730,7 +744,7 @@
Description-Content-Type: text/markdown
Provides-Extra: testing
""") # noqa
- result = MAPPINGS["PythonPkginfoMapping"].translate(raw_content)
+ result = self.pkginfo_mapping.translate(raw_content)
self.assertCountEqual(result['description'], [
'Software Heritage core utilities', # note the comma here
'swh-core\n'
@@ -763,7 +777,7 @@
Name: foo
License: MIT
""") # noqa
- result = MAPPINGS["PythonPkginfoMapping"].translate(raw_content)
+ result = self.pkginfo_mapping.translate(raw_content)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
@@ -785,7 +799,7 @@
s.homepage = 'https://rubygems.org/gems/example'
s.metadata = { "source_code_uri" => "https://github.com/example/example" }
end"""
- result = MAPPINGS['GemspecMapping'].translate(raw_content)
+ result = self.gemspec_mapping.translate(raw_content)
self.assertCountEqual(result.pop('description'), [
"This is an example!",
"Much longer explanation of the example!"
@@ -807,7 +821,7 @@
s.authors = ["Ruby Coder1"]
s.author = "Ruby Coder2"
end"""
- result = MAPPINGS['GemspecMapping'].translate(raw_content)
+ result = self.gemspec_mapping.translate(raw_content)
self.assertCountEqual(result.pop('author'), [
'Ruby Coder1', 'Ruby Coder2'])
self.assertEqual(result, {
@@ -820,7 +834,7 @@
Gem::Specification.new do |s|
s.author = ["Ruby Coder"]
end"""
- result = MAPPINGS['GemspecMapping'].translate(raw_content)
+ result = self.gemspec_mapping.translate(raw_content)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
@@ -829,7 +843,7 @@
Gem::Specification.new do |s|
s.author = "Ruby Coder1",
end"""
- result = MAPPINGS['GemspecMapping'].translate(raw_content)
+ result = self.gemspec_mapping.translate(raw_content)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',
@@ -838,7 +852,7 @@
Gem::Specification.new do |s|
s.authors = ["Ruby Coder1", ["Ruby Coder2"]]
end"""
- result = MAPPINGS['GemspecMapping'].translate(raw_content)
+ result = self.gemspec_mapping.translate(raw_content)
self.assertEqual(result, {
'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
'type': 'SoftwareSourceCode',