diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py --- a/swh/indexer/metadata.py +++ b/swh/indexer/metadata.py @@ -43,7 +43,7 @@ } for sha1 in ids )) - def index(self, id, data): + def index(self, id, data, log_suffix='unknown revision'): """Index sha1s' content and store result. Args: @@ -63,8 +63,9 @@ } try: mapping_name = self.tool['tool_configuration']['context'] - result['translated_metadata'] = MAPPINGS[mapping_name] \ - .translate(data) + log_suffix += ', content_id=%s' % hashutil.hash_to_hex(id) + result['translated_metadata'] = \ + MAPPINGS[mapping_name](log_suffix).translate(data) except Exception: self.log.exception( "Problem during metadata translation " @@ -111,7 +112,7 @@ 'version': '0.0.2', 'configuration': { 'type': 'local', - 'context': ['NpmMapping', 'CodemetaMapping'] + 'context': list(MAPPINGS), }, }), } @@ -158,7 +159,9 @@ files = [entry for entry in dir_ls if entry['type'] == 'file'] detected_files = detect_metadata(files) result['translated_metadata'] = self.translate_revision_metadata( - detected_files) + detected_files, + log_suffix='revision=%s' % hashutil.hash_to_hex(rev['id']) + ) except Exception as e: self.log.exception( 'Problem when indexing rev: %r', e) @@ -181,7 +184,7 @@ self.idx_storage.revision_metadata_add( results, conflict_update=(policy_update == 'update-dups')) - def translate_revision_metadata(self, detected_files): + def translate_revision_metadata(self, detected_files, log_suffix): """ Determine plan of action to translate metadata when containing one or multiple detected files: @@ -236,7 +239,8 @@ # content indexing try: c_metadata_indexer.run(sha1s_filtered, - policy_update='ignore-dups') + policy_update='ignore-dups', + log_suffix=log_suffix) # on the fly possibility: for result in c_metadata_indexer.results: local_metadata = result['translated_metadata'] diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py --- a/swh/indexer/metadata_dictionary.py +++ b/swh/indexer/metadata_dictionary.py @@ -23,7 +23,7 @@ def register_mapping(cls): - MAPPINGS[cls.__name__] = cls() + MAPPINGS[cls.__name__] = cls return cls @@ -69,13 +69,15 @@ - inherit this class - override translate function """ - def __init__(self): + def __init__(self, log_suffix=''): + self.log_suffix = log_suffix self.log = logging.getLogger('%s.%s' % ( self.__class__.__module__, self.__class__.__name__)) + @classmethod @abc.abstractmethod - def detect_metadata_files(self, files): + def detect_metadata_files(cls, files): """ Detects files potentially containing metadata @@ -104,9 +106,10 @@ """The .json file to extract metadata from.""" pass - def detect_metadata_files(self, file_entries): + @classmethod + def detect_metadata_files(cls, file_entries): for entry in file_entries: - if entry['name'] == self.filename: + if entry['name'] == cls.filename: return [entry['sha1']] return [] @@ -184,12 +187,12 @@ try: raw_content = raw_content.decode() except UnicodeDecodeError: - self.log.warning('Error unidecoding %r', raw_content) + self.log.warning('Error unidecoding from %s', self.log_suffix) return try: content_dict = json.loads(raw_content) except json.JSONDecodeError: - self.log.warning('Error unjsoning %r' % raw_content) + self.log.warning('Error unjsoning from %s', self.log_suffix) return return self.translate_dict(content_dict) @@ -355,7 +358,7 @@ try: d = xmltodict.parse(content).get('project') or {} except xml.parsers.expat.ExpatError: - self.log.warning('Error parsing XML of %r', content) + self.log.warning('Error parsing XML from %s', self.log_suffix) return None metadata = self.translate_dict(d, normalize=False) metadata[SCHEMA_URI+'codeRepository'] = self.parse_repositories(d) @@ -512,7 +515,8 @@ mapping = CROSSWALK_TABLE['Ruby Gem'] - def detect_metadata_files(self, file_entries): + @classmethod + def detect_metadata_files(cls, file_entries): for entry in file_entries: if entry['name'].endswith(b'.gemspec'): return [entry['sha1']] @@ -522,7 +526,7 @@ try: raw_content = raw_content.decode() except UnicodeDecodeError: - self.log.warning('Error unidecoding %r', raw_content) + self.log.warning('Error unidecoding from %s', self.log_suffix) return # Skip lines before 'Gem::Specification.new' @@ -533,8 +537,8 @@ try: next(lines) # Consume 'Gem::Specification.new' except StopIteration: - self.log.warning('Could not find Gem::Specification in %r', - raw_content) + self.log.warning('Could not find Gem::Specification in %s', + self.log_suffix) return content_dict = {} @@ -607,19 +611,3 @@ if isinstance(authors, list): return {"@list": [author for author in authors if isinstance(author, str)]} - - -def main(): - raw_content = """{"name": "test_name", "unknown_term": "ut"}""" - raw_content1 = b"""{"name": "test_name", - "unknown_term": "ut", - "prerequisites" :"packageXYZ"}""" - result = MAPPINGS["NpmMapping"].translate(raw_content) - result1 = MAPPINGS["MavenMapping"].translate(raw_content1) - - print(result) - print(result1) - - -if __name__ == "__main__": - main() diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py --- a/swh/indexer/tests/test_metadata.py +++ b/swh/indexer/tests/test_metadata.py @@ -62,6 +62,11 @@ shows the entire diff in the results """ self.maxDiff = None + self.npm_mapping = MAPPINGS['NpmMapping']() + self.codemeta_mapping = MAPPINGS['CodemetaMapping']() + self.maven_mapping = MAPPINGS['MavenMapping']() + self.pkginfo_mapping = MAPPINGS['PythonPkginfoMapping']() + self.gemspec_mapping = MAPPINGS['GemspecMapping']() def test_crosstable(self): self.assertEqual(CROSSWALK_TABLE['NodeJS'], { @@ -137,7 +142,7 @@ # None if no metadata was found or an error occurred declared_metadata = None # when - result = MAPPINGS["NpmMapping"].translate(content) + result = self.npm_mapping.translate(content) # then self.assertEqual(declared_metadata, result) @@ -177,7 +182,7 @@ } # when - result = MAPPINGS["NpmMapping"].translate(content) + result = self.npm_mapping.translate(content) # then self.assertEqual(declared_metadata, result) @@ -302,7 +307,7 @@ "email": "foo@example.com" } }""" - result = MAPPINGS["NpmMapping"].translate(package_json) + result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', @@ -317,7 +322,7 @@ "email": "foo@example.com" } }""" - result = MAPPINGS["NpmMapping"].translate(package_json) + result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', @@ -329,7 +334,7 @@ "name": "foo", "bugs": "https://github.com/owner/project/issues" }""" - result = MAPPINGS["NpmMapping"].translate(package_json) + result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', @@ -346,7 +351,7 @@ "url" : "https://github.com/npm/cli.git" } }""" - result = MAPPINGS["NpmMapping"].translate(package_json) + result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', @@ -361,7 +366,7 @@ "type" : "git" } }""" - result = MAPPINGS["NpmMapping"].translate(package_json) + result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', @@ -373,7 +378,7 @@ "name": "foo", "repository": "github:npm/cli" }""" - result = MAPPINGS["NpmMapping"].translate(package_json) + result = self.npm_mapping.translate(package_json) expected_result = { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', @@ -387,7 +392,7 @@ "name": "foo", "repository": "npm/cli" }""" - result = MAPPINGS["NpmMapping"].translate(package_json) + result = self.npm_mapping.translate(package_json) self.assertEqual(result, expected_result) # gitlab shortcut @@ -395,7 +400,7 @@ "name": "foo", "repository": "gitlab:user/repo" }""" - result = MAPPINGS["NpmMapping"].translate(package_json) + result = self.npm_mapping.translate(package_json) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'name': 'foo', @@ -549,7 +554,7 @@ "datePublished": "2017-06-05", "programmingLanguage": "JSON-LD" } - result = MAPPINGS["CodemetaMapping"].translate(raw_content) + result = self.codemeta_mapping.translate(raw_content) self.assertEqual(result, expected_result) def test_compute_metadata_maven(self): @@ -580,7 +585,7 @@ """ - result = MAPPINGS["MavenMapping"].translate(raw_content) + result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', @@ -596,7 +601,7 @@ raw_content = b""" """ - result = MAPPINGS["MavenMapping"].translate(raw_content) + result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', @@ -607,20 +612,29 @@ """ - result = MAPPINGS["MavenMapping"].translate(raw_content) + result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', }) def test_compute_metadata_maven_invalid_xml(self): + expected_warning = ( + 'WARNING:swh.indexer.metadata_dictionary.MavenMapping:' + 'Error parsing XML from foo') raw_content = b""" """ - result = MAPPINGS["MavenMapping"].translate(raw_content) + with self.assertLogs('swh.indexer.metadata_dictionary', + level='WARNING') as cm: + result = MAPPINGS["MavenMapping"]('foo').translate(raw_content) + self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) raw_content = b""" """ - result = MAPPINGS["MavenMapping"].translate(raw_content) + with self.assertLogs('swh.indexer.metadata_dictionary', + level='WARNING') as cm: + result = MAPPINGS["MavenMapping"]('foo').translate(raw_content) + self.assertEqual(cm.output, [expected_warning]) self.assertEqual(result, None) def test_compute_metadata_maven_minimal(self): @@ -632,7 +646,7 @@ my-app 1.2.3 """ - result = MAPPINGS["MavenMapping"].translate(raw_content) + result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', @@ -682,7 +696,7 @@ """ - result = MAPPINGS["MavenMapping"].translate(raw_content) + result = self.maven_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', @@ -730,7 +744,7 @@ Description-Content-Type: text/markdown Provides-Extra: testing """) # noqa - result = MAPPINGS["PythonPkginfoMapping"].translate(raw_content) + result = self.pkginfo_mapping.translate(raw_content) self.assertCountEqual(result['description'], [ 'Software Heritage core utilities', # note the comma here 'swh-core\n' @@ -763,7 +777,7 @@ Name: foo License: MIT """) # noqa - result = MAPPINGS["PythonPkginfoMapping"].translate(raw_content) + result = self.pkginfo_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', @@ -785,7 +799,7 @@ s.homepage = 'https://rubygems.org/gems/example' s.metadata = { "source_code_uri" => "https://github.com/example/example" } end""" - result = MAPPINGS['GemspecMapping'].translate(raw_content) + result = self.gemspec_mapping.translate(raw_content) self.assertCountEqual(result.pop('description'), [ "This is an example!", "Much longer explanation of the example!" @@ -807,7 +821,7 @@ s.authors = ["Ruby Coder1"] s.author = "Ruby Coder2" end""" - result = MAPPINGS['GemspecMapping'].translate(raw_content) + result = self.gemspec_mapping.translate(raw_content) self.assertCountEqual(result.pop('author'), [ 'Ruby Coder1', 'Ruby Coder2']) self.assertEqual(result, { @@ -820,7 +834,7 @@ Gem::Specification.new do |s| s.author = ["Ruby Coder"] end""" - result = MAPPINGS['GemspecMapping'].translate(raw_content) + result = self.gemspec_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', @@ -829,7 +843,7 @@ Gem::Specification.new do |s| s.author = "Ruby Coder1", end""" - result = MAPPINGS['GemspecMapping'].translate(raw_content) + result = self.gemspec_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', @@ -838,7 +852,7 @@ Gem::Specification.new do |s| s.authors = ["Ruby Coder1", ["Ruby Coder2"]] end""" - result = MAPPINGS['GemspecMapping'].translate(raw_content) + result = self.gemspec_mapping.translate(raw_content) self.assertEqual(result, { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode',