# Patch summary. This preamble sits before the first "diff --git" line and is
# commentary only; no hunk content or hunk header has been altered.
#
# swh/indexer/metadata_dictionary.py
#   The existing handler logs 'Error parsing XML from %s' and returns None on
#   xml.parsers.expat.ExpatError. This patch adds two more handlers after it,
#   each logging a warning through self.log and returning None:
#     * UnicodeDecodeError         -> 'Error unidecoding XML from %s'
#     * (LookupError, ValueError)  -> 'Error detecting XML encoding from %s'
#   UnicodeDecodeError is a subclass of ValueError, so its handler must stay
#   above the (LookupError, ValueError) handler to remain reachable.
#
# swh/indexer/tests/test_metadata.py
#   * Adds a blank line before two existing `raw_content = ...` assignments.
#   * Adds test_compute_metadata_maven_unknown_encoding: two inputs, each
#     expected to yield exactly the warning
#     'Error detecting XML encoding from foo' and a None result from
#     MAPPINGS["MavenMapping"]('foo').translate(raw_content).
#   * Adds test_compute_metadata_maven_invalid_encoding: one input expected to
#     yield exactly the warning 'Error unidecoding XML from foo' and None.
#
# NOTE(review): the b"""...""" fixtures contain only whitespace in this copy;
# presumably the XML payloads were lost during extraction -- confirm against
# the original commit before applying.
# NOTE(review): indentation was reconstructed from a whitespace-collapsed copy
# and blank context lines are not recoverable; the final hunk also stops right
# after `raw_content = b"""`. Verify hunk line counts before applying.
diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py
--- a/swh/indexer/metadata_dictionary.py
+++ b/swh/indexer/metadata_dictionary.py
@@ -372,6 +372,14 @@
         except xml.parsers.expat.ExpatError:
             self.log.warning('Error parsing XML from %s', self.log_suffix)
             return None
+        except UnicodeDecodeError:
+            self.log.warning('Error unidecoding XML from %s', self.log_suffix)
+            return None
+        except (LookupError, ValueError):
+            # unknown encoding or multi-byte encoding
+            self.log.warning('Error detecting XML encoding from %s',
+                             self.log_suffix)
+            return None
         metadata = self.translate_dict(d, normalize=False)
         metadata[SCHEMA_URI+'codeRepository'] = self.parse_repositories(d)
         metadata[SCHEMA_URI+'license'] = self.parse_licenses(d)
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -629,6 +629,7 @@
         expected_warning = (
             'WARNING:swh.indexer.metadata_dictionary.MavenMapping:'
             'Error parsing XML from foo')
+
         raw_content = b"""
         """
         with self.assertLogs('swh.indexer.metadata_dictionary',
@@ -636,6 +637,7 @@
             result = MAPPINGS["MavenMapping"]('foo').translate(raw_content)
         self.assertEqual(cm.output, [expected_warning])
         self.assertEqual(result, None)
+
         raw_content = b"""
         """
         with self.assertLogs('swh.indexer.metadata_dictionary',
@@ -644,6 +646,43 @@
         self.assertEqual(cm.output, [expected_warning])
         self.assertEqual(result, None)
+    def test_compute_metadata_maven_unknown_encoding(self):
+        expected_warning = (
+            'WARNING:swh.indexer.metadata_dictionary.MavenMapping:'
+            'Error detecting XML encoding from foo')
+
+        raw_content = b"""
+
+        """
+        with self.assertLogs('swh.indexer.metadata_dictionary',
+                             level='WARNING') as cm:
+            result = MAPPINGS["MavenMapping"]('foo').translate(raw_content)
+        self.assertEqual(cm.output, [expected_warning])
+        self.assertEqual(result, None)
+
+        raw_content = b"""
+
+        """
+        with self.assertLogs('swh.indexer.metadata_dictionary',
+                             level='WARNING') as cm:
+            result = MAPPINGS["MavenMapping"]('foo').translate(raw_content)
+        self.assertEqual(cm.output, [expected_warning])
+        self.assertEqual(result, None)
+
+    def test_compute_metadata_maven_invalid_encoding(self):
+        expected_warning = (
+            'WARNING:swh.indexer.metadata_dictionary.MavenMapping:'
+            'Error unidecoding XML from foo')
+
+        raw_content = b"""
+
+        """
+        with self.assertLogs('swh.indexer.metadata_dictionary',
+                             level='WARNING') as cm:
+            result = MAPPINGS["MavenMapping"]('foo').translate(raw_content)
+        self.assertEqual(cm.output, [expected_warning])
+        self.assertEqual(result, None)
+
     def test_compute_metadata_maven_minimal(self):
         raw_content = b"""