diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py
--- a/swh/indexer/metadata_dictionary.py
+++ b/swh/indexer/metadata_dictionary.py
@@ -10,6 +10,7 @@
 import logging
 import email.parser
 import xml.parsers.expat
+import email.policy
 
 import xmltodict
 
@@ -425,6 +426,27 @@
 _normalize_pkginfo_key = str.lower
 
 
+class LinebreakPreservingEmailPolicy(email.policy.EmailPolicy):
+    """Email policy whose fetched header values keep their line breaks.
+
+    The stock :class:`email.policy.EmailPolicy` removes CR and LF
+    characters from a header value before building the header object;
+    this policy only drops the space that follows each line feed, so a
+    multi-line value (e.g. a PKG-INFO description) keeps its lines.
+    """
+
+    def header_fetch_parse(self, name, value):
+        """Return the header object for `value`, line breaks included.
+
+        Values that already have a ``name`` attribute are returned as-is.
+        """
+        if hasattr(value, 'name'):
+            return value
+        # Strip the space following each line feed; keep the line feed.
+        value = value.replace('\n ', '\n')
+        return self.header_factory(name, value)
+
+
 @register_mapping
 class PythonPkginfoMapping(DictMapping, SingleFileMapping):
     """Dedicated class for Python's PKG-INFO mapping and translation.
@@ -434,7 +456,8 @@
     mapping = {_normalize_pkginfo_key(k): v
                for (k, v) in CROSSWALK_TABLE['Python PKG-INFO'].items()}
 
-    _parser = email.parser.BytesHeaderParser()
+    _parser = email.parser.BytesHeaderParser(
+        policy=LinebreakPreservingEmailPolicy())
 
     def translate(self, content):
         msg = self._parser.parsebytes(content)
diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -690,14 +690,14 @@
         self.assertCountEqual(result['description'], [
             'Software Heritage core utilities',  # note the comma here
             'swh-core\n'
-            ' ========\n'
-            ' \n'
-            " core library for swh's modules:\n"
-            ' - config parser\n'
-            ' - hash computations\n'
-            ' - serialization\n'
-            ' - logging mechanism\n'
-            ' '],
+            '========\n'
+            '\n'
+            "core library for swh's modules:\n"
+            '- config parser\n'
+            '- hash computations\n'
+            '- serialization\n'
+            '- logging mechanism\n'
+            ''],
             result)
         del result['description']
         self.assertEqual(result, {
@@ -713,6 +713,23 @@
             'version': '0.0.49',
         })
 
+    def test_compute_metadata_pkginfo_utf8(self):
+        """UTF-8 bytes in a multi-line Description are decoded."""
+        raw_content = (b'''\
+Metadata-Version: 1.1
+Name: snowpyt
+Description-Content-Type: UNKNOWN
+Description: foo
+ Hydrology N\xc2\xb083
+''')  # noqa
+        result = MAPPINGS["PythonPkginfoMapping"].translate(raw_content)
+        self.assertEqual(result, {
+            '@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
+            'type': 'SoftwareSourceCode',
+            'name': 'snowpyt',
+            'description': 'foo\nHydrology N°83',
+        })
+
     def test_compute_metadata_pkginfo_license(self):
         raw_content = (b"""\
 Metadata-Version: 2.1