Patch summary (free text before the first "diff --git" header; patch tools
skip it):

* swh/indexer/tests/test_metadata.py: drop the xmltodict import and feed
  test_maven_adversarial from xml_document_strategy (root 'project', Maven
  POM namespace) instead of calling xmltodict.unparse on a JSON document.
* swh/indexer/tests/utils.py: add _tree_to_xml, its TreeToXmlTest unit
  tests, and xml_document_strategy; add 'repositories' and 'licenses' to the
  list of emphasized strings; reword the json_document_strategy docstring.

Note: _tree_to_xml writes keys and leaf values verbatim (no XML escaping)
and encodes them as UTF-8 with errors='replace'.

diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -7,7 +7,6 @@
 import unittest
 
 from hypothesis import given, strategies
-import xmltodict
 
 from swh.model.hashutil import hash_to_bytes
 
@@ -23,7 +22,8 @@
 
 from .utils import (
     BASE_TEST_CONFIG, fill_obj_storage, fill_storage,
-    YARN_PARSER_METADATA, json_document_strategy
+    YARN_PARSER_METADATA, json_document_strategy,
+    xml_document_strategy,
 )
 
 
@@ -1069,11 +1069,12 @@
         raw = json.dumps(doc).encode()
         self.codemeta_mapping.translate(raw)
 
-    @given(json_document_strategy(
-        keys=list(MAPPINGS['MavenMapping'].mapping)))
+    @given(xml_document_strategy(
+        keys=list(MAPPINGS['MavenMapping'].mapping),
+        root='project',
+        xmlns='http://maven.apache.org/POM/4.0.0'))
     def test_maven_adversarial(self, doc):
-        raw = xmltodict.unparse({'project': doc}, pretty=True)
-        self.maven_mapping.translate(raw)
+        self.maven_mapping.translate(doc)
 
     @given(strategies.dictionaries(
         # keys
diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py
--- a/swh/indexer/tests/utils.py
+++ b/swh/indexer/tests/utils.py
@@ -5,8 +5,10 @@
 
 import abc
 import datetime
+import functools
 import hashlib
 import random
+import unittest
 
 from hypothesis import strategies
 
@@ -401,6 +403,7 @@
     strategies.characters(),
     *map(strategies.just, ['type', 'url', 'name', 'email', '@id',
                            '@context', 'repository', 'license',
+                           'repositories', 'licenses'
                            ]),
 )
 """Hypothesis strategy that generates strings, with an emphasis on those
@@ -421,7 +424,7 @@
 
 def json_document_strategy(keys=None):
     """Generates an hypothesis strategy that generates metadata files
-    for a format that uses the given keys."""
+    for a JSON-based format that uses the given keys."""
     if keys is None:
         keys = strategies.characters()
     else:
@@ -430,6 +433,98 @@
     return strategies.dictionaries(keys, generic_json_document, min_size=1)
 
 
+def _tree_to_xml(root, xmlns, data):
+    def encode(s):
+        "Skips unpaired surrogates generated by json_document_strategy"
+        return s.encode('utf8', 'replace')
+
+    def to_xml(data, indent=b' '):
+        if data is None:
+            return b''
+        elif isinstance(data, (bool, str, int, float)):
+            return indent + encode(str(data))
+        elif isinstance(data, list):
+            return b'\n'.join(to_xml(v, indent=indent) for v in data)
+        elif isinstance(data, dict):
+            lines = []
+            for (key, value) in data.items():
+                lines.append(indent + encode('<{}>'.format(key)))
+                lines.append(to_xml(value, indent=indent+b' '))
+                lines.append(indent + encode('</{}>'.format(key)))
+            return b'\n'.join(lines)
+        else:
+            raise TypeError(data)
+
+    return b'\n'.join([
+        '<{} xmlns="{}">'.format(root, xmlns).encode(),
+        to_xml(data),
+        '</{}>'.format(root).encode(),
+    ])
+
+
+class TreeToXmlTest(unittest.TestCase):
+    def test_leaves(self):
+        self.assertEqual(
+            _tree_to_xml('root', 'http://example.com', None),
+            b'<root xmlns="http://example.com">\n\n</root>'
+        )
+        self.assertEqual(
+            _tree_to_xml('root', 'http://example.com', True),
+            b'<root xmlns="http://example.com">\n True\n</root>'
+        )
+        self.assertEqual(
+            _tree_to_xml('root', 'http://example.com', 'abc'),
+            b'<root xmlns="http://example.com">\n abc\n</root>'
+        )
+        self.assertEqual(
+            _tree_to_xml('root', 'http://example.com', 42),
+            b'<root xmlns="http://example.com">\n 42\n</root>'
+        )
+        self.assertEqual(
+            _tree_to_xml('root', 'http://example.com', 3.14),
+            b'<root xmlns="http://example.com">\n 3.14\n</root>'
+        )
+
+    def test_dict(self):
+        self.assertIn(
+            _tree_to_xml('root', 'http://example.com', {
+                'foo': 'bar',
+                'baz': 'qux'
+            }),
+            [
+                b'<root xmlns="http://example.com">\n'
+                b' <foo>\n  bar\n </foo>\n'
+                b' <baz>\n  qux\n </baz>\n'
+                b'</root>',
+                b'<root xmlns="http://example.com">\n'
+                b' <baz>\n  qux\n </baz>\n'
+                b' <foo>\n  bar\n </foo>\n'
+                b'</root>'
+            ]
+        )
+
+    def test_list(self):
+        self.assertEqual(
+            _tree_to_xml('root', 'http://example.com', [
+                {'foo': 'bar'},
+                {'foo': 'baz'},
+            ]),
+            b'<root xmlns="http://example.com">\n'
+            b' <foo>\n  bar\n </foo>\n'
+            b' <foo>\n  baz\n </foo>\n'
+            b'</root>'
+        )
+
+
+def xml_document_strategy(keys, root, xmlns):
+    """Generates an hypothesis strategy that generates metadata files
+    for an XML format that uses the given keys."""
+
+    return strategies.builds(
+        functools.partial(_tree_to_xml, root, xmlns),
+        json_document_strategy(keys))
+
+
 def filter_dict(d, keys):
     'return a copy of the dict with keys deleted'
     if not isinstance(keys, (list, tuple)):