def xml_document_strategy(keys, root, xmlns):
    """Generates an hypothesis strategy that generates metadata files
    for an XML format that uses the given keys.

    Args:
        keys: element names to use; passed unchanged to
            json_document_strategy, which generates the documents.
        root: name of the root element wrapping each document.
        xmlns: value of the root element's ``xmlns`` attribute.

    Returns:
        the result of ``strategies.builds`` applied to the serializer
        below and the JSON document strategy; the serializer itself
        returns ``bytes``.
    """
    def encode(s):
        """Encodes to UTF-8, replacing what cannot be encoded (e.g. the
        unpaired surrogates generated by json_document_strategy) with
        ``?`` instead of raising UnicodeEncodeError."""
        return s.encode('utf8', 'replace')

    def to_xml(data, indent=b' '):
        """Recursively serializes a JSON-like value to an XML fragment.

        Raises TypeError on a value that is not None, a scalar, a list
        or a dict."""
        if data is None:
            return b''
        elif isinstance(data, (bool, str, int, float)):
            # NOTE: the text is emitted as-is, without any XML escaping.
            return encode(str(data))
        elif isinstance(data, list):
            # List items are concatenated at the current indentation;
            # no wrapping element is added around each item.
            return b'\n'.join(to_xml(v, indent=indent) for v in data)
        elif isinstance(data, dict):
            lines = []
            for (key, value) in data.items():
                lines.append(indent + encode('<{}>'.format(key)))
                lines.append(indent + to_xml(value, indent=indent+b' '))
                # Every opening tag needs its matching closing tag,
                # otherwise the document is not well-formed.
                lines.append(indent + encode('</{}>'.format(key)))
            return b'\n'.join(lines)
        else:
            raise TypeError(data)

    def to_root_xml(data):
        """Wraps the serialized document in the namespaced root element."""
        return b'\n'.join([
            '<{} xmlns="{}">'.format(root, xmlns).encode(),
            to_xml(data),
            '</{}>'.format(root).encode(),
        ])

    return strategies.builds(to_root_xml, json_document_strategy(keys))