diff --git a/swh/indexer/tests/test_metadata.py b/swh/indexer/tests/test_metadata.py
--- a/swh/indexer/tests/test_metadata.py
+++ b/swh/indexer/tests/test_metadata.py
@@ -7,7 +7,6 @@
import unittest
from hypothesis import given, strategies
-import xmltodict
from swh.model.hashutil import hash_to_bytes
@@ -23,7 +22,8 @@
from .utils import (
BASE_TEST_CONFIG, fill_obj_storage, fill_storage,
- YARN_PARSER_METADATA, json_document_strategy
+ YARN_PARSER_METADATA, json_document_strategy,
+ xml_document_strategy,
)
@@ -1069,11 +1069,12 @@
raw = json.dumps(doc).encode()
self.codemeta_mapping.translate(raw)
- @given(json_document_strategy(
- keys=list(MAPPINGS['MavenMapping'].mapping)))
+ @given(xml_document_strategy(
+ keys=list(MAPPINGS['MavenMapping'].mapping),
+ root='project',
+ xmlns='http://maven.apache.org/POM/4.0.0'))
def test_maven_adversarial(self, doc):
+ # No assertion: this only fails if translate() raises on a generated
+ # document.
- raw = xmltodict.unparse({'project': doc}, pretty=True)
- self.maven_mapping.translate(raw)
+ self.maven_mapping.translate(doc)
@given(strategies.dictionaries(
# keys
diff --git a/swh/indexer/tests/utils.py b/swh/indexer/tests/utils.py
--- a/swh/indexer/tests/utils.py
+++ b/swh/indexer/tests/utils.py
@@ -5,8 +5,10 @@
import abc
import datetime
+import functools
import hashlib
import random
+import unittest
from hypothesis import strategies
@@ -401,6 +403,7 @@
strategies.characters(),
*map(strategies.just, ['type', 'url', 'name', 'email', '@id',
'@context', 'repository', 'license',
+ 'repositories', 'licenses'
]),
)
"""Hypothesis strategy that generates strings, with an emphasis on those
@@ -421,7 +424,7 @@
def json_document_strategy(keys=None):
"""Generates an hypothesis strategy that generates metadata files
- for a format that uses the given keys."""
+ for a JSON-based format that uses the given keys."""
if keys is None:
keys = strategies.characters()
else:
@@ -430,6 +433,98 @@
return strategies.dictionaries(keys, generic_json_document, min_size=1)
+def _tree_to_xml(root, xmlns, data):
+    """Serializes a JSON-like tree to an XML document.
+
+    Args:
+        root (str): name of the root element
+        xmlns (str): value of the root element's `xmlns` attribute
+        data: the tree to serialize: None, a bool/str/int/float leaf,
+            a list of trees, or a dict mapping element names to trees
+
+    Returns:
+        bytes: the document, with tags and leaves separated by newlines
+
+    Raises:
+        TypeError: if the tree contains a value of any other type
+
+    Element names and leaf text are written verbatim, without any
+    XML escaping."""
+    def encode(s):
+        """Encodes to UTF-8, replacing characters that cannot be encoded
+        (such as the unpaired surrogates json_document_strategy may
+        generate) instead of raising."""
+        return s.encode('utf8', 'replace')
+
+    def to_xml(data, indent=b' '):
+        if data is None:
+            return b''
+        elif isinstance(data, (bool, str, int, float)):
+            return indent + encode(str(data))
+        elif isinstance(data, list):
+            # Items are siblings: same indentation, no wrapping element.
+            return b'\n'.join(to_xml(v, indent=indent) for v in data)
+        elif isinstance(data, dict):
+            lines = []
+            for (key, value) in data.items():
+                lines.append(indent + encode('<{}>'.format(key)))
+                lines.append(to_xml(value, indent=indent+b' '))
+                # The closing tag needs its '</' prefix to pair with the
+                # opening tag above.
+                lines.append(indent + encode('</{}>'.format(key)))
+            return b'\n'.join(lines)
+        else:
+            raise TypeError(data)
+
+    return b'\n'.join([
+        '<{} xmlns="{}">'.format(root, xmlns).encode(),
+        to_xml(data),
+        '</{}>'.format(root).encode(),
+    ])
+
+
+class TreeToXmlTest(unittest.TestCase):
+    """Checks the exact bytes produced by _tree_to_xml."""
+
+    def test_leaves(self):
+        self.assertEqual(
+            _tree_to_xml('root', 'http://example.com', None),
+            b'<root xmlns="http://example.com">\n\n</root>'
+        )
+        self.assertEqual(
+            _tree_to_xml('root', 'http://example.com', True),
+            b'<root xmlns="http://example.com">\n True\n</root>'
+        )
+        self.assertEqual(
+            _tree_to_xml('root', 'http://example.com', 'abc'),
+            b'<root xmlns="http://example.com">\n abc\n</root>'
+        )
+        self.assertEqual(
+            _tree_to_xml('root', 'http://example.com', 42),
+            b'<root xmlns="http://example.com">\n 42\n</root>'
+        )
+        self.assertEqual(
+            _tree_to_xml('root', 'http://example.com', 3.14),
+            b'<root xmlns="http://example.com">\n 3.14\n</root>'
+        )
+
+    def test_dict(self):
+        # Both key orders are accepted, so the test does not rely on
+        # dict iteration order.
+        self.assertIn(
+            _tree_to_xml('root', 'http://example.com', {
+                'foo': 'bar',
+                'baz': 'qux'
+            }),
+            [
+                b'<root xmlns="http://example.com">\n'
+                b' <foo>\n  bar\n </foo>\n'
+                b' <baz>\n  qux\n </baz>\n'
+                b'</root>',
+                b'<root xmlns="http://example.com">\n'
+                b' <baz>\n  qux\n </baz>\n'
+                b' <foo>\n  bar\n </foo>\n'
+                b'</root>'
+            ]
+        )
+
+    def test_list(self):
+        self.assertEqual(
+            _tree_to_xml('root', 'http://example.com', [
+                {'foo': 'bar'},
+                {'foo': 'baz'},
+            ]),
+            b'<root xmlns="http://example.com">\n'
+            b' <foo>\n  bar\n </foo>\n'
+            b' <foo>\n  baz\n </foo>\n'
+            b'</root>'
+        )
+
+
+def xml_document_strategy(keys, root, xmlns):
+    """Builds an hypothesis strategy producing XML metadata documents.
+
+    Each document is a tree drawn from `json_document_strategy(keys)`,
+    serialized by `_tree_to_xml` with the given root element name and
+    namespace."""
+    def serialize(tree):
+        return _tree_to_xml(root, xmlns, tree)
+
+    trees = json_document_strategy(keys)
+    return strategies.builds(serialize, trees)
+
+
def filter_dict(d, keys):
'return a copy of the dict with keys deleted'
if not isinstance(keys, (list, tuple)):