diff --git a/swh/deposit/parsers.py b/swh/deposit/parsers.py
--- a/swh/deposit/parsers.py
+++ b/swh/deposit/parsers.py
@@ -8,6 +8,7 @@
"""
+from collections import defaultdict
from decimal import Decimal
from rest_framework.parsers import FileUploadParser
from rest_framework.parsers import MultiPartParser
@@ -22,13 +23,76 @@
class SWHFileUploadTarParser(FileUploadParser):
- """File upload parser limited to zip archive.
+ """File upload parser limited to tarball (tar, tar.gz, tar.*) archives.
"""
+ # NOTE(review): only 'application/x-tar' is declared below although the
+ # docstring also mentions compressed tarballs (tar.gz, tar.*) -- confirm
+ # that such uploads are sent with this media type as well.
media_type = 'application/x-tar'
-class SWHXMLParser(XMLParser):
+class ListXMLParser(XMLParser):
+ """Patch XMLParser behavior to not merge duplicated key entries.
+
+ """
+ # special tags that must be cast to list
+ _tags = [
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}license',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}programmingLanguage',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}runtimePlatform',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author',
+ ]
+
+ # converted tags to list
+ _lists = None
+
+ def __init__(self):
+ self._reset()
+
+ def _reset(self):
+ self._lists = defaultdict(list)
+
+ def parse(self, stream, media_type=None, parser_context=None):
+ data = super().parse(
+ stream, media_type=media_type, parser_context=parser_context)
+ # Overriding and updating the list values
+ for key, value in self._lists.items():
+ data[key] = value
+ self._reset()
+ return data
+
+ def _xml_convert(self, element):
+ """This patches the default behavior to detect entries that must be
+ list. The current XMLParser's behavior is not correct as it
+ merges entries with the same name.
+
+ """
+ children = list(element)
+ if len(children) == 0:
+ data = self._type_convert(element.text)
+ if element.tag in self._tags:
+ if data not in self._lists[element.tag]:
+ self._lists[element.tag].append(data)
+ return data
+
+ # if the first child tag is list-item, it means all
+ # children are list-item
+ if children[0].tag == "list-item":
+ data = []
+ for child in children:
+ data.append(self._xml_convert(child))
+ return data
+
+ data = {}
+ for child in children:
+ data[child.tag] = self._xml_convert(child)
+
+ if element.tag in self._tags:
+ if data not in self._lists[element.tag]:
+ self._lists[element.tag].append(data)
+
+ return data
+
+
+class SWHXMLParser(ListXMLParser):
def _type_convert(self, value):
"""Override the default type converter to avoid having decimal in the
resulting output.
diff --git a/swh/deposit/tests/api/test_parser.py b/swh/deposit/tests/api/test_parser.py
new file mode 100644
--- /dev/null
+++ b/swh/deposit/tests/api/test_parser.py
@@ -0,0 +1,116 @@
+# Copyright (C) 2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import io
+
+from nose.tools import istest
+from rest_framework.test import APITestCase
+
+from swh.deposit.parsers import SWHXMLParser
+
+
+class ParsingTest(APITestCase):
+ """Check how SWHXMLParser converts codemeta entries into lists
+
+ """
+ @istest
+ def parsing_without_duplicates(self):
+ # author, license, programmingLanguage and runtimePlatform occur once
+ # each and are still expected as one-element lists; title and
+ # issueTracker are expected as plain values.
+ xml_no_duplicate = io.BytesIO(b'''
+
+ Awesome Compiler
+
+ GPL3.0
+ https://opensource.org/licenses/GPL-3.0
+
+ Python3
+
+ author1
+ Inria
+
+ ocaml
+ http://issuetracker.com
+ ''')
+
+ actual_result = SWHXMLParser().parse(xml_no_duplicate)
+ expected_dict = {
+ '{http://www.w3.org/2005/Atom}title':
+ 'Awesome Compiler',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author':
+ [{'{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}affiliation':
+ 'Inria',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name':
+ 'author1'}],
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}issueTracker':
+ 'http://issuetracker.com',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}license':
+ [{'{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name':
+ 'GPL3.0',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}url':
+ 'https://opensource.org/licenses/GPL-3.0'}],
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}programmingLanguage':
+ ['ocaml'],
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}runtimePlatform':
+ ['Python3']
+ }
+ self.assertEqual(expected_dict, actual_result)
+
+ @istest
+ def parsing_with_duplicates(self):
+ # The repeated entries are expected to be all kept, in one list per
+ # tag, instead of being merged into a single value.
+ xml_with_duplicates = io.BytesIO(b'''
+
+ Another Compiler
+ GNU/Linux
+
+ GPL3.0
+ https://opensource.org/licenses/GPL-3.0
+
+ Un*x
+
+ author1
+ Inria
+
+
+ author2
+ Inria
+
+ ocaml
+ haskell
+
+ spdx
+ http://spdx.org
+
+ python3
+ ''')
+
+ actual_result = SWHXMLParser().parse(xml_with_duplicates)
+
+ expected_dict = {
+ '{http://www.w3.org/2005/Atom}title':
+ 'Another Compiler',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author': [
+ {'{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}affiliation':
+ 'Inria',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name':
+ 'author1'},
+ {'{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}affiliation':
+ 'Inria',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name':
+ 'author2'}],
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}license': [
+ {'{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name':
+ 'GPL3.0',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}url':
+ 'https://opensource.org/licenses/GPL-3.0'},
+ {'{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name':
+ 'spdx',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}url':
+ 'http://spdx.org'}],
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}programmingLanguage':
+ [ 'ocaml', 'haskell', 'python3'],
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}runtimePlatform':
+ ['GNU/Linux', 'Un*x'] }
+ self.assertEqual(expected_dict, actual_result)
diff --git a/swh/deposit/tests/loader/test_loader.py b/swh/deposit/tests/loader/test_loader.py
--- a/swh/deposit/tests/loader/test_loader.py
+++ b/swh/deposit/tests/loader/test_loader.py
@@ -232,7 +232,7 @@
"""
self.deposit_metadata_id = self.add_metadata_to_deposit(
- self.deposit_id)
+ self.deposit_id)
args = [self.collection.name, self.deposit_metadata_id]
archive_url = reverse(PRIVATE_GET_RAW_CONTENT, args=args)
@@ -262,16 +262,20 @@
atom + 'name': 'HAL'
},
codemeta + 'url':
- 'https://hal-test.archives-ouvertes.fr/hal-01243065',
- codemeta + 'runtimePlatform': 'phpstorm',
- codemeta + 'license': {
- codemeta + 'name':
- 'CeCILL Free Software License Agreement v1.1'
- },
- codemeta + 'author': {
+ 'https://hal-test.archives-ouvertes.fr/hal-01243065',
+ codemeta + 'runtimePlatform': ['phpstorm'],
+ codemeta + 'license': [
+ {
+ codemeta + 'name': 'GNU General Public License v3.0 only'
+ },
+ {
+ codemeta + 'name': 'CeCILL Free Software License Agreement v1.1' # noqa
+ }
+ ],
+ codemeta + 'author': [{
codemeta + 'name': 'Morane Gruenpeter'
- },
- codemeta + 'programmingLanguage': 'C',
+ }],
+ codemeta + 'programmingLanguage': ['php', 'python', 'C'],
codemeta + 'applicationCategory': 'test',
codemeta + 'dateCreated': '2017-05-03T16:08:47+02:00',
codemeta + 'version': 1,