diff --git a/swh/deposit/parsers.py b/swh/deposit/parsers.py
--- a/swh/deposit/parsers.py
+++ b/swh/deposit/parsers.py
@@ -8,6 +8,7 @@
 
 """
 
+from collections import defaultdict
 from decimal import Decimal
 from rest_framework.parsers import FileUploadParser
 from rest_framework.parsers import MultiPartParser
@@ -22,13 +23,87 @@
 
 
 class SWHFileUploadTarParser(FileUploadParser):
-    """File upload parser limited to zip archive.
+    """File upload parser limited to tarball (tar, tar.gz, tar.*) archives.
 
     """
     media_type = 'application/x-tar'
 
 
-class SWHXMLParser(XMLParser):
+class ListXMLParser(XMLParser):
+    """XML parser keeping every occurrence of the repeatable tags.
+
+    The stock XMLParser merges entries sharing the same tag name. For
+    the tags listed in ``_tags``, each distinct value met while
+    converting the document is collected instead, and ``parse`` stores
+    the resulting lists under those tags in the top-level dict it
+    returns, wherever the tags appeared in the document.
+
+    """
+    # special tags that must be cast to list
+    _tags = [
+        '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}license',
+        '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}programmingLanguage',
+        '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}runtimePlatform',
+        '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author',
+    ]
+
+    # values collected so far, as a mapping tag -> list
+    _lists = None
+
+    def __init__(self):
+        self._reset()
+
+    def _reset(self):
+        """Forget every value collected so far."""
+        self._lists = defaultdict(list)
+
+    def _collect(self, tag, data):
+        """Record data under tag if tag is special and data is new."""
+        if tag in self._tags and data not in self._lists[tag]:
+            self._lists[tag].append(data)
+
+    def parse(self, stream, media_type=None, parser_context=None):
+        """Parse stream; the special tags end up as lists in the result.
+
+        """
+        try:
+            data = super().parse(
+                stream, media_type=media_type, parser_context=parser_context)
+            # Only a dict result can receive the collected lists
+            if isinstance(data, dict):
+                # Overriding and updating the list values
+                data.update(self._lists)
+            return data
+        finally:
+            # Drop the collected values even when parsing failed, so
+            # they cannot leak into the next call
+            self._reset()
+
+    def _xml_convert(self, element):
+        """This patches the default behavior to detect entries that must be
+        list. The current XMLParser's behavior is not correct as it
+        merges entries with the same name.
+
+        """
+        children = list(element)
+        if not children:
+            data = self._type_convert(element.text)
+            self._collect(element.tag, data)
+            return data
+
+        # if the first child tag is list-item, it means all
+        # children are list-item
+        if children[0].tag == "list-item":
+            return [self._xml_convert(child) for child in children]
+
+        # a repeated child tag keeps only its last value here; the
+        # special tags are collected separately
+        data = {child.tag: self._xml_convert(child) for child in children}
+        self._collect(element.tag, data)
+        return data
+
+
+class SWHXMLParser(ListXMLParser):
     def _type_convert(self, value):
         """Override the default type converter to avoid having decimal in
         the resulting output.
diff --git a/swh/deposit/tests/api/test_parser.py b/swh/deposit/tests/api/test_parser.py
new file mode 100644
--- /dev/null
+++ b/swh/deposit/tests/api/test_parser.py
@@ -0,0 +1,117 @@
+# Copyright (C) 2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import io
+
+from nose.tools import istest
+from rest_framework.test import APITestCase
+
+from swh.deposit.parsers import SWHXMLParser
+
+
+class ParsingTest(APITestCase):
+    """Check that SWHXMLParser casts the repeatable codemeta tags to lists
+
+    """
+    @istest
+    def parsing_without_duplicates(self):
+        xml_no_duplicate = io.BytesIO(b'''<?xml version="1.0" encoding="utf-8"?>
+<entry xmlns="http://www.w3.org/2005/Atom"
+       xmlns:codemeta="https://doi.org/10.5063/SCHEMA/CODEMETA-2.0">
+    <title>Awesome Compiler</title>
+    <codemeta:license>
+        <codemeta:name>GPL3.0</codemeta:name>
+        <codemeta:url>https://opensource.org/licenses/GPL-3.0</codemeta:url>
+    </codemeta:license>
+    <codemeta:runtimePlatform>Python3</codemeta:runtimePlatform>
+    <codemeta:author>
+        <codemeta:name>author1</codemeta:name>
+        <codemeta:affiliation>Inria</codemeta:affiliation>
+    </codemeta:author>
+    <codemeta:programmingLanguage>ocaml</codemeta:programmingLanguage>
+    <codemeta:issueTracker>http://issuetracker.com</codemeta:issueTracker>
+</entry>''')
+
+        actual_result = SWHXMLParser().parse(xml_no_duplicate)
+        expected_dict = {
+            '{http://www.w3.org/2005/Atom}title':
+            'Awesome Compiler',
+            '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author':
+            [{'{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}affiliation':
+              'Inria',
+              '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name':
+              'author1'}],
+            '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}issueTracker':
+            'http://issuetracker.com',
+            '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}license':
+            [{'{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name':
+              'GPL3.0',
+              '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}url':
+              'https://opensource.org/licenses/GPL-3.0'}],
+            '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}programmingLanguage':
+            ['ocaml'],
+            '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}runtimePlatform':
+            ['Python3']
+        }
+        self.assertEqual(expected_dict, actual_result)
+
+    @istest
+    def parsing_with_duplicates(self):
+        xml_with_duplicates = io.BytesIO(b'''<?xml version="1.0" encoding="utf-8"?>
+<entry xmlns="http://www.w3.org/2005/Atom"
+       xmlns:codemeta="https://doi.org/10.5063/SCHEMA/CODEMETA-2.0">
+    <title>Another Compiler</title>
+    <codemeta:runtimePlatform>GNU/Linux</codemeta:runtimePlatform>
+    <codemeta:license>
+        <codemeta:name>GPL3.0</codemeta:name>
+        <codemeta:url>https://opensource.org/licenses/GPL-3.0</codemeta:url>
+    </codemeta:license>
+    <codemeta:runtimePlatform>Un*x</codemeta:runtimePlatform>
+    <codemeta:author>
+        <codemeta:name>author1</codemeta:name>
+        <codemeta:affiliation>Inria</codemeta:affiliation>
+    </codemeta:author>
+    <codemeta:author>
+        <codemeta:name>author2</codemeta:name>
+        <codemeta:affiliation>Inria</codemeta:affiliation>
+    </codemeta:author>
+    <codemeta:programmingLanguage>ocaml</codemeta:programmingLanguage>
+    <codemeta:programmingLanguage>haskell</codemeta:programmingLanguage>
+    <codemeta:license>
+        <codemeta:name>spdx</codemeta:name>
+        <codemeta:url>http://spdx.org</codemeta:url>
+    </codemeta:license>
+    <codemeta:programmingLanguage>python3</codemeta:programmingLanguage>
+</entry>''')
+
+        actual_result = SWHXMLParser().parse(xml_with_duplicates)
+
+        expected_dict = {
+            '{http://www.w3.org/2005/Atom}title':
+            'Another Compiler',
+            '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author': [
+                {'{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}affiliation':
+                 'Inria',
+                 '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name':
+                 'author1'},
+                {'{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}affiliation':
+                 'Inria',
+                 '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name':
+                 'author2'}],
+            '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}license': [
+                {'{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name':
+                 'GPL3.0',
+                 '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}url':
+                 'https://opensource.org/licenses/GPL-3.0'},
+                {'{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name':
+                 'spdx',
+                 '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}url':
+                 'http://spdx.org'}],
+            '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}programmingLanguage':
+            ['ocaml', 'haskell', 'python3'],
+            '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}runtimePlatform':
+            ['GNU/Linux', 'Un*x']
+        }
+        self.assertEqual(expected_dict, actual_result)
diff --git a/swh/deposit/tests/loader/test_loader.py b/swh/deposit/tests/loader/test_loader.py
--- a/swh/deposit/tests/loader/test_loader.py
+++ b/swh/deposit/tests/loader/test_loader.py
@@ -232,7 +232,7 @@
 
         """
         self.deposit_metadata_id = self.add_metadata_to_deposit(
-                self.deposit_id)
+            self.deposit_id)
 
         args = [self.collection.name, self.deposit_metadata_id]
         archive_url = reverse(PRIVATE_GET_RAW_CONTENT, args=args)
@@ -262,16 +262,20 @@
                 atom + 'name': 'HAL'
             },
             codemeta + 'url':
-            'https://hal-test.archives-ouvertes.fr/hal-01243065',
-            codemeta + 'runtimePlatform': 'phpstorm',
-            codemeta + 'license': {
-                codemeta + 'name':
-                'CeCILL Free Software License Agreement v1.1'
-            },
-            codemeta + 'author': {
+                'https://hal-test.archives-ouvertes.fr/hal-01243065',
+            codemeta + 'runtimePlatform': ['phpstorm'],
+            codemeta + 'license': [
+                {
+                    codemeta + 'name': 'GNU General Public License v3.0 only'
+                },
+                {
+                    codemeta + 'name': 'CeCILL Free Software License Agreement v1.1'  # noqa
+                }
+            ],
+            codemeta + 'author': [{
                 codemeta + 'name': 'Morane Gruenpeter'
-            },
-            codemeta + 'programmingLanguage': 'C',
+            }],
+            codemeta + 'programmingLanguage': ['php', 'python', 'C'],
             codemeta + 'applicationCategory': 'test',
             codemeta + 'dateCreated': '2017-05-03T16:08:47+02:00',
             codemeta + 'version': 1,