diff --git a/swh/deposit/parsers.py b/swh/deposit/parsers.py
index 5c4198c3..c95fa6eb 100644
--- a/swh/deposit/parsers.py
+++ b/swh/deposit/parsers.py
@@ -1,68 +1,128 @@
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Module in charge of defining parsers with SWORD 2.0 supported mediatypes.
+from collections import defaultdict
from decimal import Decimal
from rest_framework.parsers import FileUploadParser
from rest_framework.parsers import MultiPartParser
from rest_framework_xml.parsers import XMLParser
class SWHFileUploadZipParser(FileUploadParser):
"""File upload parser limited to zip archive.
media_type = 'application/zip'
class SWHFileUploadTarParser(FileUploadParser):
"""File upload parser limited to zip archive.
media_type = 'application/x-tar'
-class SWHXMLParser(XMLParser):
+class ListXMLParser(XMLParser):
+    """XML parser which reports a fixed set of tags as lists of values.
+
+    When an element is converted to a dict, its children are stored by
+    tag, so sibling elements sharing a tag overwrite each other and only
+    the last one survives.  For the tags listed in ``_tags`` this parser
+    additionally records every converted value it meets and, once the
+    document is converted, stores the recorded list under the tag in the
+    top-level result.
+
+    NOTE(review): special tags are recorded at any depth of the document
+    but always written on the top-level result; values which compare
+    equal to an already recorded one are dropped.
+
+    """
+    # Fully qualified tags whose value is always reported as a list
+    _tags = [
+        '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}license',
+        '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}programmingLanguage',
+        '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}runtimePlatform',
+        '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author',
+    ]
+
+    # Per-parse accumulator: special tag -> list of converted values
+    _lists = None
+
+    def __init__(self):
+        super().__init__()
+        self._reset()
+
+    def _reset(self):
+        """Drop every value recorded so far."""
+        self._lists = defaultdict(list)
+
+    def _collect(self, tag, value):
+        """Record value for tag when tag is special and value is new.
+
+        Args:
+            tag (str): Tag of the element which has just been converted
+            value: Converted value of that element
+
+        """
+        if tag not in self._tags:
+            return
+        recorded = self._lists[tag]
+        if value not in recorded:
+            recorded.append(value)
+
+    def parse(self, stream, media_type=None, parser_context=None):
+        """Parse the stream, then override special tags with their lists.
+
+        Args:
+            stream: Stream holding the xml document to parse
+            media_type: Forwarded untouched to the parent parser
+            parser_context: Forwarded untouched to the parent parser
+
+        Returns:
+            The parent parser's result in which every recorded special
+            tag maps to the list of its values.
+
+        """
+        try:
+            data = super().parse(
+                stream, media_type=media_type,
+                parser_context=parser_context)
+            # Update the special values
+            for key, values in self._lists.items():
+                data[key] = values
+        finally:
+            # Reset even when parsing fails, so that values of a broken
+            # document never leak into a later parse on this instance.
+            self._reset()
+        return data
+
+    def _xml_convert(self, element):
+        """Convert an xml element to python data, recording special tags.
+
+        Args:
+            element: The xml element to convert
+
+        Returns:
+            The type-converted text for an element without children, a
+            list when the first child is a ``list-item``, a dict indexed
+            by child tag otherwise.
+
+        """
+        children = list(element)
+
+        if not children:
+            data = self._type_convert(element.text)
+            self._collect(element.tag, data)
+            return data
+
+        # if the first child tag is list-item, it means all
+        # children are list-item
+        if children[0].tag == "list-item":
+            return [self._xml_convert(child) for child in children]
+
+        data = {child.tag: self._xml_convert(child) for child in children}
+        self._collect(element.tag, data)
+        return data
+class SWHXMLParser(ListXMLParser):
def _type_convert(self, value):
"""Override the default type converter to avoid having decimal in the
resulting output.
value = super()._type_convert(value)
if isinstance(value, Decimal):
value = str(value)
return value
class SWHAtomEntryParser(SWHXMLParser):
"""Atom entry parser limited to specific mediatype
media_type = 'application/atom+xml;type=entry'
class SWHMultiPartParser(MultiPartParser):
"""Multipart parser limited to a subset of mediatypes.
media_type = 'multipart/*; *'
def parse_xml(raw_content):
"""Parse xml body.
raw_content (bytes): The content to parse
content parsed as dict.
return SWHXMLParser().parse(raw_content)
diff --git a/swh/deposit/tests/api/test_parser.py b/swh/deposit/tests/api/test_parser.py
new file mode 100644
index 00000000..78a237bc
--- /dev/null
+++ b/swh/deposit/tests/api/test_parser.py
@@ -0,0 +1,116 @@
+# Copyright (C) 2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+import io
+from nose.tools import istest
+from rest_framework.test import APITestCase
+from swh.deposit.parsers import SWHXMLParser
+class ParsingTest(APITestCase):
+ """Check SWHXMLParser output on documents with and without duplicated tags
+ """
+ @istest
+ def parsing_without_duplicates(self):
+ xml_no_duplicate = io.BytesIO(b'''
+ Awesome Compiler
+ GPL3.0
+ https://opensource.org/licenses/GPL-3.0
+ Python3
+ author1
+ Inria
+ ocaml
+ http://issuetracker.com
+ ''')
+ actual_result = SWHXMLParser().parse(xml_no_duplicate)
+ expected_dict = {
+ '{http://www.w3.org/2005/Atom}title':
+ 'Awesome Compiler',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author':
+ [{'{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}affiliation':
+ 'Inria',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name':
+ 'author1'}],
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}issueTracker':
+ 'http://issuetracker.com',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}license':
+ [{'{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name':
+ 'GPL3.0',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}url':
+ 'https://opensource.org/licenses/GPL-3.0'}],
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}programmingLanguage':
+ ['ocaml'],
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}runtimePlatform':
+ ['Python3']
+ }
+ self.assertEqual(expected_dict, actual_result)
+ @istest
+ def parsing_with_duplicates(self):
+ xml_with_duplicates = io.BytesIO(b'''
+ Another Compiler
+ GNU/Linux
+ GPL3.0
+ https://opensource.org/licenses/GPL-3.0
+ Un*x
+ author1
+ Inria
+ author2
+ Inria
+ ocaml
+ haskell
+ spdx
+ http://spdx.org
+ python3
+ ''')
+ actual_result = SWHXMLParser().parse(xml_with_duplicates)
+ expected_dict = {
+ '{http://www.w3.org/2005/Atom}title':
+ 'Another Compiler',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author': [
+ {'{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}affiliation':
+ 'Inria',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name':
+ 'author1'},
+ {'{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}affiliation':
+ 'Inria',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name':
+ 'author2'}],
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}license': [
+ {'{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name':
+ 'GPL3.0',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}url':
+ 'https://opensource.org/licenses/GPL-3.0'},
+ {'{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name':
+ 'spdx',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}url':
+ 'http://spdx.org'}],
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}programmingLanguage':
+ [ 'ocaml', 'haskell', 'python3'],
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}runtimePlatform':
+ ['GNU/Linux', 'Un*x'] }
+ self.assertEqual(expected_dict, actual_result)
diff --git a/swh/deposit/tests/loader/test_loader.py b/swh/deposit/tests/loader/test_loader.py
index 66c04fec..02eaa05f 100644
--- a/swh/deposit/tests/loader/test_loader.py
+++ b/swh/deposit/tests/loader/test_loader.py
@@ -1,289 +1,293 @@
# Copyright (C) 2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
import unittest
import shutil
from nose.tools import istest
from nose.plugins.attrib import attr
from rest_framework.test import APITestCase
from swh.model import hashutil
from swh.deposit.loader import loader
from swh.deposit.config import PRIVATE_GET_RAW_CONTENT
from swh.deposit.config import PRIVATE_GET_DEPOSIT_METADATA
from swh.deposit.config import PRIVATE_PUT_DEPOSIT
from django.core.urlresolvers import reverse
from .common import SWHDepositTestClient, CLIENT_TEST_CONFIG
from .. import TEST_LOADER_CONFIG
from ..common import BasicTestCase, WithAuthTestCase, CommonCreationRoutine
from ..common import FileSystemCreationRoutine
TOOL_ID = 99
class DepositLoaderInhibitsStorage:
"""Mixin class to inhibit the persistence and keep in memory the data
sent for storage.
cf. SWHDepositLoaderNoStorage
def __init__(self, client=None):
# client is not used here, transit it nonetheless to other mixins
# typed data
self.state = {
'origin': [],
'origin_visit': [],
'origin_metadata': [],
'content': [],
'directory': [],
'revision': [],
'release': [],
'snapshot': [],
'tool': [],
'provider': []
def _add(self, type, l):
"""Add without duplicates and keeping the insertion order.
type (str): Type of objects concerned by the action
l ([object]): List of 'type' object
col = self.state[type]
for o in l:
if o in col:
def send_origin(self, origin):
origin.update({'id': 1})
self._add('origin', [origin])
return origin['id']
def send_origin_visit(self, origin_id, visit_date):
origin_visit = {
'origin': origin_id,
'visit_date': visit_date,
'visit': 1,
self._add('origin_visit', [origin_visit])
return origin_visit
def send_origin_metadata(self, origin_id, visit_date, provider_id, tool_id,
origin_metadata = {
'origin_id': origin_id,
'visit_date': visit_date,
'provider_id': provider_id,
'tool_id': tool_id,
'metadata': metadata
self._add('origin_metadata', [origin_metadata])
return origin_metadata
def send_tool(self, tool):
tool = {
'tool_name': tool['tool_name'],
'tool_version': tool['tool_version'],
'tool_configuration': tool['tool_configuration']
self._add('tool', [tool])
tool_id = TOOL_ID
return tool_id
def send_provider(self, provider):
provider = {
'provider_name': provider['provider_name'],
'provider_type': provider['provider_type'],
'provider_url': provider['provider_url'],
'metadata': provider['metadata']
self._add('provider', [provider])
provider_id = PROVIDER_ID
return provider_id
def maybe_load_contents(self, contents):
self._add('content', contents)
def maybe_load_directories(self, directories):
self._add('directory', directories)
def maybe_load_revisions(self, revisions):
self._add('revision', revisions)
def maybe_load_releases(self, releases):
self._add('release', releases)
def maybe_load_snapshot(self, snapshot):
self._add('snapshot', [snapshot])
def open_fetch_history(self):
def close_fetch_history_failure(self, fetch_history_id):
def close_fetch_history_success(self, fetch_history_id):
def update_origin_visit(self, origin_id, visit, status):
self.status = status
# Override to do nothing at the end
def close_failure(self):
def close_success(self):
class TestLoaderUtils(unittest.TestCase):
def assertRevisionsOk(self, expected_revisions): # noqa: N802
"""Check the loader's revisions match the expected revisions.
Expects self.loader to be instantiated and ready to be
inspected (meaning the loading took place).
expected_revisions (dict): Dict with key revision id,
value the targeted directory id.
# The last revision being the one used later to start back from
for rev in self.loader.state['revision']:
rev_id = hashutil.hash_to_hex(rev['id'])
directory_id = hashutil.hash_to_hex(rev['directory'])
self.assertEquals(expected_revisions[rev_id], directory_id)
class SWHDepositLoaderNoStorage(DepositLoaderInhibitsStorage,
"""Loader to test.
It inherits from the actual deposit loader to actually test its
correct behavior. It also inherits from
DepositLoaderInhibitsStorage so that no persistence takes place.
class DepositLoaderScenarioTest(APITestCase, WithAuthTestCase,
BasicTestCase, CommonCreationRoutine,
FileSystemCreationRoutine, TestLoaderUtils):
def setUp(self):
# create the extraction dir used by the loader
os.makedirs(TEST_LOADER_CONFIG['extraction_dir'], exist_ok=True)
# 1. create a deposit with archive and metadata
self.deposit_id = self.create_simple_binary_deposit()
# 2. Sets a basic client which accesses the test data
loader_client = SWHDepositTestClient(self.client,
# 3. setup loader with no persistence and that client
self.loader = SWHDepositLoaderNoStorage(client=loader_client)
def tearDown(self):
def inject_deposit_ready(self):
"""Load a deposit which is ready
args = [self.collection.name, self.deposit_id]
archive_url = reverse(PRIVATE_GET_RAW_CONTENT, args=args)
deposit_meta_url = reverse(PRIVATE_GET_DEPOSIT_METADATA, args=args)
deposit_update_url = reverse(PRIVATE_PUT_DEPOSIT, args=args)
# when
# then
self.assertEquals(len(self.loader.state['content']), 1)
self.assertEquals(len(self.loader.state['directory']), 1)
self.assertEquals(len(self.loader.state['revision']), 1)
self.assertEquals(len(self.loader.state['release']), 0)
self.assertEquals(len(self.loader.state['snapshot']), 1)
def inject_deposit_verify_metadata(self):
"""Load a deposit with metadata, test metadata integrity
self.deposit_metadata_id = self.add_metadata_to_deposit(
- self.deposit_id)
+ self.deposit_id)
args = [self.collection.name, self.deposit_metadata_id]
archive_url = reverse(PRIVATE_GET_RAW_CONTENT, args=args)
deposit_meta_url = reverse(PRIVATE_GET_DEPOSIT_METADATA, args=args)
deposit_update_url = reverse(PRIVATE_PUT_DEPOSIT, args=args)
# when
# then
self.assertEquals(len(self.loader.state['content']), 1)
self.assertEquals(len(self.loader.state['directory']), 1)
self.assertEquals(len(self.loader.state['revision']), 1)
self.assertEquals(len(self.loader.state['release']), 0)
self.assertEquals(len(self.loader.state['snapshot']), 1)
self.assertEquals(len(self.loader.state['origin_metadata']), 1)
self.assertEquals(len(self.loader.state['tool']), 1)
self.assertEquals(len(self.loader.state['provider']), 1)
atom = '{http://www.w3.org/2005/Atom}'
codemeta = '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}'
expected_origin_metadata = {
atom + 'author': {
atom + 'email': 'hal@ccsd.cnrs.fr',
atom + 'name': 'HAL'
codemeta + 'url':
- 'https://hal-test.archives-ouvertes.fr/hal-01243065',
- codemeta + 'runtimePlatform': 'phpstorm',
- codemeta + 'license': {
- codemeta + 'name':
- 'CeCILL Free Software License Agreement v1.1'
- },
- codemeta + 'author': {
+ 'https://hal-test.archives-ouvertes.fr/hal-01243065',
+ codemeta + 'runtimePlatform': ['phpstorm'],
+ codemeta + 'license': [
+ {
+ codemeta + 'name': 'GNU General Public License v3.0 only'
+ },
+ {
+ codemeta + 'name': 'CeCILL Free Software License Agreement v1.1' # noqa
+ }
+ ],
+ codemeta + 'author': [{
codemeta + 'name': 'Morane Gruenpeter'
- },
- codemeta + 'programmingLanguage': 'C',
+ }],
+ codemeta + 'programmingLanguage': ['php', 'python', 'C'],
codemeta + 'applicationCategory': 'test',
codemeta + 'dateCreated': '2017-05-03T16:08:47+02:00',
codemeta + 'version': 1,
atom + 'external_identifier': 'hal-01243065',
atom + 'title': 'Composing a Web of Audio Applications',
codemeta + 'description': 'this is the description',
atom + 'id': 'hal-01243065',
atom + 'client': 'hal',
codemeta + 'keywords': 'DSP programming,Web',
codemeta + 'developmentStatus': 'stable'
result = self.loader.state['origin_metadata'][0]
self.assertEquals(result['metadata'], expected_origin_metadata)
self.assertEquals(result['tool_id'], TOOL_ID)
self.assertEquals(result['provider_id'], PROVIDER_ID)