diff --git a/swh/deposit/parsers.py b/swh/deposit/parsers.py
index 5c4198c3..c95fa6eb 100644
--- a/swh/deposit/parsers.py
+++ b/swh/deposit/parsers.py
@@ -1,68 +1,128 @@
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Module in charge of defining parsers with SWORD 2.0 supported mediatypes.
"""
+from collections import defaultdict
from decimal import Decimal
from rest_framework.parsers import FileUploadParser
from rest_framework.parsers import MultiPartParser
from rest_framework_xml.parsers import XMLParser
class SWHFileUploadZipParser(FileUploadParser):
"""File upload parser limited to zip archive.
"""
media_type = 'application/zip'
class SWHFileUploadTarParser(FileUploadParser):
"""File upload parser limited to zip archive.
"""
media_type = 'application/x-tar'
-class SWHXMLParser(XMLParser):
+class ListXMLParser(XMLParser):
+ """Patch XMLParser behavior to not merge duplicated key entries.
+
+ """
+ # special tags that must be cast to list
+ _tags = [
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}license',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}programmingLanguage',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}runtimePlatform',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author',
+ ]
+
+ # converted tags to list
+ _lists = None
+
+ def __init__(self):
+ self._reset()
+
+ def _reset(self):
+ self._lists = defaultdict(list)
+
+ def parse(self, stream, media_type=None, parser_context=None):
+ data = super().parse(
+ stream, media_type=media_type, parser_context=parser_context)
+ # Update the special values
+ for key, value in self._lists.items():
+ data[key] = value
+ self._reset()
+
+ return data
+
+ def _xml_convert(self, element):
+ children = list(element)
+ if len(children) == 0:
+ data = self._type_convert(element.text)
+ if element.tag in self._tags:
+ if data not in self._lists[element.tag]:
+ self._lists[element.tag].append(data)
+ return data
+
+ # if the first child tag is list-item, it means all
+ # children are list-item
+ if children[0].tag == "list-item":
+ data = []
+ for child in children:
+ data.append(self._xml_convert(child))
+ return data
+
+ data = {}
+ for child in children:
+ data[child.tag] = self._xml_convert(child)
+
+ if element.tag in self._tags:
+ if data not in self._lists[element.tag]:
+ self._lists[element.tag].append(data)
+
+ return data
+
+
+class SWHXMLParser(ListXMLParser):
def _type_convert(self, value):
"""Override the default type converter to avoid having decimal in the
resulting output.
"""
value = super()._type_convert(value)
if isinstance(value, Decimal):
value = str(value)
return value
class SWHAtomEntryParser(SWHXMLParser):
"""Atom entry parser limited to specific mediatype
"""
media_type = 'application/atom+xml;type=entry'
class SWHMultiPartParser(MultiPartParser):
"""Multipart parser limited to a subset of mediatypes.
"""
media_type = 'multipart/*; *'
def parse_xml(raw_content):
"""Parse xml body.
Args:
raw_content (bytes): The content to parse
Returns:
content parsed as dict.
"""
return SWHXMLParser().parse(raw_content)
diff --git a/swh/deposit/tests/api/test_parser.py b/swh/deposit/tests/api/test_parser.py
new file mode 100644
index 00000000..78a237bc
--- /dev/null
+++ b/swh/deposit/tests/api/test_parser.py
@@ -0,0 +1,116 @@
+# Copyright (C) 2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import io
+
+from nose.tools import istest
+from rest_framework.test import APITestCase
+
+from swh.deposit.parsers import SWHXMLParser
+
+
+class ParsingTest(APITestCase):
+ """Access to main entry point is ok without authentication
+
+ """
+ @istest
+ def parsing_without_duplicates(self):
+ xml_no_duplicate = io.BytesIO(b'''
+
+ Awesome Compiler
+
+ GPL3.0
+ https://opensource.org/licenses/GPL-3.0
+
+ Python3
+
+ author1
+ Inria
+
+ ocaml
+ http://issuetracker.com
+ ''')
+
+ actual_result = SWHXMLParser().parse(xml_no_duplicate)
+ expected_dict = {
+ '{http://www.w3.org/2005/Atom}title':
+ 'Awesome Compiler',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author':
+ [{'{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}affiliation':
+ 'Inria',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name':
+ 'author1'}],
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}issueTracker':
+ 'http://issuetracker.com',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}license':
+ [{'{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name':
+ 'GPL3.0',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}url':
+ 'https://opensource.org/licenses/GPL-3.0'}],
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}programmingLanguage':
+ ['ocaml'],
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}runtimePlatform':
+ ['Python3']
+ }
+ self.assertEqual(expected_dict, actual_result)
+
+ @istest
+ def parsing_with_duplicates(self):
+ xml_with_duplicates = io.BytesIO(b'''
+
+ Another Compiler
+ GNU/Linux
+
+ GPL3.0
+ https://opensource.org/licenses/GPL-3.0
+
+ Un*x
+
+ author1
+ Inria
+
+
+ author2
+ Inria
+
+ ocaml
+ haskell
+
+ spdx
+ http://spdx.org
+
+ python3
+ ''')
+
+ actual_result = SWHXMLParser().parse(xml_with_duplicates)
+
+ expected_dict = {
+ '{http://www.w3.org/2005/Atom}title':
+ 'Another Compiler',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author': [
+ {'{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}affiliation':
+ 'Inria',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name':
+ 'author1'},
+ {'{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}affiliation':
+ 'Inria',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name':
+ 'author2'}],
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}license': [
+ {'{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name':
+ 'GPL3.0',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}url':
+ 'https://opensource.org/licenses/GPL-3.0'},
+ {'{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}name':
+ 'spdx',
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}url':
+ 'http://spdx.org'}],
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}programmingLanguage':
+ [ 'ocaml', 'haskell', 'python3'],
+ '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}runtimePlatform':
+ ['GNU/Linux', 'Un*x'] }
+ self.assertEqual(expected_dict, actual_result)
diff --git a/swh/deposit/tests/loader/test_loader.py b/swh/deposit/tests/loader/test_loader.py
index 66c04fec..02eaa05f 100644
--- a/swh/deposit/tests/loader/test_loader.py
+++ b/swh/deposit/tests/loader/test_loader.py
@@ -1,289 +1,293 @@
# Copyright (C) 2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
import unittest
import shutil
from nose.tools import istest
from nose.plugins.attrib import attr
from rest_framework.test import APITestCase
from swh.model import hashutil
from swh.deposit.loader import loader
from swh.deposit.config import PRIVATE_GET_RAW_CONTENT
from swh.deposit.config import PRIVATE_GET_DEPOSIT_METADATA
from swh.deposit.config import PRIVATE_PUT_DEPOSIT
from django.core.urlresolvers import reverse
from .common import SWHDepositTestClient, CLIENT_TEST_CONFIG
from .. import TEST_LOADER_CONFIG
from ..common import BasicTestCase, WithAuthTestCase, CommonCreationRoutine
from ..common import FileSystemCreationRoutine
TOOL_ID = 99
PROVIDER_ID = 12
class DepositLoaderInhibitsStorage:
"""Mixin class to inhibit the persistence and keep in memory the data
sent for storage.
cf. SWHDepositLoaderNoStorage
"""
def __init__(self, client=None):
# client is not used here, transit it nonetheless to other mixins
super().__init__(client=client)
# typed data
self.state = {
'origin': [],
'origin_visit': [],
'origin_metadata': [],
'content': [],
'directory': [],
'revision': [],
'release': [],
'snapshot': [],
'tool': [],
'provider': []
}
def _add(self, type, l):
"""Add without duplicates and keeping the insertion order.
Args:
type (str): Type of objects concerned by the action
l ([object]): List of 'type' object
"""
col = self.state[type]
for o in l:
if o in col:
continue
col.extend([o])
def send_origin(self, origin):
origin.update({'id': 1})
self._add('origin', [origin])
return origin['id']
def send_origin_visit(self, origin_id, visit_date):
origin_visit = {
'origin': origin_id,
'visit_date': visit_date,
'visit': 1,
}
self._add('origin_visit', [origin_visit])
return origin_visit
def send_origin_metadata(self, origin_id, visit_date, provider_id, tool_id,
metadata):
origin_metadata = {
'origin_id': origin_id,
'visit_date': visit_date,
'provider_id': provider_id,
'tool_id': tool_id,
'metadata': metadata
}
self._add('origin_metadata', [origin_metadata])
return origin_metadata
def send_tool(self, tool):
tool = {
'tool_name': tool['tool_name'],
'tool_version': tool['tool_version'],
'tool_configuration': tool['tool_configuration']
}
self._add('tool', [tool])
tool_id = TOOL_ID
return tool_id
def send_provider(self, provider):
provider = {
'provider_name': provider['provider_name'],
'provider_type': provider['provider_type'],
'provider_url': provider['provider_url'],
'metadata': provider['metadata']
}
self._add('provider', [provider])
provider_id = PROVIDER_ID
return provider_id
def maybe_load_contents(self, contents):
self._add('content', contents)
def maybe_load_directories(self, directories):
self._add('directory', directories)
def maybe_load_revisions(self, revisions):
self._add('revision', revisions)
def maybe_load_releases(self, releases):
self._add('release', releases)
def maybe_load_snapshot(self, snapshot):
self._add('snapshot', [snapshot])
def open_fetch_history(self):
pass
def close_fetch_history_failure(self, fetch_history_id):
pass
def close_fetch_history_success(self, fetch_history_id):
pass
def update_origin_visit(self, origin_id, visit, status):
self.status = status
# Override to do nothing at the end
def close_failure(self):
pass
def close_success(self):
pass
class TestLoaderUtils(unittest.TestCase):
def assertRevisionsOk(self, expected_revisions): # noqa: N802
"""Check the loader's revisions match the expected revisions.
Expects self.loader to be instantiated and ready to be
inspected (meaning the loading took place).
Args:
expected_revisions (dict): Dict with key revision id,
value the targeted directory id.
"""
# The last revision being the one used later to start back from
for rev in self.loader.state['revision']:
rev_id = hashutil.hash_to_hex(rev['id'])
directory_id = hashutil.hash_to_hex(rev['directory'])
self.assertEquals(expected_revisions[rev_id], directory_id)
class SWHDepositLoaderNoStorage(DepositLoaderInhibitsStorage,
loader.DepositLoader):
"""Loader to test.
It inherits from the actual deposit loader to actually test its
correct behavior. It also inherits from
DepositLoaderInhibitsStorage so that no persistence takes place.
"""
pass
@attr('fs')
class DepositLoaderScenarioTest(APITestCase, WithAuthTestCase,
BasicTestCase, CommonCreationRoutine,
FileSystemCreationRoutine, TestLoaderUtils):
def setUp(self):
super().setUp()
# create the extraction dir used by the loader
os.makedirs(TEST_LOADER_CONFIG['extraction_dir'], exist_ok=True)
# 1. create a deposit with archive and metadata
self.deposit_id = self.create_simple_binary_deposit()
# 2. Sets a basic client which accesses the test data
loader_client = SWHDepositTestClient(self.client,
config=CLIENT_TEST_CONFIG)
# 3. setup loader with no persistence and that client
self.loader = SWHDepositLoaderNoStorage(client=loader_client)
def tearDown(self):
super().tearDown()
shutil.rmtree(TEST_LOADER_CONFIG['extraction_dir'])
@istest
def inject_deposit_ready(self):
"""Load a deposit which is ready
"""
args = [self.collection.name, self.deposit_id]
archive_url = reverse(PRIVATE_GET_RAW_CONTENT, args=args)
deposit_meta_url = reverse(PRIVATE_GET_DEPOSIT_METADATA, args=args)
deposit_update_url = reverse(PRIVATE_PUT_DEPOSIT, args=args)
# when
self.loader.load(archive_url=archive_url,
deposit_meta_url=deposit_meta_url,
deposit_update_url=deposit_update_url)
# then
self.assertEquals(len(self.loader.state['content']), 1)
self.assertEquals(len(self.loader.state['directory']), 1)
self.assertEquals(len(self.loader.state['revision']), 1)
self.assertEquals(len(self.loader.state['release']), 0)
self.assertEquals(len(self.loader.state['snapshot']), 1)
@istest
def inject_deposit_verify_metadata(self):
"""Load a deposit with metadata, test metadata integrity
"""
self.deposit_metadata_id = self.add_metadata_to_deposit(
- self.deposit_id)
+ self.deposit_id)
args = [self.collection.name, self.deposit_metadata_id]
archive_url = reverse(PRIVATE_GET_RAW_CONTENT, args=args)
deposit_meta_url = reverse(PRIVATE_GET_DEPOSIT_METADATA, args=args)
deposit_update_url = reverse(PRIVATE_PUT_DEPOSIT, args=args)
# when
self.loader.load(archive_url=archive_url,
deposit_meta_url=deposit_meta_url,
deposit_update_url=deposit_update_url)
# then
self.assertEquals(len(self.loader.state['content']), 1)
self.assertEquals(len(self.loader.state['directory']), 1)
self.assertEquals(len(self.loader.state['revision']), 1)
self.assertEquals(len(self.loader.state['release']), 0)
self.assertEquals(len(self.loader.state['snapshot']), 1)
self.assertEquals(len(self.loader.state['origin_metadata']), 1)
self.assertEquals(len(self.loader.state['tool']), 1)
self.assertEquals(len(self.loader.state['provider']), 1)
atom = '{http://www.w3.org/2005/Atom}'
codemeta = '{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}'
expected_origin_metadata = {
atom + 'author': {
atom + 'email': 'hal@ccsd.cnrs.fr',
atom + 'name': 'HAL'
},
codemeta + 'url':
- 'https://hal-test.archives-ouvertes.fr/hal-01243065',
- codemeta + 'runtimePlatform': 'phpstorm',
- codemeta + 'license': {
- codemeta + 'name':
- 'CeCILL Free Software License Agreement v1.1'
- },
- codemeta + 'author': {
+ 'https://hal-test.archives-ouvertes.fr/hal-01243065',
+ codemeta + 'runtimePlatform': ['phpstorm'],
+ codemeta + 'license': [
+ {
+ codemeta + 'name': 'GNU General Public License v3.0 only'
+ },
+ {
+ codemeta + 'name': 'CeCILL Free Software License Agreement v1.1' # noqa
+ }
+ ],
+ codemeta + 'author': [{
codemeta + 'name': 'Morane Gruenpeter'
- },
- codemeta + 'programmingLanguage': 'C',
+ }],
+ codemeta + 'programmingLanguage': ['php', 'python', 'C'],
codemeta + 'applicationCategory': 'test',
codemeta + 'dateCreated': '2017-05-03T16:08:47+02:00',
codemeta + 'version': 1,
atom + 'external_identifier': 'hal-01243065',
atom + 'title': 'Composing a Web of Audio Applications',
codemeta + 'description': 'this is the description',
atom + 'id': 'hal-01243065',
atom + 'client': 'hal',
codemeta + 'keywords': 'DSP programming,Web',
codemeta + 'developmentStatus': 'stable'
}
result = self.loader.state['origin_metadata'][0]
self.assertEquals(result['metadata'], expected_origin_metadata)
self.assertEquals(result['tool_id'], TOOL_ID)
self.assertEquals(result['provider_id'], PROVIDER_ID)