diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py --- a/swh/deposit/api/common.py +++ b/swh/deposit/api/common.py @@ -27,7 +27,8 @@ MAX_UPLOAD_SIZE_EXCEEDED, BAD_REQUEST, ERROR_CONTENT, CHECKSUM_MISMATCH, make_error_dict, MEDIATION_NOT_ALLOWED, make_error_response_from_dict, FORBIDDEN, - NOT_FOUND, make_error_response, METHOD_NOT_ALLOWED + NOT_FOUND, make_error_response, METHOD_NOT_ALLOWED, + ParserError, PARSING_ERROR ) from ..models import ( Deposit, DepositRequest, DepositCollection, @@ -502,8 +503,15 @@ if precondition_status_response: return precondition_status_response - raw_metadata, metadata = self._read_metadata( - data['application/atom+xml']) + try: + raw_metadata, metadata = self._read_metadata( + data['application/atom+xml']) + except ParserError: + return make_error_dict( + PARSING_ERROR, + 'Malformed xml metadata', + "The xml received is malformed. " + "Please ensure your metadata file is correctly formatted.") # actual storage of data deposit = self._deposit_put(deposit_id=deposit_id, @@ -560,7 +568,15 @@ - 415 (unsupported media type) if a wrong media type is provided """ - raw_metadata, metadata = self._read_metadata(req.data) + try: + raw_metadata, metadata = self._read_metadata(req.data) + except ParserError: + return make_error_dict( + PARSING_ERROR, + 'Malformed xml metadata', + "The xml received is malformed. 
" + "Please ensure your metadata file is correctly formatted.") + if not metadata: return make_error_dict( BAD_REQUEST, diff --git a/swh/deposit/errors.py b/swh/deposit/errors.py --- a/swh/deposit/errors.py +++ b/swh/deposit/errors.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017 The Software Heritage developers +# Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -20,6 +20,14 @@ MEDIATION_NOT_ALLOWED = 'mediation-not-allowed' METHOD_NOT_ALLOWED = 'method-not-allowed' MAX_UPLOAD_SIZE_EXCEEDED = 'max_upload_size_exceeded' +PARSING_ERROR = 'parsing-error' + + +class ParserError(ValueError): + """Specific parsing error detected when parsing the xml metadata input + + """ + pass ERRORS = { @@ -53,6 +61,11 @@ 'iri': 'http://purl.org/net/sword/error/ErrorBadRequest', 'tag': 'sword:ErrorBadRequest', }, + PARSING_ERROR: { + 'status': status.HTTP_400_BAD_REQUEST, + 'iri': 'http://purl.org/net/sword/error/ErrorBadRequest', + 'tag': 'sword:ErrorBadRequest', + }, MEDIATION_NOT_ALLOWED: { 'status': status.HTTP_412_PRECONDITION_FAILED, 'iri': 'http://purl.org/net/sword/error/MediationNotAllowed', diff --git a/swh/deposit/parsers.py b/swh/deposit/parsers.py --- a/swh/deposit/parsers.py +++ b/swh/deposit/parsers.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2018 The Software Heritage developers +# Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -14,6 +14,9 @@ from rest_framework.parsers import BaseParser from rest_framework.parsers import FileUploadParser from rest_framework.parsers import MultiPartParser +from xml.parsers.expat import ExpatError + +from swh.deposit.errors import ParserError class 
SWHFileUploadZipParser(FileUploadParser): @@ -76,8 +79,14 @@ Args: raw_content (bytes): The content to parse + Raises: + ParserError in case of a malformed xml + Returns: content parsed as dict. """ - return SWHXMLParser().parse(raw_content) + try: + return SWHXMLParser().parse(raw_content) + except ExpatError as e: + raise ParserError(str(e)) from e diff --git a/swh/deposit/tests/api/test_deposit_atom.py b/swh/deposit/tests/api/test_deposit_atom.py --- a/swh/deposit/tests/api/test_deposit_atom.py +++ b/swh/deposit/tests/api/test_deposit_atom.py @@ -74,139 +74,6 @@ all """ - self.atom_entry_data2 = b""" - - %s -""" - - self.atom_entry_data_empty_body = b""" -""" - - self.atom_entry_data3 = b""" - - something -""" - - self.atom_entry_data_atom_only = b""" - - Awesome Compiler - urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a - 1785io25c695 - 2017-10-07T15:17:08Z - some awesome author - """ - - self.atom_entry_data_codemeta = b""" - - Awesome Compiler - urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a - 1785io25c695 - 1785io25c695 - origin url - other identifier, DOI, ARK - Domain - - description - key-word 1 - key-word 2 - creation date - publication date - comment - - article name - article id - - - Collaboration/Projet - project name - id - - see also - Sponsor A - Sponsor B - Platform/OS - dependencies - Version - active - - license - url spdx - - .Net Framework 3.0 - Python2.3 - - author1 - Inria - UPMC - - - author2 - Inria - UPMC - - http://code.com - language 1 - language 2 - http://issuetracker.com - """ # noqa - - self.atom_entry_data_dc_codemeta = b""" - - - - %s - hal-01587361 - https://hal.inria.fr/hal-01587361 - https://hal.inria.fr/hal-01587361/document - https://hal.inria.fr/hal-01587361/file/AffectationRO-v1.0.0.zip - doi:10.5281/zenodo.438684 - The assignment problem - AffectationRO - Gruenpeter, Morane - [INFO] Computer Science [cs] - [INFO.INFO-RO] Computer Science [cs]/Operations Research [cs.RO] - SOFTWARE - Project in OR: The assignment problemA java 
implementation for the assignment problem first release - description fr - 2015-06-01 - 2017-10-19 - en - - - url stable - Version sur hal - Version entre par lutilisateur - Mots-cls - Commentaire - Rfrence interne - - Collaboration/Projet - nom du projet - id - - Voir aussi - Financement - Projet ANR - Projet Europen - Platform/OS - Dpendances - Etat du dveloppement - - license - url spdx - - Outils de dveloppement- outil no1 - Outils de dveloppement- outil no2 - http://code.com - language 1 - language 2 - """ # noqa - - self.atom_entry_tei = b"""HAL TEI export of hal-01587083CCSDDistributed under a Creative Commons Attribution 4.0 International License

HAL API platform

questionnaire software metadataMoraneGruenpeter7de56c632362954fa84172cad80afe4einria.fr1556733MoraneGruenpeterf85a43a5fb4a2e0778a77e017f28c8fdgmail.com2017-09-29 11:21:322017-10-03 17:20:132017-10-03 17:20:132017-09-292017-09-29contributorMoraneGruenpeterf85a43a5fb4a2e0778a77e017f28c8fdgmail.comCCSDhal-01587083https://hal.inria.fr/hal-01587083gruenpeter:hal-0158708320172017questionnaire software metadataMoraneGruenpeter7de56c632362954fa84172cad80afe4einria.fr1556733EnglishComputer Science [cs]SoftwareIRILLInitiative pour la Recherche et l'Innovation sur le Logiciel Libre
https://www.irill.org/
Universite Pierre et Marie Curie - Paris 6UPMC
4 place Jussieu - 75005 Paris
http://www.upmc.fr/
Institut National de Recherche en Informatique et en AutomatiqueInria
Domaine de VoluceauRocquencourt - BP 10578153 Le Chesnay Cedex
http://www.inria.fr/en/
Universite Paris Diderot - Paris 7UPD7
5 rue Thomas-Mann - 75205 Paris cedex 13
http://www.univ-paris-diderot.fr
""" # noqa - self.atom_entry_data_badly_formatted = b""" """ @@ -250,7 +117,7 @@ """ # noqa - def test_post_deposit_atom_entry_serialization_error(self): + def test_post_deposit_atom_201_even_with_decimal(self): """Posting an initial atom entry should return 201 with deposit receipt """ @@ -276,17 +143,20 @@ sw_version = dr.metadata.get('codemeta:softwareVersion') self.assertEqual(sw_version, '10.4') - def test_post_deposit_atom_empty_body_request(self): + def test_post_deposit_atom_400_with_empty_body(self): """Posting empty body request should return a 400 response """ + atom_entry_data_empty_body = b""" +""" + response = self.client.post( reverse(COL_IRI, args=[self.collection.name]), content_type='application/atom+xml;type=entry', - data=self.atom_entry_data_empty_body) + data=atom_entry_data_empty_body) self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) - def test_post_deposit_atom_badly_formatted_is_a_bad_request(self): + def test_post_deposit_atom_400_badly_formatted_atom(self): """Posting a badly formatted atom should return a 400 response """ @@ -296,7 +166,23 @@ data=self.atom_entry_data_badly_formatted) self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) - def test_post_deposit_atom_without_slug_header_is_bad_request(self): + def test_post_deposit_atom_400_with_parsing_error(self): + """Posting parsing error prone atom should return 400 + + """ + atom_entry_data_parsing_error_prone = b""" + + Composing a Web of Audio Applications + + +""" + response = self.client.post( + reverse(COL_IRI, args=[self.collection.name]), + content_type='application/atom+xml;type=entry', + data=atom_entry_data_parsing_error_prone) + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + + def test_post_deposit_atom_400_without_slug_header(self): """Posting an atom entry without a slug header should return a 400 """ @@ -314,14 +200,19 @@ self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) - def 
test_post_deposit_atom_unknown_collection(self): + def test_post_deposit_atom_404_unknown_collection(self): """Posting an atom entry to an unknown collection should return a 404 """ + atom_entry_data3 = b""" + + something +""" + response = self.client.post( reverse(COL_IRI, args=['unknown-one']), content_type='application/atom+xml;type=entry', - data=self.atom_entry_data3, + data=atom_entry_data3, HTTP_SLUG='something') self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND) @@ -374,8 +265,59 @@ with self.assertRaises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) - atom_entry_data = self.atom_entry_data_dc_codemeta % ( - external_id.encode('utf-8'), ) + atom_entry_data = b""" + + + + %s + hal-01587361 + https://hal.inria.fr/hal-01587361 + https://hal.inria.fr/hal-01587361/document + https://hal.inria.fr/hal-01587361/file/AffectationRO-v1.0.0.zip + doi:10.5281/zenodo.438684 + The assignment problem + AffectationRO + Gruenpeter, Morane + [INFO] Computer Science [cs] + [INFO.INFO-RO] Computer Science [cs]/Operations Research [cs.RO] + SOFTWARE + Project in OR: The assignment problemA java implementation for the assignment problem first release + description fr + 2015-06-01 + 2017-10-19 + en + + + url stable + Version sur hal + Version entre par lutilisateur + Mots-cls + Commentaire + Rfrence interne + + Collaboration/Projet + nom du projet + id + + Voir aussi + Financement + Projet ANR + Projet Europen + Platform/OS + Dpendances + Etat du dveloppement + + license + url spdx + + Outils de dveloppement- outil no1 + Outils de dveloppement- outil no2 + http://code.com + language 1 + language 2 + """ % external_id.encode('utf-8') # noqa # when response = self.client.post( @@ -415,7 +357,7 @@ with self.assertRaises(Deposit.DoesNotExist): Deposit.objects.get(external_id=external_id) - atom_entry_data = self.atom_entry_tei + atom_entry_data = b"""HAL TEI export of hal-01587083CCSDDistributed under a Creative Commons Attribution 4.0 
International License

HAL API platform

questionnaire software metadataMoraneGruenpeter7de56c632362954fa84172cad80afe4einria.fr1556733MoraneGruenpeterf85a43a5fb4a2e0778a77e017f28c8fdgmail.com2017-09-29 11:21:322017-10-03 17:20:132017-10-03 17:20:132017-09-292017-09-29contributorMoraneGruenpeterf85a43a5fb4a2e0778a77e017f28c8fdgmail.comCCSDhal-01587083https://hal.inria.fr/hal-01587083gruenpeter:hal-0158708320172017questionnaire software metadataMoraneGruenpeter7de56c632362954fa84172cad80afe4einria.fr1556733EnglishComputer Science [cs]SoftwareIRILLInitiative pour la Recherche et l'Innovation sur le Logiciel Libre
https://www.irill.org/
Universite Pierre et Marie Curie - Paris 6UPMC
4 place Jussieu - 75005 Paris
http://www.upmc.fr/
Institut National de Recherche en Informatique et en AutomatiqueInria
Domaine de VoluceauRocquencourt - BP 10578153 Le Chesnay Cedex
http://www.inria.fr/en/
Universite Paris Diderot - Paris 7UPD7
5 rue Thomas-Mann - 75205 Paris cedex 13
http://www.univ-paris-diderot.fr
""" # noqa # when response = self.client.post( @@ -478,7 +420,10 @@ deposit_requests = DepositRequest.objects.filter(deposit=deposit) self.assertEqual(len(deposit_requests), 1) - atom_entry_data = self.atom_entry_data2 % external_id.encode('utf-8') + atom_entry_data = b""" + + %s +""" % external_id.encode('utf-8') update_uri = response._headers['location'][1] diff --git a/swh/deposit/tests/api/test_deposit_multipart.py b/swh/deposit/tests/api/test_deposit_multipart.py --- a/swh/deposit/tests/api/test_deposit_multipart.py +++ b/swh/deposit/tests/api/test_deposit_multipart.py @@ -400,3 +400,49 @@ 'application/x-tar) and 1 atom+xml entry for ' 'multipart deposit' in response.content.decode('utf-8') ) + + def test_post_deposit_multipart_400_when_badly_formatted_xml(self): + # given + url = reverse(COL_IRI, args=[self.collection.name]) + + data_atom_entry_ko = b""" + + + urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a + +""" + + archive_content = b'some content representing archive' + archive = InMemoryUploadedFile( + BytesIO(archive_content), + field_name='archive0', + name='archive0', + content_type='application/zip', + size=len(archive_content), + charset=None) + + atom_entry = InMemoryUploadedFile( + BytesIO(data_atom_entry_ko), + field_name='atom0', + name='atom0', + content_type='application/atom+xml; charset="utf-8"', + size=len(data_atom_entry_ko), + charset='utf-8') + + # when + response = self.client.post( + url, + format='multipart', + data={ + 'archive': archive, + 'atom_entry': atom_entry, + }, + # + headers + HTTP_IN_PROGRESS='false', + HTTP_SLUG='external-id', + ) + + self.assertIn(b'Malformed xml metadata', response.content) + self.assertEqual(response.status_code, + status.HTTP_400_BAD_REQUEST)