diff --git a/swh/deposit/parsers.py b/swh/deposit/parsers.py index 9f52a3af..e3213b4b 100644 --- a/swh/deposit/parsers.py +++ b/swh/deposit/parsers.py @@ -1,94 +1,103 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Module in charge of defining parsers with SWORD 2.0 supported mediatypes. """ from xml.parsers.expat import ExpatError from django.conf import settings from rest_framework.parsers import BaseParser, FileUploadParser, MultiPartParser import xmltodict from swh.deposit.errors import ParserError class SWHFileUploadZipParser(FileUploadParser): """File upload parser limited to zip archive. """ media_type = "application/zip" class SWHFileUploadTarParser(FileUploadParser): """File upload parser limited to tarball (tar, tar.gz, tar.*) archives. """ media_type = "application/x-tar" class SWHXMLParser(BaseParser): """ XML parser. """ media_type = "application/xml" def parse(self, stream, media_type=None, parser_context=None): """ Parses the incoming bytestream as XML and returns the resulting data. """ parser_context = parser_context or {} encoding = parser_context.get("encoding", settings.DEFAULT_CHARSET) - data = xmltodict.parse(stream, encoding=encoding, process_namespaces=False) + namespaces = { + "http://www.w3.org/2005/Atom": None, + "http://purl.org/dc/terms/": None, + "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0": "codemeta", + "http://purl.org/net/sword/": "sword", + } + + data = xmltodict.parse( + stream, encoding=encoding, namespaces=namespaces, process_namespaces=True + ) if "entry" in data: data = data["entry"] return data class SWHAtomEntryParser(SWHXMLParser): """Atom entry parser limited to specific mediatype """ media_type = "application/atom+xml;type=entry" def parse(self, stream, media_type=None, parser_context=None): # We do not actually want to parse the stream yet # because we want to keep the raw data as well # this is done later in the atom entry call # (cf. swh.deposit.api.common.APIBase._atom_entry) return stream class SWHMultiPartParser(MultiPartParser): """Multipart parser limited to a subset of mediatypes. """ media_type = "multipart/*; *" def parse_xml(raw_content): """Parse xml body. Args: raw_content (bytes): The content to parse Raises: ParserError in case of a malformed xml Returns: content parsed as dict. """ try: return SWHXMLParser().parse(raw_content) except ExpatError as e: raise ParserError(str(e)) diff --git a/swh/deposit/tests/api/test_deposit_private_read_metadata.py b/swh/deposit/tests/api/test_deposit_private_read_metadata.py index 76d1efde..cf2c2fcf 100644 --- a/swh/deposit/tests/api/test_deposit_private_read_metadata.py +++ b/swh/deposit/tests/api/test_deposit_private_read_metadata.py @@ -1,551 +1,543 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.urls import reverse from rest_framework import status from swh.deposit import __version__ from swh.deposit.config import EDIT_SE_IRI, PRIVATE_GET_DEPOSIT_METADATA, SWH_PERSON from swh.deposit.models import Deposit PRIVATE_GET_DEPOSIT_METADATA_NC = PRIVATE_GET_DEPOSIT_METADATA + "-nc" def private_get_raw_url_endpoints(collection, deposit): """There are 2 endpoints to check (one with collection, one without)""" deposit_id = deposit if isinstance(deposit, int) else deposit.id return [ reverse(PRIVATE_GET_DEPOSIT_METADATA, args=[collection.name, deposit_id]), reverse(PRIVATE_GET_DEPOSIT_METADATA_NC, args=[deposit_id]), ] def update_deposit(authenticated_client, collection, deposit, atom_dataset): for atom_data in ["entry-data2", "entry-data3"]: update_deposit_with_metadata( authenticated_client, collection, deposit, atom_dataset[atom_data] ) return deposit def update_deposit_with_metadata(authenticated_client, collection, deposit, metadata): # update deposit's metadata response = authenticated_client.post( reverse(EDIT_SE_IRI, args=[collection.name, deposit.id]), content_type="application/atom+xml;type=entry", data=metadata, HTTP_SLUG=deposit.external_id, HTTP_IN_PROGRESS=True, ) assert response.status_code == status.HTTP_201_CREATED return deposit def test_read_metadata( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """Private metadata read api to existing deposit should return metadata """ deposit = partial_deposit deposit.external_id = "some-external-id" deposit.save() deposit = update_deposit( authenticated_client, deposit_collection, deposit, atom_dataset ) for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response._headers["content-type"][1] == "application/json" data = response.json() expected_meta = { "origin": { "type": "deposit", "url": "https://hal-test.archives-ouvertes.fr/some-external-id", }, "origin_metadata": { "metadata": { - "@xmlns": ["http://www.w3.org/2005/Atom"], "author": ["some awesome author", "another one", "no one"], "codemeta:dateCreated": "2017-10-07T15:17:08Z", "external_identifier": "some-external-id", "url": "https://hal-test.archives-ouvertes.fr/some-external-id", # noqa }, "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", "version": __version__, }, }, "deposit": { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "author_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "client": "test", "id": deposit.id, "collection": "test", "revision_parents": [], }, } assert data == expected_meta def test_read_metadata_revision_with_parent( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """Private read metadata to a deposit (with parent) returns metadata """ deposit = partial_deposit deposit.external_id = "some-external-id" deposit.save() deposit = update_deposit( authenticated_client, deposit_collection, deposit, atom_dataset ) rev_id = "da78a9d4cf1d5d29873693fd496142e3a18c20fa" swh_id = "swh:1:rev:%s" % rev_id fake_parent = Deposit( swh_id=swh_id, client=deposit.client, collection=deposit.collection ) fake_parent.save() deposit.parent = fake_parent deposit.save() for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response._headers["content-type"][1] == "application/json" data = response.json() expected_meta = { "origin": { "type": "deposit", "url": "https://hal-test.archives-ouvertes.fr/some-external-id", }, "origin_metadata": { "metadata": { - "@xmlns": ["http://www.w3.org/2005/Atom"], "author": ["some awesome author", "another one", "no one"], "codemeta:dateCreated": "2017-10-07T15:17:08Z", "external_identifier": "some-external-id", "url": "https://hal-test.archives-ouvertes.fr/some-external-id", # noqa }, "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", "version": __version__, }, }, "deposit": { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "author_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "client": "test", "id": deposit.id, "collection": "test", "revision_parents": [rev_id], }, } assert data == expected_meta def test_read_metadata_3( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """date(Created|Published) provided, uses author/committer date """ deposit = partial_deposit deposit.external_id = "hal-01243065" deposit.save() deposit = update_deposit( authenticated_client, deposit_collection, deposit, atom_dataset ) # add metadata to the deposit with datePublished and dateCreated codemeta_entry_data = ( atom_dataset["metadata"] % """ 2015-04-06T17:08:47+02:00 2017-05-03T16:08:47+02:00 """ ) update_deposit_with_metadata( authenticated_client, deposit_collection, deposit, codemeta_entry_data ) for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response._headers["content-type"][1] == "application/json" data = response.json() metadata = { - "@xmlns": ["http://www.w3.org/2005/Atom"], - "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", "author": [ "some awesome author", "another one", "no one", {"email": "hal@ccsd.cnrs.fr", "name": "HAL"}, ], "client": "hal", "codemeta:applicationCategory": "test", "codemeta:author": {"codemeta:name": "Morane Gruenpeter"}, "codemeta:dateCreated": [ "2017-10-07T15:17:08Z", "2015-04-06T17:08:47+02:00", ], "codemeta:datePublished": "2017-05-03T16:08:47+02:00", "codemeta:description": "this is the description", "codemeta:developmentStatus": "stable", "codemeta:keywords": "DSP programming", "codemeta:license": [ {"codemeta:name": "GNU General Public License v3.0 only"}, { "codemeta:name": "CeCILL " "Free " "Software " "License " "Agreement " "v1.1" }, ], "codemeta:programmingLanguage": ["php", "python", "C"], "codemeta:runtimePlatform": "phpstorm", "codemeta:url": "https://hal-test.archives-ouvertes.fr/hal-01243065", # noqa "codemeta:version": "1", "external_identifier": ["some-external-id", "hal-01243065"], "id": "hal-01243065", "title": "Composing a Web of Audio Applications", "url": "https://hal-test.archives-ouvertes.fr/some-external-id", } expected_meta = { "origin": { "type": "deposit", "url": "https://hal-test.archives-ouvertes.fr/hal-01243065", }, "origin_metadata": { "metadata": metadata, "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", "version": __version__, }, }, "deposit": { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": { "negative_utc": False, "offset": 120, "timestamp": {"microseconds": 0, "seconds": 1493820527}, }, "author_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1507389428}, }, "client": deposit_collection.name, "id": deposit.id, "collection": deposit_collection.name, "revision_parents": [], }, } assert data == expected_meta def test_read_metadata_4( authenticated_client, deposit_collection, atom_dataset, partial_deposit ): """dateCreated/datePublished not provided, revision uses complete_date """ deposit = partial_deposit codemeta_entry_data = atom_dataset["metadata"] % "" deposit = update_deposit_with_metadata( authenticated_client, deposit_collection, deposit, codemeta_entry_data ) # will use the deposit completed date as fallback date deposit.complete_date = "2016-04-06" deposit.save() for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response._headers["content-type"][1] == "application/json" data = response.json() metadata = { - "@xmlns": "http://www.w3.org/2005/Atom", - "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", "author": {"email": "hal@ccsd.cnrs.fr", "name": "HAL"}, "client": "hal", "codemeta:applicationCategory": "test", "codemeta:author": {"codemeta:name": "Morane Gruenpeter"}, "codemeta:description": "this is the description", "codemeta:developmentStatus": "stable", "codemeta:keywords": "DSP programming", "codemeta:license": [ { "codemeta:name": "GNU " "General " "Public " "License " "v3.0 " "only" }, { "codemeta:name": "CeCILL " "Free " "Software " "License " "Agreement " "v1.1" }, ], "codemeta:programmingLanguage": ["php", "python", "C"], "codemeta:runtimePlatform": "phpstorm", "codemeta:url": "https://hal-test.archives-ouvertes.fr/hal-01243065", "codemeta:version": "1", "external_identifier": "hal-01243065", "id": "hal-01243065", "title": "Composing a Web of Audio Applications", } expected_origin = { "type": "deposit", "url": "https://hal-test.archives-ouvertes.fr/%s" % (deposit.external_id), } expected_origin_metadata = { "metadata": metadata, "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", "version": __version__, }, } expected_deposit_info = { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1459900800}, }, "author_date": { "negative_utc": False, "offset": 0, "timestamp": {"microseconds": 0, "seconds": 1459900800}, }, "client": deposit_collection.name, "id": deposit.id, "collection": deposit_collection.name, "revision_parents": [], } expected_meta = { "origin": expected_origin, "origin_metadata": expected_origin_metadata, "deposit": expected_deposit_info, } assert data == expected_meta def test_read_metadata_5( authenticated_client, deposit_collection, partial_deposit, atom_dataset ): """dateCreated/datePublished provided, revision uses author/committer date If multiple dateCreated provided, the first occurrence (of dateCreated) is selected. If multiple datePublished provided, the first occurrence (of datePublished) is selected. """ deposit = partial_deposit # add metadata to the deposit with multiple datePublished/dateCreated codemeta_entry_data = ( atom_dataset["metadata"] % """ 2015-04-06T17:08:47+02:00 2017-05-03T16:08:47+02:00 2016-04-06T17:08:47+02:00 2018-05-03T16:08:47+02:00 """ ) deposit = update_deposit_with_metadata( authenticated_client, deposit_collection, deposit, codemeta_entry_data ) for url in private_get_raw_url_endpoints(deposit_collection, deposit): response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK assert response._headers["content-type"][1] == "application/json" data = response.json() expected_origin = { "type": "deposit", "url": "https://hal-test.archives-ouvertes.fr/external-id-partial", } metadata = { - "@xmlns": "http://www.w3.org/2005/Atom", - "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", "author": {"email": "hal@ccsd.cnrs.fr", "name": "HAL"}, "client": "hal", "codemeta:applicationCategory": "test", "codemeta:author": {"codemeta:name": "Morane Gruenpeter"}, "codemeta:dateCreated": [ "2015-04-06T17:08:47+02:00", "2016-04-06T17:08:47+02:00", ], "codemeta:datePublished": [ "2017-05-03T16:08:47+02:00", "2018-05-03T16:08:47+02:00", ], "codemeta:description": "this is the description", "codemeta:developmentStatus": "stable", "codemeta:keywords": "DSP programming", "codemeta:license": [ { "codemeta:name": "GNU " "General " "Public " "License " "v3.0 " "only" }, { "codemeta:name": "CeCILL " "Free " "Software " "License " "Agreement " "v1.1" }, ], "codemeta:programmingLanguage": ["php", "python", "C"], "codemeta:runtimePlatform": "phpstorm", "codemeta:url": "https://hal-test.archives-ouvertes.fr/hal-01243065", # noqa "codemeta:version": "1", "external_identifier": "hal-01243065", "id": "hal-01243065", "title": "Composing a Web of Audio Applications", } expected_origin_metadata = { "metadata": metadata, "provider": { "metadata": {}, "provider_name": "", "provider_type": "deposit_client", "provider_url": "https://hal-test.archives-ouvertes.fr/", }, "tool": { "configuration": {"sword_version": "2"}, "name": "swh-deposit", "version": __version__, }, } expected_deposit_info = { "author": SWH_PERSON, "committer": SWH_PERSON, "committer_date": { "negative_utc": False, "offset": 120, "timestamp": {"microseconds": 0, "seconds": 1493820527}, }, "author_date": { "negative_utc": False, "offset": 120, "timestamp": {"microseconds": 0, "seconds": 1428332927}, }, "client": deposit_collection.name, "id": deposit.id, "collection": deposit_collection.name, "revision_parents": [], } expected_meta = { "origin": expected_origin, "origin_metadata": expected_origin_metadata, "deposit": expected_deposit_info, } assert data == expected_meta def test_access_to_nonexisting_deposit_returns_404_response( authenticated_client, deposit_collection, ): """Read unknown collection should return a 404 response """ unknown_id = 999 try: Deposit.objects.get(pk=unknown_id) except Deposit.DoesNotExist: assert True for url in private_get_raw_url_endpoints(deposit_collection, unknown_id): response = authenticated_client.get(url) assert response.status_code == status.HTTP_404_NOT_FOUND msg = "Deposit with id %s does not exist" % unknown_id assert msg in response.content.decode("utf-8") diff --git a/swh/deposit/tests/api/test_parser.py b/swh/deposit/tests/api/test_parser.py index 0adea4f5..d29d27c4 100644 --- a/swh/deposit/tests/api/test_parser.py +++ b/swh/deposit/tests/api/test_parser.py @@ -1,133 +1,129 @@ # Copyright (C) 2018-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import OrderedDict import io from swh.deposit.parsers import SWHXMLParser def test_parsing_without_duplicates(): xml_no_duplicate = io.BytesIO( b""" Awesome Compiler GPL3.0 https://opensource.org/licenses/GPL-3.0 Python3 author1 Inria ocaml http://issuetracker.com """ ) actual_result = SWHXMLParser().parse(xml_no_duplicate) expected_dict = OrderedDict( [ - ("@xmlns", "http://www.w3.org/2005/Atom"), - ("@xmlns:codemeta", "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0"), ("title", "Awesome Compiler"), ( "codemeta:license", OrderedDict( [ ("codemeta:name", "GPL3.0"), ("codemeta:url", "https://opensource.org/licenses/GPL-3.0"), ] ), ), ("codemeta:runtimePlatform", "Python3"), ( "codemeta:author", OrderedDict( [("codemeta:name", "author1"), ("codemeta:affiliation", "Inria")] ), ), ("codemeta:programmingLanguage", "ocaml"), ("codemeta:issueTracker", "http://issuetracker.com"), ] ) assert expected_dict == actual_result def test_parsing_with_duplicates(): xml_with_duplicates = io.BytesIO( b""" Another Compiler GNU/Linux GPL3.0 https://opensource.org/licenses/GPL-3.0 Un*x author1 Inria author2 Inria ocaml haskell spdx http://spdx.org python3 """ ) actual_result = SWHXMLParser().parse(xml_with_duplicates) expected_dict = OrderedDict( [ - ("@xmlns", "http://www.w3.org/2005/Atom"), - ("@xmlns:codemeta", "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0"), ("title", "Another Compiler"), ("codemeta:runtimePlatform", ["GNU/Linux", "Un*x"]), ( "codemeta:license", [ OrderedDict( [ ("codemeta:name", "GPL3.0"), ("codemeta:url", "https://opensource.org/licenses/GPL-3.0"), ] ), OrderedDict( [("codemeta:name", "spdx"), ("codemeta:url", "http://spdx.org")] ), ], ), ( "codemeta:author", [ OrderedDict( [ ("codemeta:name", "author1"), ("codemeta:affiliation", "Inria"), ] ), OrderedDict( [ ("codemeta:name", "author2"), ("codemeta:affiliation", "Inria"), ] ), ], ), ("codemeta:programmingLanguage", ["ocaml", "haskell", "python3"]), ] ) assert expected_dict == actual_result diff --git a/swh/deposit/tests/data/atom/entry-data3.xml b/swh/deposit/tests/data/atom/entry-data3.xml index c75d9739..0e5b7dc6 100644 --- a/swh/deposit/tests/data/atom/entry-data3.xml +++ b/swh/deposit/tests/data/atom/entry-data3.xml @@ -1,6 +1,6 @@ - + another one no one 2017-10-07T15:17:08Z