diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py --- a/swh/deposit/api/common.py +++ b/swh/deposit/api/common.py @@ -28,6 +28,7 @@ from swh.deposit.api.converters import convert_status_detail from swh.deposit.auth import HasDepositPermission, KeycloakBasicAuthentication from swh.deposit.models import Deposit +from swh.deposit.parsers import parse_xml from swh.deposit.utils import NAMESPACES, compute_metadata_context from swh.model import hashutil from swh.model.model import ( @@ -53,7 +54,6 @@ DEPOSIT_STATUS_PARTIAL, EDIT_IRI, EM_IRI, - METADATA_KEY, METADATA_TYPE, RAW_METADATA_KEY, SE_IRI, @@ -74,7 +74,6 @@ ParserError, ) from ..models import DepositClient, DepositCollection, DepositRequest -from ..parsers import parse_xml from ..utils import ( extended_swhid_from_qualified, parse_swh_deposit_origin, @@ -319,15 +318,11 @@ ) deposit_request.save() - metadata = deposit_request_data.get(METADATA_KEY) - if metadata: - # TODO: remove non-raw metadata? we don't use these anymore except in - # manual queries to the deposit DB - raw_metadata = deposit_request_data[RAW_METADATA_KEY] + raw_metadata = deposit_request_data.get(RAW_METADATA_KEY) + if raw_metadata: deposit_request = DepositRequest( type=METADATA_TYPE, deposit=deposit, - metadata=metadata, raw_metadata=raw_metadata.decode("utf-8"), ) deposit_request.save() @@ -505,9 +500,7 @@ archive=filehandler.name, ) - def _read_metadata( - self, metadata_stream - ) -> Tuple[bytes, Dict[str, Any], ElementTree.Element]: + def _read_metadata(self, metadata_stream) -> Tuple[bytes, ElementTree.Element]: """ Given a metadata stream, reads the metadata and returns the metadata in three forms: @@ -517,11 +510,8 @@ * parsed as ElementTree, to extract information immediately """ raw_metadata = metadata_stream.read() - metadata_dict = parse_xml(raw_metadata) - metadata_tree = ElementTree.fromstring(raw_metadata) - # TODO: remove metadata_dict? we don't use it anymore, except in manual - # queries to the deposit DB - return raw_metadata, metadata_dict, metadata_tree + metadata_tree = parse_xml(raw_metadata) + return raw_metadata, metadata_tree def _multipart_upload( self, @@ -608,7 +598,7 @@ self._check_file_md5sum(filehandler, headers.content_md5sum) try: - raw_metadata, metadata_dict, metadata_tree = self._read_metadata( + raw_metadata, metadata_tree = self._read_metadata( data["application/atom+xml"] ) except ParserError: @@ -627,7 +617,6 @@ ) deposit_request_data = { ARCHIVE_KEY: filehandler, - METADATA_KEY: metadata_dict, RAW_METADATA_KEY: raw_metadata, } self._deposit_request_put( @@ -646,7 +635,6 @@ self, deposit: Deposit, swhid_reference: Union[str, QualifiedSWHID], - metadata_dict: Dict, metadata_tree: ElementTree.Element, raw_metadata: bytes, deposit_origin: Optional[str] = None, @@ -663,8 +651,6 @@ Args: deposit: Deposit reference swhid_reference: The swhid or the origin to attach metadata information to - metadata_dict: Full dict of metadata for storage in the deposit DB as jsonb - (parsed out of raw_metadata) metadata_tree: Full element tree of metadata to check for validity (parsed out of raw_metadata) raw_metadata: The actual raw metadata to send in the storage metadata @@ -696,7 +682,6 @@ # replace metadata within the deposit backend deposit_request_data = { - METADATA_KEY: metadata_dict, RAW_METADATA_KEY: raw_metadata, } @@ -839,9 +824,7 @@ ) try: - raw_metadata, metadata_dict, metadata_tree = self._read_metadata( - metadata_stream - ) + raw_metadata, metadata_tree = self._read_metadata(metadata_stream) except ParserError: raise DepositError( BAD_REQUEST, @@ -850,7 +833,7 @@ "Please ensure your metadata file is correctly formatted.", ) - if metadata_dict is None: + if len(metadata_tree) == 0: raise DepositError( BAD_REQUEST, empty_atom_entry_summary, empty_atom_entry_desc ) @@ -878,7 +861,7 @@ if swhid_ref is not None: deposit.save() # We need a deposit id target_swhid, depo, depo_request = self._store_metadata_deposit( - deposit, swhid_ref, metadata_dict, metadata_tree, raw_metadata + deposit, swhid_ref, metadata_tree, raw_metadata ) deposit.status = DEPOSIT_STATUS_LOAD_SUCCESS @@ -902,7 +885,7 @@ self._deposit_request_put( deposit, - {METADATA_KEY: metadata_dict, RAW_METADATA_KEY: raw_metadata}, + {RAW_METADATA_KEY: raw_metadata}, replace_metadata, replace_archives, ) diff --git a/swh/deposit/api/edit.py b/swh/deposit/api/edit.py --- a/swh/deposit/api/edit.py +++ b/swh/deposit/api/edit.py @@ -108,9 +108,7 @@ ) try: - raw_metadata, metadata_dict, metadata_tree = self._read_metadata( - request.data - ) + raw_metadata, metadata_tree = self._read_metadata(request.data) except ParserError: raise DepositError( BAD_REQUEST, @@ -119,7 +117,7 @@ "Please ensure your metadata file is correctly formatted.", ) - if not metadata_dict: + if len(metadata_tree) == 0: raise DepositError( BAD_REQUEST, "Empty body request is not supported", @@ -130,7 +128,6 @@ _, deposit, deposit_request = self._store_metadata_deposit( deposit, QualifiedSWHID.from_string(swhid), - metadata_dict, metadata_tree, raw_metadata, deposit.origin_url, diff --git a/swh/deposit/cli/client.py b/swh/deposit/cli/client.py --- a/swh/deposit/cli/client.py +++ b/swh/deposit/cli/client.py @@ -19,6 +19,7 @@ import click from swh.deposit.cli import deposit +from swh.deposit.utils import NAMESPACES logger = logging.getLogger(__name__) @@ -160,10 +161,9 @@ if "error" in sd_content: msg = sd_content["error"] raise InputError(f"Service document retrieval: {msg}") - collection = sd_content["app:service"]["app:workspace"]["app:collection"][ - "sword:name" - ] - return collection + return sd_content.findtext( + "app:workspace/app:collection/sword:name", namespaces=NAMESPACES + ) def client_command_parse_input( diff --git a/swh/deposit/client.py b/swh/deposit/client.py --- a/swh/deposit/client.py +++ b/swh/deposit/client.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2021 The Software Heritage developers +# Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -10,9 +10,10 @@ import hashlib import logging import os -from typing import Any, Dict, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple from urllib.parse import urljoin import warnings +from xml.etree import ElementTree import requests from requests import Response @@ -20,7 +21,7 @@ from swh.core.config import load_from_envvar from swh.deposit import __version__ as swh_deposit_version -from swh.deposit.utils import parse_xml +from swh.deposit.utils import NAMESPACES logger = logging.getLogger(__name__) @@ -303,12 +304,13 @@ 'detail': Some more detail about the error if any """ - data = parse_xml(xml_content) - sword_error = data["sword:error"] + data = ElementTree.fromstring(xml_content) return { - "summary": sword_error["atom:summary"], - "detail": sword_error.get("detail", ""), - "sword:verboseDescription": sword_error.get("sword:verboseDescription", ""), + "summary": data.findtext("atom:summary", namespaces=NAMESPACES), + "detail": data.findtext("detail", "", namespaces=NAMESPACES).strip(), + "sword:verboseDescription": data.findtext( + "sword:verboseDescription", "", namespaces=NAMESPACES + ).strip(), } def do_execute(self, method: str, url: str, info: Dict, **kwargs) -> Response: @@ -397,7 +399,44 @@ """Parse service document's success response. """ - return parse_xml(xml_content) + namespaces = dict(NAMESPACES) + namespaces["dcterms"] = namespaces.pop("dc") # for backward compatibility + single_keys = [ + "atom:title", + "sword:collectionPolicy", + "dcterms:abstract", + "sword:treatment", + "sword:mediation", + "sword:metadataRelevantHeader", + "sword:service", + "sword:name", + ] + multi_keys = [ + "app:accept", + "sword:acceptPackaging", + ] + data = ElementTree.fromstring(xml_content) + workspace: List[Dict[str, Any]] = [ + { + "app:collection": { + **{ + key: collection.findtext(key, namespaces=namespaces) + for key in single_keys + }, + **{ + key: [ + elt.text + for elt in collection.findall(key, namespaces=namespaces) + ] + for key in multi_keys + }, + } + } + for collection in data.findall( + "app:workspace/app:collection", namespaces=namespaces + ) + ] + return {"app:service": {"app:workspace": workspace}} def parse_result_error(self, xml_content: str) -> Dict[str, Any]: result = super().parse_result_error(xml_content) @@ -434,7 +473,7 @@ """Given an xml content as string, returns a deposit dict. """ - data = parse_xml(xml_content) + data = ElementTree.fromstring(xml_content) keys = [ "deposit_id", "deposit_status", @@ -443,7 +482,7 @@ "deposit_swh_id_context", "deposit_external_id", ] - return {key: data.get("swh:" + key) for key in keys} + return {key: data.findtext("swh:" + key, namespaces=NAMESPACES) for key in keys} class CollectionListDepositClient(BaseDepositClient): @@ -481,8 +520,8 @@ """ link_header = headers.get("Link", "") if headers else "" links = parse_header_links(link_header) - data = parse_xml(xml_content)["atom:feed"] - total_result = data.get("swh:count", 0) + data = ElementTree.fromstring(xml_content) + total_result = data.findtext("swh:count", "0", namespaces=NAMESPACES).strip() keys = [ "id", "reception_date", @@ -494,13 +533,12 @@ "swhid_context", "origin_url", ] - entries_ = data.get("atom:entry", []) - entries = [entries_] if isinstance(entries_, dict) else entries_ + entries = data.findall("atom:entry", namespaces=NAMESPACES) deposits_d = [ { - key: deposit.get(f"swh:{key}") + key: deposit.findtext(f"swh:{key}", namespaces=NAMESPACES) for key in keys - if deposit.get(f"swh:{key}") is not None + if deposit.find(f"swh:{key}", namespaces=NAMESPACES) is not None } for deposit in entries ] @@ -538,14 +576,14 @@ """Given an xml content as string, returns a deposit dict. """ - data = parse_xml(xml_content) + data = ElementTree.fromstring(xml_content) keys = [ "deposit_id", "deposit_status", "deposit_status_detail", "deposit_date", ] - return {key: data.get("swh:" + key) for key in keys} + return {key: data.findtext("swh:" + key, namespaces=NAMESPACES) for key in keys} def compute_headers(self, info: Dict[str, Any]) -> Dict[str, Any]: return info @@ -648,13 +686,13 @@ """Given an xml content as string, returns a deposit dict. """ - data = parse_xml(xml_content) + data = ElementTree.fromstring(xml_content) keys = [ "deposit_id", "deposit_status", "deposit_date", ] - return {key: data.get("swh:" + key) for key in keys} + return {key: data.findtext("swh:" + key, namespaces=NAMESPACES) for key in keys} class CreateMultipartDepositClient(BaseCreateDepositClient): diff --git a/swh/deposit/config.py b/swh/deposit/config.py --- a/swh/deposit/config.py +++ b/swh/deposit/config.py @@ -29,7 +29,6 @@ PRIVATE_LIST_DEPOSITS = "private-deposit-list" ARCHIVE_KEY = "archive" -METADATA_KEY = "metadata" RAW_METADATA_KEY = "raw-metadata" ARCHIVE_TYPE = "archive" diff --git a/swh/deposit/parsers.py b/swh/deposit/parsers.py --- a/swh/deposit/parsers.py +++ b/swh/deposit/parsers.py @@ -9,13 +9,12 @@ """ import logging -from xml.parsers.expat import ExpatError +from xml.etree import ElementTree from django.conf import settings from rest_framework.parsers import BaseParser, FileUploadParser, MultiPartParser from swh.deposit.errors import ParserError -from swh.deposit.utils import parse_xml as _parse_xml logger = logging.getLogger(__name__) @@ -49,7 +48,8 @@ """ parser_context = parser_context or {} encoding = parser_context.get("encoding", settings.DEFAULT_CHARSET) - return _parse_xml(stream, encoding=encoding) + parser = ElementTree.XMLParser(encoding=encoding) + return ElementTree.parse(stream, parser=parser) class SWHAtomEntryParser(SWHXMLParser): @@ -89,6 +89,6 @@ """ try: - return SWHXMLParser().parse(raw_content) - except ExpatError as e: + return ElementTree.fromstring(raw_content) + except ElementTree.ParseError as e: raise ParserError(str(e)) diff --git a/swh/deposit/tests/api/conftest.py b/swh/deposit/tests/api/conftest.py --- a/swh/deposit/tests/api/conftest.py +++ b/swh/deposit/tests/api/conftest.py @@ -17,6 +17,7 @@ ) from swh.deposit.models import Deposit from swh.deposit.parsers import parse_xml +from swh.deposit.utils import NAMESPACES @pytest.fixture @@ -81,7 +82,7 @@ ) response_content = parse_xml(response.content) - deposit_id = int(response_content["swh:deposit_id"]) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) deposit.status = DEPOSIT_STATUS_DEPOSITED deposit.save() diff --git a/swh/deposit/tests/api/test_collection_add_to_origin.py b/swh/deposit/tests/api/test_collection_add_to_origin.py --- a/swh/deposit/tests/api/test_collection_add_to_origin.py +++ b/swh/deposit/tests/api/test_collection_add_to_origin.py @@ -3,8 +3,6 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from io import BytesIO - from django.urls import reverse_lazy as reverse from rest_framework import status @@ -12,6 +10,7 @@ from swh.deposit.models import Deposit from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import post_atom +from swh.deposit.utils import NAMESPACES from ..conftest import internal_create_deposit @@ -40,8 +39,8 @@ ) assert response.status_code == status.HTTP_201_CREATED, response.content.decode() - response_content = parse_xml(BytesIO(response.content)) - deposit_id = response_content["swh:deposit_id"] + response_content = parse_xml(response.content) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) assert deposit_id != deposit.id diff --git a/swh/deposit/tests/api/test_collection_list.py b/swh/deposit/tests/api/test_collection_list.py --- a/swh/deposit/tests/api/test_collection_list.py +++ b/swh/deposit/tests/api/test_collection_list.py @@ -3,8 +3,6 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from io import BytesIO - from django.urls import reverse_lazy as reverse from requests.utils import parse_header_links from rest_framework import status @@ -12,6 +10,7 @@ from swh.deposit.config import COL_IRI, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_PARTIAL from swh.deposit.models import DepositCollection from swh.deposit.parsers import parse_xml +from swh.deposit.utils import NAMESPACES def test_deposit_collection_list_is_auth_protected(anonymous_client): @@ -64,9 +63,9 @@ response = authenticated_client.get(f"{url}?page_size=1") assert response.status_code == status.HTTP_200_OK - data = parse_xml(BytesIO(response.content))["atom:feed"] + data = parse_xml(response.content) assert ( - data["swh:count"] == "2" + data.findtext("swh:count", namespaces=NAMESPACES) == "2" ) # total result of 2 deposits if consuming all results header_link = parse_header_links(response["Link"]) assert len(header_link) == 1 # only 1 next link @@ -75,17 +74,21 @@ assert header_link[0]["rel"] == "next" # only one deposit in the response - deposit = data["atom:entry"] # dict as only 1 value (a-la js) - assert isinstance(deposit, dict) - assert deposit["swh:id"] == deposit_id - assert deposit["swh:status"] == DEPOSIT_STATUS_PARTIAL + assert len(data.findall("atom:entry", namespaces=NAMESPACES)) == 1 + assert data.findtext("atom:entry/swh:id", namespaces=NAMESPACES) == str(deposit_id) + assert ( + data.findtext("atom:entry/swh:status", namespaces=NAMESPACES) + == DEPOSIT_STATUS_PARTIAL + ) # then 2nd page response2 = authenticated_client.get(expected_next) assert response2.status_code == status.HTTP_200_OK - data2 = parse_xml(BytesIO(response2.content))["atom:feed"] - assert data2["swh:count"] == "2" # still total of 2 deposits across all results + data2 = parse_xml(response2.content) + assert ( + data2.findtext("swh:count", namespaces=NAMESPACES) == "2" + ) # still total of 2 deposits across all results expected_previous = f"{url}?page_size=1" header_link2 = parse_header_links(response2["Link"]) @@ -94,20 +97,26 @@ assert header_link2[0]["rel"] == "previous" # only 1 deposit in the response - deposit2 = data2["atom:entry"] # dict as only 1 value (a-la js) - assert isinstance(deposit2, dict) - assert deposit2["swh:id"] == deposit_id2 - assert deposit2["swh:status"] == DEPOSIT_STATUS_DEPOSITED + assert len(data2.findall("atom:entry", namespaces=NAMESPACES)) == 1 + assert data2.findtext("atom:entry/swh:id", namespaces=NAMESPACES) == str( + deposit_id2 + ) + assert ( + data2.findtext("atom:entry/swh:status", namespaces=NAMESPACES) + == DEPOSIT_STATUS_DEPOSITED + ) # Retrieve every deposit in one query (no page_size parameter) response3 = authenticated_client.get(url) assert response3.status_code == status.HTTP_200_OK - data3 = parse_xml(BytesIO(response3.content))["atom:feed"] - assert data3["swh:count"] == "2" # total result of 2 deposits across all results - deposits3 = data3["atom:entry"] # list here + data3 = parse_xml(response3.content) + assert ( + data3.findtext("swh:count", namespaces=NAMESPACES) == "2" + ) # total result of 2 deposits across all results + deposits3 = data3.findall("atom:entry/swh:id", namespaces=NAMESPACES) # list here assert isinstance(deposits3, list) assert len(deposits3) == 2 header_link3 = parse_header_links(response3["Link"]) assert header_link3 == [] # no pagination as all results received in one round - assert deposit in deposits3 - assert deposit2 in deposits3 + assert deposits3[0].text == str(deposit_id) + assert deposits3[1].text == str(deposit_id2) diff --git a/swh/deposit/tests/api/test_collection_post_atom.py b/swh/deposit/tests/api/test_collection_post_atom.py --- a/swh/deposit/tests/api/test_collection_post_atom.py +++ b/swh/deposit/tests/api/test_collection_post_atom.py @@ -5,10 +5,10 @@ """Tests the handling of the Atom content when doing a POST Col-IRI.""" -from io import BytesIO import textwrap import uuid import warnings +from xml.etree import ElementTree import attr from django.urls import reverse_lazy as reverse @@ -22,9 +22,12 @@ APIConfig, ) from swh.deposit.models import Deposit, DepositCollection, DepositRequest -from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import post_atom -from swh.deposit.utils import compute_metadata_context, extended_swhid_from_qualified +from swh.deposit.utils import ( + NAMESPACES, + compute_metadata_context, + extended_swhid_from_qualified, +) from swh.model.hypothesis_strategies import ( directories, present_contents, @@ -121,14 +124,16 @@ # then assert response.status_code == status.HTTP_201_CREATED, response.content.decode() - response_content = parse_xml(BytesIO(response.content)) - deposit_id = response_content["swh:deposit_id"] + response_content = ElementTree.fromstring(response.content) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) dr = DepositRequest.objects.get(deposit=deposit) - assert dr.metadata is not None - sw_version = dr.metadata.get("codemeta:softwareVersion") + assert dr.raw_metadata is not None + sw_version = ElementTree.fromstring(dr.raw_metadata).findtext( + "codemeta:softwareVersion", namespaces=NAMESPACES + ) assert sw_version == "10.4" @@ -145,7 +150,9 @@ data=atom_content, HTTP_SLUG="external-id", ) - assert response.status_code == status.HTTP_400_BAD_REQUEST + assert ( + response.status_code == status.HTTP_400_BAD_REQUEST + ), response.content.decode() assert b"Empty body request is not supported" in response.content @@ -259,8 +266,8 @@ ) assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(BytesIO(response.content)) - deposit_id = response_content["swh:deposit_id"] + response_content = ElementTree.fromstring(response.content) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection @@ -288,8 +295,8 @@ ) assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(BytesIO(response.content)) - deposit_id = response_content["swh:deposit_id"] + response_content = ElementTree.fromstring(response.content) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection @@ -318,8 +325,8 @@ ) assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(BytesIO(response.content)) - deposit_id = response_content["swh:deposit_id"] + response_content = ElementTree.fromstring(response.content) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection @@ -443,8 +450,8 @@ # then assert response.status_code == status.HTTP_201_CREATED, response.content.decode() - response_content = parse_xml(BytesIO(response.content)) - deposit_id = response_content["swh:deposit_id"] + response_content = ElementTree.fromstring(response.content) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection @@ -453,7 +460,6 @@ # one associated request to a deposit deposit_request = DepositRequest.objects.get(deposit=deposit) - assert deposit_request.metadata is not None assert deposit_request.raw_metadata == atom_entry_data assert bool(deposit_request.archive) is False @@ -482,9 +488,9 @@ # then assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(BytesIO(response.content)) + response_content = ElementTree.fromstring(response.content) - deposit_id = response_content["swh:deposit_id"] + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection @@ -493,7 +499,6 @@ # one associated request to a deposit deposit_request = DepositRequest.objects.get(deposit=deposit) - assert deposit_request.metadata is not None assert deposit_request.raw_metadata == atom_entry_data assert bool(deposit_request.archive) is False @@ -572,10 +577,10 @@ ) assert response.status_code == status.HTTP_201_CREATED, response.content.decode() - response_content = parse_xml(BytesIO(response.content)) + response_content = ElementTree.fromstring(response.content) # Ensure the deposit is finalized - deposit_id = int(response_content["swh:deposit_id"]) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.swhid == str(swhid_target) assert deposit.swhid_context == str(swhid_reference) @@ -650,9 +655,9 @@ ) assert response.status_code == status.HTTP_201_CREATED, response.content.decode() - response_content = parse_xml(BytesIO(response.content)) + response_content = ElementTree.fromstring(response.content) # Ensure the deposit is finalized - deposit_id = int(response_content["swh:deposit_id"]) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) # we got not swhid as input so we cannot have those assert deposit.swhid is None @@ -738,8 +743,10 @@ assert ( response.status_code == status.HTTP_400_BAD_REQUEST ), response.content.decode() - response_content = parse_xml(BytesIO(response.content)) - assert "object does not exist" in response_content["sword:error"]["atom:summary"] + response_content = ElementTree.fromstring(response.content) + assert "object does not exist" in response_content.findtext( + "atom:summary", namespaces=NAMESPACES + ) @pytest.mark.parametrize( @@ -767,8 +774,10 @@ assert ( response.status_code == status.HTTP_400_BAD_REQUEST ), response.content.decode() - response_content = parse_xml(BytesIO(response.content)) - assert "Invalid SWHID reference" in response_content["sword:error"]["atom:summary"] + response_content = ElementTree.fromstring(response.content) + assert "Invalid SWHID reference" in response_content.findtext( + "atom:summary", namespaces=NAMESPACES + ) def test_deposit_metadata_unknown_origin( @@ -788,5 +797,7 @@ assert ( response.status_code == status.HTTP_400_BAD_REQUEST ), response.content.decode() - response_content = parse_xml(BytesIO(response.content)) - assert "known to the archive" in response_content["sword:error"]["atom:summary"] + response_content = ElementTree.fromstring(response.content) + assert "known to the archive" in response_content.findtext( + "atom:summary", namespaces=NAMESPACES + ) diff --git a/swh/deposit/tests/api/test_collection_post_binary.py b/swh/deposit/tests/api/test_collection_post_binary.py --- a/swh/deposit/tests/api/test_collection_post_binary.py +++ b/swh/deposit/tests/api/test_collection_post_binary.py @@ -5,7 +5,6 @@ """Tests the handling of the binary content when doing a POST Col-IRI.""" -from io import BytesIO import uuid from django.urls import reverse_lazy as reverse @@ -20,6 +19,7 @@ create_arborescence_archive, post_archive, ) +from swh.deposit.utils import NAMESPACES def test_post_deposit_binary_no_slug( @@ -39,8 +39,8 @@ ) assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(BytesIO(response.content)) - deposit_id = response_content["swh:deposit_id"] + response_content = parse_xml(response.content) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection @@ -96,9 +96,9 @@ ) # then - response_content = parse_xml(BytesIO(response.content)) + response_content = parse_xml(response.content) assert response.status_code == status.HTTP_201_CREATED - deposit_id = response_content["swh:deposit_id"] + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_DEPOSITED @@ -112,16 +112,34 @@ assert deposit_request.metadata is None assert deposit_request.raw_metadata is None - response_content = parse_xml(BytesIO(response.content)) + response_content = parse_xml(response.content) - assert response_content["swh:deposit_archive"] == sample_archive["name"] - assert int(response_content["swh:deposit_id"]) == deposit.id - assert response_content["swh:deposit_status"] == deposit.status + assert ( + response_content.findtext("swh:deposit_archive", namespaces=NAMESPACES) + == sample_archive["name"] + ) + assert ( + int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) + == deposit.id + ) + assert ( + response_content.findtext("swh:deposit_status", namespaces=NAMESPACES) + == deposit.status + ) # deprecated tags - assert response_content["atom:deposit_archive"] == sample_archive["name"] - assert int(response_content["atom:deposit_id"]) == deposit.id - assert response_content["atom:deposit_status"] == deposit.status + assert ( + response_content.findtext("atom:deposit_archive", namespaces=NAMESPACES) + == sample_archive["name"] + ) + assert ( + int(response_content.findtext("atom:deposit_id", namespaces=NAMESPACES)) + == deposit.id + ) + assert ( + response_content.findtext("atom:deposit_status", namespaces=NAMESPACES) + == deposit.status + ) from django.urls import reverse as reverse_strict @@ -303,8 +321,8 @@ # then assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(BytesIO(response.content)) - deposit_id = response_content["swh:deposit_id"] + response_content = parse_xml(response.content) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) @@ -324,8 +342,10 @@ assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(BytesIO(response.content)) - deposit_id2 = response_content["swh:deposit_id"] + response_content = parse_xml(response.content) + deposit_id2 = int( + response_content.findtext("swh:deposit_id", namespaces=NAMESPACES) + ) deposit2 = Deposit.objects.get(pk=deposit_id2) diff --git a/swh/deposit/tests/api/test_collection_post_multipart.py b/swh/deposit/tests/api/test_collection_post_multipart.py --- a/swh/deposit/tests/api/test_collection_post_multipart.py +++ b/swh/deposit/tests/api/test_collection_post_multipart.py @@ -17,6 +17,7 @@ from swh.deposit.models import Deposit, DepositRequest from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import check_archive, post_multipart +from swh.deposit.utils import NAMESPACES def test_post_deposit_multipart( @@ -43,8 +44,8 @@ ) assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(BytesIO(response.content)) - deposit_id = response_content["swh:deposit_id"] + response_content = parse_xml(response.content) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection @@ -76,8 +77,8 @@ ) assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(BytesIO(response.content)) - deposit_id = response_content["swh:deposit_id"] + response_content = parse_xml(response.content) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection @@ -109,8 +110,8 @@ # then assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(BytesIO(response.content)) - deposit_id = response_content["swh:deposit_id"] + response_content = parse_xml(response.content) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_DEPOSITED @@ -124,11 +125,12 @@ assert deposit_request.deposit == deposit if deposit_request.type == "archive": check_archive(sample_archive["name"], deposit_request.archive.name) - assert deposit_request.metadata is None assert deposit_request.raw_metadata is None else: assert ( - deposit_request.metadata["atom:id"] + parse_xml(deposit_request.raw_metadata).findtext( + "atom:id", namespaces=NAMESPACES + ) == "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" ) assert deposit_request.raw_metadata == data_atom_entry @@ -158,8 +160,8 @@ # then assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(BytesIO(response.content)) - deposit_id = response_content["swh:deposit_id"] + response_content = parse_xml(response.content) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_DEPOSITED @@ -173,11 +175,12 @@ assert deposit_request.deposit == deposit if deposit_request.type == "archive": check_archive(sample_archive["name"], deposit_request.archive.name) - assert deposit_request.metadata is None assert deposit_request.raw_metadata is None else: assert ( - deposit_request.metadata["atom:id"] + parse_xml(deposit_request.raw_metadata).findtext( + "atom:id", namespaces=NAMESPACES + ) == "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" ) assert deposit_request.raw_metadata == data_atom_entry @@ -208,8 +211,8 @@ # then assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(BytesIO(response.content)) - deposit_id = response_content["swh:deposit_id"] + response_content = parse_xml(response.content) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == "partial" @@ -226,7 +229,9 @@ check_archive(sample_archive["name"], deposit_request.archive.name) else: assert ( - deposit_request.metadata["atom:id"] + parse_xml(deposit_request.raw_metadata).findtext( + "atom:id", namespaces=NAMESPACES + ) == "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" ) assert deposit_request.raw_metadata == data_atom_entry @@ -256,7 +261,9 @@ check_archive(sample_archive["name"], deposit_request.archive.name) else: assert ( - deposit_request.metadata["atom:id"] + parse_xml(deposit_request.raw_metadata).findtext( + "atom:id", namespaces=NAMESPACES + ) == "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a" ) assert ( diff --git a/swh/deposit/tests/api/test_collection_reuse_slug.py b/swh/deposit/tests/api/test_collection_reuse_slug.py --- a/swh/deposit/tests/api/test_collection_reuse_slug.py +++ b/swh/deposit/tests/api/test_collection_reuse_slug.py @@ -3,11 +3,8 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from io import BytesIO - from django.urls import reverse_lazy as reverse from rest_framework import status -import xmltodict from swh.deposit.config import ( COL_IRI, @@ -19,6 +16,7 @@ from swh.deposit.models import Deposit from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import post_atom +from swh.deposit.utils import NAMESPACES from ..conftest import internal_create_deposit @@ -36,8 +34,9 @@ ) assert response.status_code == status.HTTP_400_BAD_REQUEST + print(response.content) assert ( - xmltodict.parse(response.content)["sword:error"]["summary"] + parse_xml(response.content).findtext("atom:summary", namespaces=NAMESPACES) == f"You can only act on deposit with status '{DEPOSIT_STATUS_PARTIAL}'" ) @@ -65,8 +64,8 @@ ) assert response.status_code == status.HTTP_201_CREATED, response.content.decode() - response_content = parse_xml(BytesIO(response.content)) - deposit_id = response_content["swh:deposit_id"] + response_content = parse_xml(response.content) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) assert deposit_id != deposit.id # new deposit @@ -97,8 +96,8 @@ ) assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(BytesIO(response.content)) - deposit_id = response_content["swh:deposit_id"] + response_content = parse_xml(response.content) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) assert deposit_id != deposit.id @@ -134,8 +133,8 @@ ) assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(BytesIO(response.content)) - deposit_id = response_content["swh:deposit_id"] + response_content = parse_xml(response.content) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) assert deposit_id != deposit.id @@ -174,8 +173,8 @@ ) assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(BytesIO(response.content)) - deposit_id = response_content["swh:deposit_id"] + response_content = parse_xml(response.content) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) assert deposit_id != deposit.id @@ -220,8 +219,8 @@ ) assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(BytesIO(response.content)) - deposit_id = response_content["swh:deposit_id"] + response_content = parse_xml(response.content) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) assert other_deposit.id != deposit_id @@ -269,8 +268,8 @@ ) assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(BytesIO(response.content)) - deposit_id = response_content["swh:deposit_id"] + response_content = parse_xml(response.content) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) assert deposit_id != deposit.id assert other_deposit.id != deposit.id diff --git a/swh/deposit/tests/api/test_delete.py b/swh/deposit/tests/api/test_delete.py --- a/swh/deposit/tests/api/test_delete.py +++ b/swh/deposit/tests/api/test_delete.py @@ -11,11 +11,11 @@ import xmltodict from swh.deposit.config import ( - ARCHIVE_KEY, + ARCHIVE_TYPE, DEPOSIT_STATUS_DEPOSITED, EDIT_IRI, EM_IRI, - METADATA_KEY, + METADATA_TYPE, ) from swh.deposit.models import Deposit, DepositRequest @@ -39,7 +39,7 @@ # deposit request type: 'archive', 1 'metadata' deposit_request_types = count_deposit_request_types(deposit_requests) - assert deposit_request_types == {ARCHIVE_KEY: 1, METADATA_KEY: 1} + assert deposit_request_types == {ARCHIVE_TYPE: 1, METADATA_TYPE: 1} # when update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit_id]) @@ -52,7 +52,7 @@ deposit_requests2 = DepositRequest.objects.filter(deposit=deposit) deposit_request_types = count_deposit_request_types(deposit_requests2) - assert deposit_request_types == {METADATA_KEY: 1} + assert deposit_request_types == {METADATA_TYPE: 1} def test_delete_archive_on_undefined_deposit_fails( diff --git a/swh/deposit/tests/api/test_deposit_private_check.py b/swh/deposit/tests/api/test_deposit_private_check.py --- a/swh/deposit/tests/api/test_deposit_private_check.py +++ b/swh/deposit/tests/api/test_deposit_private_check.py @@ -26,6 +26,7 @@ create_arborescence_archive, create_archive_with_archive, ) +from swh.deposit.utils import NAMESPACES PRIVATE_CHECK_DEPOSIT_NC = PRIVATE_CHECK_DEPOSIT + "-nc" @@ -204,9 +205,11 @@ # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(response.content) - deposit_status = response_content["swh:deposit_status"] + deposit_status = response_content.findtext( + "swh:deposit_status", namespaces=NAMESPACES + ) assert deposit_status == DEPOSIT_STATUS_DEPOSITED - deposit_id = int(response_content["swh:deposit_id"]) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert DEPOSIT_STATUS_DEPOSITED == deposit.status diff --git a/swh/deposit/tests/api/test_deposit_schedule.py b/swh/deposit/tests/api/test_deposit_schedule.py --- a/swh/deposit/tests/api/test_deposit_schedule.py +++ b/swh/deposit/tests/api/test_deposit_schedule.py @@ -5,7 +5,6 @@ import copy import datetime -from io import BytesIO from django.urls import reverse_lazy as reverse import pytest @@ -18,6 +17,7 @@ SE_IRI, ) from swh.deposit.parsers import parse_xml +from swh.deposit.utils import NAMESPACES @pytest.fixture() @@ -81,10 +81,12 @@ assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(BytesIO(response.content)) - actual_state = response_content["swh:deposit_status"] + response_content = parse_xml(response.content) + actual_state = response_content.findtext( + "swh:deposit_status", namespaces=NAMESPACES + ) assert actual_state == DEPOSIT_STATUS_DEPOSITED - deposit_id = int(response_content["swh:deposit_id"]) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) assert_task_for_deposit( swh_scheduler, deposit_id, timestamp_before_call, timestamp_after_call @@ -123,10 +125,14 @@ assert response.status_code == status.HTTP_200_OK - response_content = parse_xml(BytesIO(response.content)) - actual_state = response_content["swh:deposit_status"] + response_content = parse_xml(response.content) + actual_state = response_content.findtext( + "swh:deposit_status", namespaces=NAMESPACES + ) assert actual_state == DEPOSIT_STATUS_DEPOSITED - assert deposit.id == int(response_content["swh:deposit_id"]) + assert deposit.id == int( + response_content.findtext("swh:deposit_id", namespaces=NAMESPACES) + ) assert_task_for_deposit( swh_scheduler, deposit.id, timestamp_before_call, timestamp_after_call diff --git a/swh/deposit/tests/api/test_deposit_state.py b/swh/deposit/tests/api/test_deposit_state.py --- a/swh/deposit/tests/api/test_deposit_state.py +++ b/swh/deposit/tests/api/test_deposit_state.py @@ -4,7 +4,7 @@ # See top-level LICENSE file for more information -from io import BytesIO +from xml.etree import ElementTree from django.urls import reverse_lazy as reverse from rest_framework import status @@ -15,7 +15,7 @@ STATE_IRI, ) from swh.deposit.models import DEPOSIT_STATUS_DETAIL, DEPOSIT_STATUS_LOAD_SUCCESS -from swh.deposit.parsers import parse_xml +from swh.deposit.utils import NAMESPACES def test_post_deposit_with_status_check(authenticated_client, deposited_deposit): @@ -29,16 +29,25 @@ status_response = authenticated_client.get(status_url) assert status_response.status_code == status.HTTP_200_OK - r = parse_xml(BytesIO(status_response.content)) + r = ElementTree.fromstring(status_response.content) - assert int(r["swh:deposit_id"]) == deposit.id - assert r["swh:deposit_status"] == DEPOSIT_STATUS_DEPOSITED + assert int(r.findtext("swh:deposit_id", namespaces=NAMESPACES)) == deposit.id assert ( - r["swh:deposit_status_detail"] + r.findtext("swh:deposit_status", namespaces=NAMESPACES) + == DEPOSIT_STATUS_DEPOSITED + ) + assert ( + r.findtext("swh:deposit_status_detail", namespaces=NAMESPACES) == DEPOSIT_STATUS_DETAIL[DEPOSIT_STATUS_DEPOSITED] ) - assert r["swh:deposit_external_id"] == deposit.external_id - assert r["swh:deposit_origin_url"] == deposit.origin_url + assert ( + r.findtext("swh:deposit_external_id", namespaces=NAMESPACES) + == deposit.external_id + ) + assert ( + r.findtext("swh:deposit_origin_url", namespaces=NAMESPACES) + == deposit.origin_url + ) def test_status_unknown_deposit(authenticated_client, deposit_collection): @@ -74,12 +83,18 @@ # then assert status_response.status_code == status.HTTP_200_OK - r = parse_xml(BytesIO(status_response.content)) - assert int(r["swh:deposit_id"]) == deposit.id - assert r["swh:deposit_status"] == DEPOSIT_STATUS_REJECTED - assert r["swh:deposit_status_detail"] == "Deposit failed the checks" + r = ElementTree.fromstring(status_response.content) + assert int(r.findtext("swh:deposit_id", namespaces=NAMESPACES)) == deposit.id + assert ( + r.findtext("swh:deposit_status", namespaces=NAMESPACES) + == DEPOSIT_STATUS_REJECTED + ) + assert ( + r.findtext("swh:deposit_status_detail", namespaces=NAMESPACES) + == "Deposit failed the checks" + ) if deposit.swhid: - assert r["swh:deposit_swhid"] == deposit.swhid + assert r.findtext("swh:deposit_swhid", namespaces=NAMESPACES) == deposit.swhid def test_status_with_http_accept_header_should_not_break( @@ -113,15 +128,24 @@ # then assert status_response.status_code == status.HTTP_200_OK - r = parse_xml(BytesIO(status_response.content)) - assert int(r["swh:deposit_id"]) == deposit.id - assert r["swh:deposit_status"] == DEPOSIT_STATUS_LOAD_SUCCESS + r = ElementTree.fromstring(status_response.content) + assert int(r.findtext("swh:deposit_id", namespaces=NAMESPACES)) == deposit.id + assert ( + r.findtext("swh:deposit_status", namespaces=NAMESPACES) + == DEPOSIT_STATUS_LOAD_SUCCESS + ) assert ( - r["swh:deposit_status_detail"] + r.findtext("swh:deposit_status_detail", namespaces=NAMESPACES) == DEPOSIT_STATUS_DETAIL[DEPOSIT_STATUS_LOAD_SUCCESS] ) assert deposit.swhid is not None - assert r["swh:deposit_swh_id"] == deposit.swhid + assert r.findtext("swh:deposit_swh_id", namespaces=NAMESPACES) == deposit.swhid assert deposit.swhid_context is not None - assert r["swh:deposit_swh_id_context"] == deposit.swhid_context - assert r["swh:deposit_origin_url"] == deposit.origin_url + assert ( + r.findtext("swh:deposit_swh_id_context", namespaces=NAMESPACES) + == deposit.swhid_context + ) + assert ( + r.findtext("swh:deposit_origin_url", namespaces=NAMESPACES) + == deposit.origin_url + ) diff --git a/swh/deposit/tests/api/test_deposit_update_atom.py b/swh/deposit/tests/api/test_deposit_update_atom.py --- a/swh/deposit/tests/api/test_deposit_update_atom.py +++ b/swh/deposit/tests/api/test_deposit_update_atom.py @@ -3,8 +3,6 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from io import BytesIO - from django.urls import reverse_lazy as reverse import pytest from rest_framework import status @@ -21,6 +19,7 @@ from swh.deposit.models import Deposit, DepositCollection, DepositRequest from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import post_atom, put_atom +from swh.deposit.utils import NAMESPACES from swh.model.hashutil import hash_to_bytes from swh.model.model import ( MetadataAuthority, @@ -55,8 +54,8 @@ # then assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(BytesIO(response.content)) - deposit_id = int(response_content["swh:deposit_id"]) + response_content = parse_xml(response.content) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection @@ -69,12 +68,9 @@ atom_entry_data = atom_dataset["entry-only-create-origin"] % (origin_url) - for link in response_content["atom:link"]: - if link["@rel"] == "http://purl.org/net/sword/terms/add": - se_iri = link["@href"] - break - else: - assert False, f"missing SE-IRI from {response_content['link']}" + se_iri = response_content.find( + "atom:link[@rel='http://purl.org/net/sword/terms/add']", namespaces=NAMESPACES + ).attrib["href"] # when updating the first deposit post response = post_atom( @@ -84,8 +80,8 @@ # then assert response.status_code == status.HTTP_201_CREATED, response.content.decode() - response_content = parse_xml(BytesIO(response.content)) - deposit_id = int(response_content["swh:deposit_id"]) + response_content = parse_xml(response.content) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.collection == deposit_collection @@ -100,14 +96,12 @@ atom_entry_data1 = atom_dataset["entry-data1"] expected_meta = [ - {"metadata": parse_xml(atom_entry_data1), "raw_metadata": atom_entry_data1}, - {"metadata": parse_xml(atom_entry_data), "raw_metadata": atom_entry_data}, + atom_entry_data1, + atom_entry_data, ] for i, deposit_request in enumerate(deposit_requests): - actual_metadata = deposit_request.metadata - assert actual_metadata == expected_meta[i]["metadata"] - assert deposit_request.raw_metadata == expected_meta[i]["raw_metadata"] + assert deposit_request.raw_metadata == expected_meta[i] assert bool(deposit_request.archive) is False @@ -216,8 +210,8 @@ response = post_atom(authenticated_client, url, data=atom_dataset["entry-data1"],) assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) - assert ( - "Deposit 1000 does not exist" in response_content["sword:error"]["atom:summary"] + assert "Deposit 1000 does not exist" in response_content.findtext( + "atom:summary", namespaces=NAMESPACES ) @@ -238,7 +232,9 @@ response = post_atom(authenticated_client, url, data=atom_dataset["entry-data1"],) assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) - assert "Unknown collection name" in response_content["sword:error"]["atom:summary"] + assert "Unknown collection name" in response_content.findtext( + "atom:summary", namespaces=NAMESPACES + ) def test_replace_metadata_to_unknown_deposit( @@ -257,8 +253,8 @@ assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) assert ( - "Deposit %s does not exist" % unknown_deposit_id - == response_content["sword:error"]["atom:summary"] + response_content.findtext("atom:summary", namespaces=NAMESPACES) + == "Deposit %s does not exist" % unknown_deposit_id ) @@ -542,14 +538,11 @@ ) assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(BytesIO(response.content)) + response_content = parse_xml(response.content) - for link in response_content["atom:link"]: - if link["@rel"] == "edit": - edit_iri = link["@href"] - break - else: - assert False, response_content + edit_iri = response_content.find( + "atom:link[@rel='edit']", namespaces=NAMESPACES + ).attrib["href"] # when response = put_atom( @@ -581,14 +574,11 @@ ) assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(BytesIO(response.content)) - - for link in response_content["atom:link"]: - if link["@rel"] == "edit": - edit_iri = link["@href"] - break - else: - assert False, response_content + response_content = parse_xml(response.content) + + edit_iri = response_content.find( + "atom:link[@rel='edit']", namespaces=NAMESPACES + ).attrib["href"] # when response = put_atom( diff --git a/swh/deposit/tests/api/test_deposit_update_binary.py b/swh/deposit/tests/api/test_deposit_update_binary.py --- a/swh/deposit/tests/api/test_deposit_update_binary.py +++ b/swh/deposit/tests/api/test_deposit_update_binary.py @@ -23,6 +23,7 @@ put_archive, put_atom, ) +from swh.deposit.utils import NAMESPACES def test_post_deposit_binary_and_post_to_add_another_archive( @@ -48,8 +49,8 @@ # then assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(BytesIO(response.content)) - deposit_id = response_content["swh:deposit_id"] + response_content = parse_xml(response.content) + deposit_id = response_content.findtext("swh:deposit_id", namespaces=NAMESPACES) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == "partial" @@ -76,7 +77,7 @@ ) assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(BytesIO(response.content)) + response_content = parse_xml(response.content) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_DEPOSITED @@ -182,8 +183,8 @@ assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) assert ( - "Deposit %s does not exist" % unknown_deposit_id - == response_content["sword:error"]["atom:summary"] + response_content.findtext("atom:summary", namespaces=NAMESPACES) + == "Deposit %s does not exist" % unknown_deposit_id ) @@ -206,8 +207,8 @@ assert response.status_code == status.HTTP_404_NOT_FOUND response_content = parse_xml(response.content) assert ( - "Deposit %s does not exist" % unknown_deposit_id - == response_content["sword:error"]["atom:summary"] + response_content.findtext("atom:summary", namespaces=NAMESPACES) + == "Deposit %s does not exist" % unknown_deposit_id ) @@ -288,8 +289,8 @@ # then assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(BytesIO(response.content)) - deposit_id = response_content["swh:deposit_id"] + response_content = parse_xml(response.content) + deposit_id = response_content.findtext("swh:deposit_id", namespaces=NAMESPACES) deposit = Deposit.objects.get(pk=deposit_id) assert deposit.status == DEPOSIT_STATUS_DEPOSITED diff --git a/swh/deposit/tests/api/test_get_file.py b/swh/deposit/tests/api/test_get_file.py --- a/swh/deposit/tests/api/test_get_file.py +++ b/swh/deposit/tests/api/test_get_file.py @@ -11,6 +11,7 @@ from swh.deposit.config import CONT_FILE_IRI from swh.deposit.models import DEPOSIT_STATUS_DETAIL from swh.deposit.parsers import parse_xml +from swh.deposit.utils import NAMESPACES def test_api_deposit_content_nominal( @@ -21,18 +22,21 @@ """ for deposit in [complete_deposit, partial_deposit_only_metadata]: - expected_deposit = { - "swh:deposit_id": str(deposit.id), - "swh:deposit_status": deposit.status, - "swh:deposit_status_detail": DEPOSIT_STATUS_DETAIL[deposit.status], - } - url = reverse(CONT_FILE_IRI, args=[deposit.collection.name, deposit.id]) response = authenticated_client.get(url) assert response.status_code == status.HTTP_200_OK - actual_deposit = dict(parse_xml(response.content)) - del actual_deposit["swh:deposit_date"] - assert set(actual_deposit.items()) >= set(expected_deposit.items()) + actual_deposit = parse_xml(response.content) + assert actual_deposit.findtext("swh:deposit_id", namespaces=NAMESPACES) == str( + deposit.id + ) + assert ( + actual_deposit.findtext("swh:deposit_status", namespaces=NAMESPACES) + == deposit.status + ) + assert ( + actual_deposit.findtext("swh:deposit_status_detail", namespaces=NAMESPACES) + == DEPOSIT_STATUS_DETAIL[deposit.status] + ) def test_api_deposit_content_unknown( diff --git a/swh/deposit/tests/api/test_parsers.py b/swh/deposit/tests/api/test_parsers.py --- a/swh/deposit/tests/api/test_parsers.py +++ b/swh/deposit/tests/api/test_parsers.py @@ -1,12 +1,12 @@ -# Copyright (C) 2018-2020 The Software Heritage developers +# Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from collections import OrderedDict import io from swh.deposit.parsers import SWHXMLParser +from swh.deposit.utils import NAMESPACES def test_parsing_without_duplicates(): @@ -30,30 +30,23 @@ ) actual_result = SWHXMLParser().parse(xml_no_duplicate) - expected_dict = OrderedDict( - [ - ("atom:title", "Awesome Compiler"), - ( - "codemeta:license", - OrderedDict( - [ - ("codemeta:name", "GPL3.0"), - ("codemeta:url", "https://opensource.org/licenses/GPL-3.0"), - ] - ), - ), - ("codemeta:runtimePlatform", "Python3"), - ( - "codemeta:author", - OrderedDict( - [("codemeta:name", "author1"), ("codemeta:affiliation", "Inria")] - ), - ), - ("codemeta:programmingLanguage", "ocaml"), - ("codemeta:issueTracker", "http://issuetracker.com"), - ] + + assert ( + actual_result.findtext( + "codemeta:license/codemeta:name", + namespaces={"codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0"}, + ) + == "GPL3.0" + ) + assert ( + actual_result.findtext("codemeta:license/codemeta:name", namespaces=NAMESPACES) + == "GPL3.0" + ) + authors = actual_result.findall( + "codemeta:author/codemeta:name", namespaces=NAMESPACES ) - assert expected_dict == actual_result + assert len(authors) == 1 + assert authors[0].text == "author1" def test_parsing_with_duplicates(): @@ -88,42 +81,20 @@ actual_result = SWHXMLParser().parse(xml_with_duplicates) - expected_dict = OrderedDict( - [ - ("atom:title", "Another Compiler"), - ("codemeta:runtimePlatform", ["GNU/Linux", "Un*x"]), - ( - "codemeta:license", - [ - OrderedDict( - [ - ("codemeta:name", "GPL3.0"), - ("codemeta:url", "https://opensource.org/licenses/GPL-3.0"), - ] - ), - OrderedDict( - [("codemeta:name", "spdx"), ("codemeta:url", "http://spdx.org")] - ), - ], - ), - ( - "codemeta:author", - [ - OrderedDict( - [ - ("codemeta:name", "author1"), - ("codemeta:affiliation", "Inria"), - ] - ), - OrderedDict( - [ - ("codemeta:name", "author2"), - ("codemeta:affiliation", "Inria"), - ] - ), - ], - ), - ("codemeta:programmingLanguage", ["ocaml", "haskell", "python3"]), - ] + assert ( + actual_result.findtext( + "codemeta:license/codemeta:name", + namespaces={"codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0"}, + ) + == "GPL3.0" + ) + assert ( + actual_result.findtext("codemeta:license/codemeta:name", namespaces=NAMESPACES) + == "GPL3.0" + ) + authors = actual_result.findall( + "codemeta:author/codemeta:name", namespaces=NAMESPACES ) - assert expected_dict == actual_result + assert len(authors) == 2 + assert authors[0].text == "author1" + assert authors[1].text == "author2" diff --git a/swh/deposit/tests/cli/test_client.py b/swh/deposit/tests/cli/test_client.py --- a/swh/deposit/tests/cli/test_client.py +++ b/swh/deposit/tests/cli/test_client.py @@ -4,7 +4,6 @@ # See top-level LICENSE file for more information import ast -from collections import OrderedDict import contextlib import json import logging @@ -29,6 +28,7 @@ ServiceDocumentDepositClient, ) from swh.deposit.parsers import parse_xml +from swh.deposit.utils import NAMESPACES from swh.model.exceptions import ValidationError from ..conftest import TEST_USER @@ -183,22 +183,39 @@ metadata_provenance_url="meta-prov-url", ) - actual_metadata = dict(parse_xml(actual_metadata_xml)) - assert actual_metadata["atom:author"] == "deposit-client" - assert actual_metadata["atom:title"] == "project-name" - assert actual_metadata["atom:updated"] is not None - assert actual_metadata["codemeta:name"] == "project-name" - assert actual_metadata["codemeta:identifier"] == "external-id" - assert actual_metadata["codemeta:author"] == [ - OrderedDict([("codemeta:name", "some")]), - OrderedDict([("codemeta:name", "authors")]), - ] + actual_metadata = parse_xml(actual_metadata_xml) assert ( - actual_metadata["swh:deposit"]["swh:create_origin"]["swh:origin"]["@url"] + actual_metadata.findtext("atom:author", namespaces=NAMESPACES) + == "deposit-client" + ) + assert ( + actual_metadata.findtext("atom:title", namespaces=NAMESPACES) == "project-name" + ) + assert actual_metadata.findtext("atom:updated", namespaces=NAMESPACES) is not None + assert ( + actual_metadata.findtext("codemeta:name", namespaces=NAMESPACES) + == "project-name" + ) + assert ( + actual_metadata.findtext("codemeta:identifier", namespaces=NAMESPACES) + == "external-id" + ) + authors = actual_metadata.findall( + "codemeta:author/codemeta:name", namespaces=NAMESPACES + ) + assert len(authors) == 2 + assert authors[0].text == "some" + assert authors[1].text == "authors" + assert ( + actual_metadata.find( + "swh:deposit/swh:create_origin/swh:origin", namespaces=NAMESPACES + ).attrib["url"] == "origin-url" ) assert ( - actual_metadata["swh:deposit"]["swh:metadata-provenance"]["schema:url"] + actual_metadata.findtext( + "swh:deposit/swh:metadata-provenance/schema:url", namespaces=NAMESPACES + ) == "meta-prov-url" ) @@ -216,17 +233,27 @@ "deposit-client", "project-name", authors=["some", "authors"], ) - actual_metadata = dict(parse_xml(actual_metadata_xml)) - assert actual_metadata["atom:author"] == "deposit-client" - assert actual_metadata["atom:title"] == "project-name" - assert actual_metadata["atom:updated"] is not None - assert actual_metadata["codemeta:name"] == "project-name" - assert actual_metadata["codemeta:author"] == [ - OrderedDict([("codemeta:name", "some")]), - OrderedDict([("codemeta:name", "authors")]), - ] - assert actual_metadata.get("codemeta:identifier") is None - assert actual_metadata.get("swh:deposit") is None + actual_metadata = parse_xml(actual_metadata_xml) + assert ( + actual_metadata.findtext("atom:author", namespaces=NAMESPACES) + == "deposit-client" + ) + assert ( + actual_metadata.findtext("atom:title", namespaces=NAMESPACES) == "project-name" + ) + assert actual_metadata.findtext("atom:updated", namespaces=NAMESPACES) is not None + assert ( + actual_metadata.findtext("codemeta:name", namespaces=NAMESPACES) + == "project-name" + ) + authors = actual_metadata.findall( + "codemeta:author/codemeta:name", namespaces=NAMESPACES + ) + assert len(authors) == 2 + assert authors[0].text == "some" + assert authors[1].text == "authors" + assert actual_metadata.find("codemeta:identifier", namespaces=NAMESPACES) is None + assert actual_metadata.find("swh:deposit", namespaces=NAMESPACES) is None checks_ok, detail = check_metadata(ElementTree.fromstring(actual_metadata_xml)) @@ -273,15 +300,31 @@ } with open(metadata_path) as fd: - actual_metadata = dict(parse_xml(fd.read())) - assert actual_metadata["atom:author"] == TEST_USER["username"] - assert actual_metadata["codemeta:name"] == "test-project" - assert actual_metadata["atom:title"] == "test-project" - assert actual_metadata["atom:updated"] is not None - assert actual_metadata["codemeta:identifier"] == slug - assert actual_metadata["codemeta:author"] == OrderedDict( - [("codemeta:name", "Jane Doe")] + actual_metadata = parse_xml(fd.read()) + assert ( + actual_metadata.findtext("atom:author", namespaces=NAMESPACES) + == TEST_USER["username"] ) + assert ( + actual_metadata.findtext("codemeta:name", namespaces=NAMESPACES) + == "test-project" + ) + assert ( + actual_metadata.findtext("atom:title", namespaces=NAMESPACES) + == "test-project" + ) + assert ( + actual_metadata.findtext("atom:updated", namespaces=NAMESPACES) is not None + ) + assert ( + actual_metadata.findtext("codemeta:identifier", namespaces=NAMESPACES) + == slug + ) + authors = actual_metadata.findall( + "codemeta:author/codemeta:name", namespaces=NAMESPACES + ) + assert len(authors) == 1 + assert authors[0].text == "Jane Doe" count_warnings = 0 for (_, log_level, _) in caplog.record_tuples: @@ -329,22 +372,39 @@ } with open(metadata_path) as fd: - actual_metadata = dict(parse_xml(fd.read())) - assert actual_metadata["atom:author"] == TEST_USER["username"] - assert actual_metadata["codemeta:name"] == "test-project" - assert actual_metadata["atom:title"] == "test-project" - assert actual_metadata["atom:updated"] is not None + actual_metadata = parse_xml(fd.read()) + assert ( + actual_metadata.findtext("atom:author", namespaces=NAMESPACES) + == TEST_USER["username"] + ) + assert ( + actual_metadata.findtext("codemeta:name", namespaces=NAMESPACES) + == "test-project" + ) + assert ( + actual_metadata.findtext("atom:title", namespaces=NAMESPACES) + == "test-project" + ) + assert ( + actual_metadata.findtext("atom:updated", namespaces=NAMESPACES) is not None + ) assert ( - actual_metadata["swh:deposit"]["swh:create_origin"]["swh:origin"]["@url"] + actual_metadata.find( + "swh:deposit/swh:create_origin/swh:origin", namespaces=NAMESPACES + ).attrib["url"] == origin ) assert ( - actual_metadata["swh:deposit"]["swh:metadata-provenance"]["schema:url"] + actual_metadata.findtext( + "swh:deposit/swh:metadata-provenance/schema:url", namespaces=NAMESPACES + ) == "meta-prov-url" ) - assert actual_metadata["codemeta:author"] == OrderedDict( - [("codemeta:name", "Jane Doe")] + authors = actual_metadata.findall( + "codemeta:author/codemeta:name", namespaces=NAMESPACES ) + assert len(authors) == 1 + assert authors[0].text == "Jane Doe" count_warnings = 0 for (_, log_level, _) in caplog.record_tuples: @@ -569,7 +629,7 @@ with open(metadata_path) as fd: metadata_xml = fd.read() - actual_metadata = dict(parse_xml(metadata_xml)) + actual_metadata = parse_xml(metadata_xml) assert "codemeta:identifier" not in actual_metadata diff --git a/swh/deposit/tests/conftest.py b/swh/deposit/tests/conftest.py --- a/swh/deposit/tests/conftest.py +++ b/swh/deposit/tests/conftest.py @@ -9,6 +9,7 @@ import os import re from typing import TYPE_CHECKING, Dict, Mapping +from xml.etree import ElementTree from django.test.utils import setup_databases # type: ignore from django.urls import reverse_lazy as reverse @@ -34,12 +35,12 @@ SE_IRI, setup_django_for, ) -from swh.deposit.parsers import parse_xml from swh.deposit.tests.common import ( create_arborescence_archive, post_archive, post_atom, ) +from swh.deposit.utils import NAMESPACES from swh.model.hashutil import hash_to_bytes from swh.model.swhids import CoreSWHID, ObjectType, QualifiedSWHID from swh.scheduler import get_scheduler @@ -445,8 +446,10 @@ assert response.status_code == status.HTTP_201_CREATED, response.content.decode() from swh.deposit.models import Deposit - response_content = parse_xml(response.content) - deposit_id = response_content["swh:deposit_id"] + response_content = ElementTree.fromstring(response.content) + deposit_id = int( + response_content.findtext("swh:deposit_id", "", namespaces=NAMESPACES) + ) deposit = Deposit._default_manager.get(id=deposit_id) if deposit.status != deposit_status: @@ -562,8 +565,8 @@ assert response.status_code == status.HTTP_201_CREATED - response_content = parse_xml(response.content) - deposit_id = response_content["swh:deposit_id"] + response_content = ElementTree.fromstring(response.content) + deposit_id = int(response_content.findtext("swh:deposit_id", namespaces=NAMESPACES)) from swh.deposit.models import Deposit deposit = Deposit._default_manager.get(pk=deposit_id) diff --git a/swh/deposit/tests/test_client_module.py b/swh/deposit/tests/test_client_module.py --- a/swh/deposit/tests/test_client_module.py +++ b/swh/deposit/tests/test_client_module.py @@ -29,7 +29,7 @@ assert isinstance(result, dict) - collection = result["app:service"]["app:workspace"]["app:collection"] + collection = result["app:service"]["app:workspace"][0]["app:collection"] assert collection["sword:name"] == "test" diff --git a/swh/deposit/utils.py b/swh/deposit/utils.py --- a/swh/deposit/utils.py +++ b/swh/deposit/utils.py @@ -8,7 +8,6 @@ from xml.etree import ElementTree import iso8601 -import xmltodict from swh.model.exceptions import ValidationError from swh.model.model import TimestampWithTimezone @@ -28,19 +27,6 @@ } -def parse_xml(stream, encoding="utf-8"): - data = xmltodict.parse( - stream, - encoding=encoding, - namespaces={uri: prefix for (prefix, uri) in NAMESPACES.items()}, - process_namespaces=True, - dict_constructor=dict, - ) - if "atom:entry" in data: - data = data["atom:entry"] - return data - - def normalize_date(date): """Normalize date fields as expected by swh workers.