Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7123260
D7215.id26160.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
42 KB
Subscribers
None
D7215.id26160.diff
View Options
diff --git a/swh/deposit/api/checks.py b/swh/deposit/api/checks.py
--- a/swh/deposit/api/checks.py
+++ b/swh/deposit/api/checks.py
@@ -15,10 +15,11 @@
"""
from typing import Dict, Optional, Tuple
+from xml.etree import ElementTree
import iso8601
-from swh.deposit.utils import normalize_date, parse_swh_metadata_provenance
+from swh.deposit.utils import NAMESPACES, normalize_date, parse_swh_metadata_provenance
MANDATORY_FIELDS_MISSING = "Mandatory fields are missing"
INVALID_DATE_FORMAT = "Invalid date format"
@@ -27,7 +28,7 @@
METADATA_PROVENANCE_KEY = "swh:metadata-provenance"
-def check_metadata(metadata: Dict) -> Tuple[bool, Optional[Dict]]:
+def check_metadata(metadata: ElementTree.Element) -> Tuple[bool, Optional[Dict]]:
"""Check metadata for mandatory field presence and date format.
Args:
@@ -47,9 +48,9 @@
("atom:author", "codemeta:author"): False,
}
- for field, value in metadata.items():
- for possible_names in alternate_fields:
- if field in possible_names:
+ for possible_names in alternate_fields:
+ for possible_name in possible_names:
+ if metadata.find(possible_name, namespaces=NAMESPACES) is not None:
alternate_fields[possible_names] = True
continue
@@ -68,18 +69,17 @@
fields = []
- commit_date = metadata.get("codemeta:datePublished")
- author_date = metadata.get("codemeta:dateCreated")
-
- if commit_date:
+ for commit_date in metadata.findall(
+ "codemeta:datePublished", namespaces=NAMESPACES
+ ):
try:
- normalize_date(commit_date)
+ normalize_date(commit_date.text)
except iso8601.iso8601.ParseError:
fields.append("codemeta:datePublished")
- if author_date:
+ for author_date in metadata.findall("codemeta:dateCreated", namespaces=NAMESPACES):
try:
- normalize_date(author_date)
+ normalize_date(author_date.text)
except iso8601.iso8601.ParseError:
fields.append("codemeta:dateCreated")
diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py
--- a/swh/deposit/api/common.py
+++ b/swh/deposit/api/common.py
@@ -9,6 +9,7 @@
import json
from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union
import uuid
+from xml.etree import ElementTree
import attr
from django.core.files.uploadedfile import UploadedFile
@@ -27,7 +28,7 @@
from swh.deposit.api.converters import convert_status_detail
from swh.deposit.auth import HasDepositPermission, KeycloakBasicAuthentication
from swh.deposit.models import Deposit
-from swh.deposit.utils import compute_metadata_context
+from swh.deposit.utils import NAMESPACES, compute_metadata_context
from swh.model import hashutil
from swh.model.model import (
MetadataAuthority,
@@ -316,6 +317,8 @@
metadata = deposit_request_data.get(METADATA_KEY)
if metadata:
+ # TODO: remove non-raw metadata? we don't use these anymore except in
+ # manual queries to the deposit DB
raw_metadata = deposit_request_data[RAW_METADATA_KEY]
deposit_request = DepositRequest(
type=METADATA_TYPE,
@@ -498,14 +501,23 @@
archive=filehandler.name,
)
- def _read_metadata(self, metadata_stream) -> Tuple[bytes, Dict[str, Any]]:
- """Given a metadata stream, reads the metadata and returns both the
- parsed and the raw metadata.
+ def _read_metadata(
+ self, metadata_stream
+ ) -> Tuple[bytes, Dict[str, Any], ElementTree.Element]:
+ """
+ Given a metadata stream, reads the metadata and returns the metadata in three
+ forms:
+ * verbatim (as raw bytes), for archival in long-term storage
+ * parsed as a Python dict, for archival in postgresql's jsonb type
+ * parsed as ElementTree, to extract information immediately
"""
raw_metadata = metadata_stream.read()
- metadata = parse_xml(raw_metadata)
- return raw_metadata, metadata
+ metadata_dict = parse_xml(raw_metadata)
+ metadata_tree = ElementTree.fromstring(raw_metadata)
+ # TODO: remove metadata_dict? we don't use it anymore, except in manual
+ # queries to the deposit DB
+ return raw_metadata, metadata_dict, metadata_tree
def _multipart_upload(
self,
@@ -592,7 +604,9 @@
self._check_file_md5sum(filehandler, headers.content_md5sum)
try:
- raw_metadata, metadata = self._read_metadata(data["application/atom+xml"])
+ raw_metadata, metadata_dict, metadata_tree = self._read_metadata(
+ data["application/atom+xml"]
+ )
except ParserError:
raise DepositError(
PARSING_ERROR,
@@ -601,7 +615,7 @@
"Please ensure your metadata file is correctly formatted.",
)
- self._set_deposit_origin_from_metadata(deposit, metadata, headers)
+ self._set_deposit_origin_from_metadata(deposit, metadata_tree, headers)
# actual storage of data
self._deposit_put(
@@ -609,7 +623,7 @@
)
deposit_request_data = {
ARCHIVE_KEY: filehandler,
- METADATA_KEY: metadata,
+ METADATA_KEY: metadata_dict,
RAW_METADATA_KEY: raw_metadata,
}
self._deposit_request_put(
@@ -628,7 +642,8 @@
self,
deposit: Deposit,
swhid_reference: Union[str, QualifiedSWHID],
- metadata: Dict,
+ metadata_dict: Dict,
+ metadata_tree: ElementTree.Element,
raw_metadata: bytes,
deposit_origin: Optional[str] = None,
) -> Tuple[ExtendedSWHID, Deposit, DepositRequest]:
@@ -644,8 +659,10 @@
Args:
deposit: Deposit reference
swhid_reference: The swhid or the origin to attach metadata information to
- metadata: Full dict of metadata to check for validity (parsed out of
- raw_metadata)
+ metadata_dict: Full dict of metadata for storage in the deposit DB as jsonb
+ (parsed out of raw_metadata)
+ metadata_tree: Full element tree of metadata to check for validity
+ (parsed out of raw_metadata)
raw_metadata: The actual raw metadata to send in the storage metadata
deposit_origin: Optional deposit origin url to use if any (e.g. deposit
update scenario provides one)
@@ -658,7 +675,7 @@
Tuple of target swhid, deposit, and deposit request
"""
- metadata_ok, error_details = check_metadata(metadata)
+ metadata_ok, error_details = check_metadata(metadata_tree)
if not metadata_ok:
assert error_details, "Details should be set when a failure occurs"
raise DepositError(
@@ -675,7 +692,7 @@
# replace metadata within the deposit backend
deposit_request_data = {
- METADATA_KEY: metadata,
+ METADATA_KEY: metadata_dict,
RAW_METADATA_KEY: raw_metadata,
}
@@ -818,7 +835,9 @@
)
try:
- raw_metadata, metadata = self._read_metadata(metadata_stream)
+ raw_metadata, metadata_dict, metadata_tree = self._read_metadata(
+ metadata_stream
+ )
except ParserError:
raise DepositError(
BAD_REQUEST,
@@ -827,16 +846,16 @@
"Please ensure your metadata file is correctly formatted.",
)
- if metadata is None:
+ if metadata_dict is None:
raise DepositError(
BAD_REQUEST, empty_atom_entry_summary, empty_atom_entry_desc
)
- self._set_deposit_origin_from_metadata(deposit, metadata, headers)
+ self._set_deposit_origin_from_metadata(deposit, metadata_tree, headers)
# Determine if we are in the metadata-only deposit case
try:
- swhid_ref = parse_swh_reference(metadata)
+ swhid_ref = parse_swh_reference(metadata_tree)
except ValidationError as e:
raise DepositError(
PARSING_ERROR, "Invalid SWHID reference", str(e),
@@ -855,7 +874,7 @@
if swhid_ref is not None:
deposit.save() # We need a deposit id
target_swhid, depo, depo_request = self._store_metadata_deposit(
- deposit, swhid_ref, metadata, raw_metadata
+ deposit, swhid_ref, metadata_dict, metadata_tree, raw_metadata
)
deposit.status = DEPOSIT_STATUS_LOAD_SUCCESS
@@ -879,7 +898,7 @@
self._deposit_request_put(
deposit,
- {METADATA_KEY: metadata, RAW_METADATA_KEY: raw_metadata},
+ {METADATA_KEY: metadata_dict, RAW_METADATA_KEY: raw_metadata},
replace_metadata,
replace_archives,
)
@@ -892,10 +911,14 @@
)
def _set_deposit_origin_from_metadata(self, deposit, metadata, headers):
- create_origin = metadata.get("swh:deposit", {}).get("swh:create_origin")
- add_to_origin = metadata.get("swh:deposit", {}).get("swh:add_to_origin")
+ create_origin = metadata.find(
+ "swh:deposit/swh:create_origin/swh:origin", namespaces=NAMESPACES
+ )
+ add_to_origin = metadata.find(
+ "swh:deposit/swh:add_to_origin/swh:origin", namespaces=NAMESPACES
+ )
- if create_origin and add_to_origin:
+ if create_origin is not None and add_to_origin is not None:
raise DepositError(
BAD_REQUEST,
"<swh:create_origin> and <swh:add_to_origin> are mutually exclusive, "
@@ -903,13 +926,13 @@
"origin.",
)
- if create_origin:
- origin_url = create_origin["swh:origin"]["@url"]
+ if create_origin is not None:
+ origin_url = create_origin.attrib["url"]
check_client_origin(deposit.client, origin_url)
deposit.origin_url = origin_url
- if add_to_origin:
- origin_url = add_to_origin["swh:origin"]["@url"]
+ if add_to_origin is not None:
+ origin_url = add_to_origin.attrib["url"]
check_client_origin(deposit.client, origin_url)
deposit.parent = (
Deposit.objects.filter(
@@ -922,7 +945,10 @@
)
deposit.origin_url = origin_url
- if "atom:external_identifier" in metadata:
+ external_identifier_element = metadata.find(
+ "atom:external_identifier", namespaces=NAMESPACES
+ )
+ if external_identifier_element is not None:
# Deprecated tag.
# When clients stopped using it, this should raise an error
# unconditionally
@@ -934,7 +960,7 @@
"<swh:create_origin> and <swh:add_to_origin> from now on.",
)
- if headers.slug and metadata["atom:external_identifier"] != headers.slug:
+ if headers.slug and external_identifier_element.text != headers.slug:
raise DepositError(
BAD_REQUEST,
"The <external_identifier> tag and Slug header are deprecated, "
diff --git a/swh/deposit/api/edit.py b/swh/deposit/api/edit.py
--- a/swh/deposit/api/edit.py
+++ b/swh/deposit/api/edit.py
@@ -108,7 +108,9 @@
)
try:
- raw_metadata, metadata = self._read_metadata(request.data)
+ raw_metadata, metadata_dict, metadata_tree = self._read_metadata(
+ request.data
+ )
except ParserError:
raise DepositError(
BAD_REQUEST,
@@ -117,7 +119,7 @@
"Please ensure your metadata file is correctly formatted.",
)
- if not metadata:
+ if not metadata_dict:
raise DepositError(
BAD_REQUEST,
"Empty body request is not supported",
@@ -128,7 +130,8 @@
_, deposit, deposit_request = self._store_metadata_deposit(
deposit,
QualifiedSWHID.from_string(swhid),
- metadata,
+ metadata_dict,
+ metadata_tree,
raw_metadata,
deposit.origin_url,
)
diff --git a/swh/deposit/api/private/__init__.py b/swh/deposit/api/private/__init__.py
--- a/swh/deposit/api/private/__init__.py
+++ b/swh/deposit/api/private/__init__.py
@@ -3,7 +3,7 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from typing import Any, Dict, Optional, Tuple
+from typing import Optional
from rest_framework.permissions import AllowAny
from rest_framework.views import APIView
@@ -36,24 +36,20 @@
for deposit_request in deposit_requests:
yield deposit_request
- def _metadata_get(self, deposit: Deposit) -> Tuple[Dict[str, Any], Optional[bytes]]:
- """Given a deposit, retrieve all metadata requests into one Dict and returns both that
- aggregated metadata dict and the list of raw_metdadata.
+ def _metadata_get(self, deposit: Deposit) -> Optional[bytes]:
+ """Retrieve the last non-empty raw metadata object for that deposit, if any
Args:
deposit: The deposit instance to extract metadata from
- Returns:
- Tuple of last metadata dict and last raw_metadata
-
"""
for deposit_request in self._deposit_requests(
deposit, request_type=METADATA_TYPE
):
if deposit_request.raw_metadata is not None:
- return (deposit_request.metadata, deposit_request.raw_metadata)
+ return deposit_request.raw_metadata
- return ({}, None)
+ return None
class APIPrivateView(APIConfig, APIView):
diff --git a/swh/deposit/api/private/deposit_check.py b/swh/deposit/api/private/deposit_check.py
--- a/swh/deposit/api/private/deposit_check.py
+++ b/swh/deposit/api/private/deposit_check.py
@@ -8,6 +8,7 @@
from shutil import get_unpack_formats
import tarfile
from typing import Dict, Optional, Tuple
+from xml.etree import ElementTree
import zipfile
from rest_framework import status
@@ -151,7 +152,7 @@
Tuple (status, json response, content-type)
"""
- metadata, _ = self._metadata_get(deposit)
+ raw_metadata = self._metadata_get(deposit)
details_dict: Dict = {}
# will check each deposit's associated request (both of type
# archive and metadata) for errors
@@ -160,11 +161,18 @@
assert details is not None
details_dict.update(details)
- metadata_status_ok, details = check_metadata(metadata)
- # Ensure in case of error, we do have the rejection details
- assert metadata_status_ok or (not metadata_status_ok and details is not None)
- # we can have warnings even if checks are ok (e.g. missing suggested field)
- details_dict.update(details or {})
+ if raw_metadata is None:
+ metadata_status_ok = False
+ details_dict["metadata"] = [{"summary": "Missing Atom document"}]
+ else:
+ metadata_tree = ElementTree.fromstring(raw_metadata)
+ metadata_status_ok, details = check_metadata(metadata_tree)
+ # Ensure in case of error, we do have the rejection details
+ assert metadata_status_ok or (
+ not metadata_status_ok and details is not None
+ )
+ # we can have warnings even if checks are ok (e.g. missing suggested field)
+ details_dict.update(details or {})
deposit_status_ok = archives_status_ok and metadata_status_ok
# if any details_dict arose, the deposit is rejected
diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py
--- a/swh/deposit/api/private/deposit_read.py
+++ b/swh/deposit/api/private/deposit_read.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2020 The Software Heritage developers
+# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -7,12 +7,13 @@
import os
import shutil
import tempfile
-from typing import Any, Dict, Tuple
+from typing import Any, Dict, Optional, Tuple
+from xml.etree import ElementTree
from rest_framework import status
from swh.core import tarball
-from swh.deposit.utils import normalize_date
+from swh.deposit.utils import NAMESPACES, normalize_date
from swh.model.hashutil import hash_to_hex
from swh.model.model import MetadataAuthorityType
from swh.model.swhids import CoreSWHID
@@ -20,6 +21,7 @@
from . import APIPrivateView, DepositReadMixin
from ...config import ARCHIVE_TYPE, SWH_PERSON
from ...models import Deposit
+from ...utils import parse_xml
from ..common import APIGet
@@ -105,31 +107,33 @@
"""
- def _normalize_dates(self, deposit, metadata):
+ def _parse_dates(
+ self, deposit: Deposit, metadata: ElementTree.Element
+ ) -> Tuple[dict, dict]:
"""Normalize the date to use as a tuple of author date, committer date
from the incoming metadata.
- Args:
- deposit (Deposit): Deposit model representation
- metadata (Dict): Metadata dict representation
-
Returns:
Tuple of author date, committer date. Those dates are
swh normalized.
"""
- commit_date = metadata.get("codemeta:datePublished")
- author_date = metadata.get("codemeta:dateCreated")
-
- if author_date and commit_date:
- pass
- elif commit_date:
- author_date = commit_date
- elif author_date:
- commit_date = author_date
+ commit_date_elt = metadata.find("codemeta:datePublished", namespaces=NAMESPACES)
+ author_date_elt = metadata.find("codemeta:dateCreated", namespaces=NAMESPACES)
+
+ author_date: Any
+ commit_date: Any
+
+ if author_date_elt is None and commit_date_elt is None:
+ author_date = commit_date = deposit.complete_date
+ elif commit_date_elt is None:
+ author_date = commit_date = author_date_elt.text # type: ignore
+ elif author_date_elt is None:
+ author_date = commit_date = commit_date_elt.text
else:
- author_date = deposit.complete_date
- commit_date = deposit.complete_date
+ author_date = author_date_elt.text
+ commit_date = commit_date_elt.text
+
return (normalize_date(author_date), normalize_date(commit_date))
def metadata_read(self, deposit: Deposit) -> Dict[str, Any]:
@@ -158,8 +162,14 @@
(author_date, committer_date, etc...)
"""
- metadata, raw_metadata = self._metadata_get(deposit)
- author_date, commit_date = self._normalize_dates(deposit, metadata)
+ raw_metadata = self._metadata_get(deposit)
+ author_date: Optional[dict]
+ commit_date: Optional[dict]
+ if raw_metadata:
+ metadata_tree = ElementTree.fromstring(raw_metadata)
+ author_date, commit_date = self._parse_dates(deposit, metadata_tree)
+ else:
+ author_date = commit_date = None
if deposit.parent:
parent_swhid = deposit.parent.swhid
@@ -170,10 +180,15 @@
else:
parents = []
- release_notes = metadata.get("codemeta:releaseNotes")
- if isinstance(release_notes, list):
- release_notes = "\n\n".join(release_notes)
- if not release_notes:
+ release_notes_elements = metadata_tree.findall(
+ "codemeta:releaseNotes", namespaces=NAMESPACES
+ )
+ release_notes: Optional[str]
+ if release_notes_elements:
+ release_notes = "\n\n".join(
+ element.text for element in release_notes_elements if element.text
+ )
+ else:
release_notes = None
return {
@@ -186,7 +201,7 @@
},
"tool": self.tool,
"metadata_raw": raw_metadata,
- "metadata_dict": metadata,
+ "metadata_dict": parse_xml(raw_metadata),
"deposit": {
"id": deposit.id,
"client": deposit.client.username,
diff --git a/swh/deposit/cli/client.py b/swh/deposit/cli/client.py
--- a/swh/deposit/cli/client.py
+++ b/swh/deposit/cli/client.py
@@ -556,13 +556,15 @@
"""Deposit metadata only upload
"""
+ from xml.etree import ElementTree
+
from swh.deposit.client import PublicApiDepositClient
- from swh.deposit.utils import parse_swh_reference, parse_xml
+ from swh.deposit.utils import parse_swh_reference
# Parse to check for a swhid presence within the metadata file
with open(metadata_path, "r") as f:
metadata_raw = f.read()
- actual_swhid = parse_swh_reference(parse_xml(metadata_raw))
+ actual_swhid = parse_swh_reference(ElementTree.fromstring(metadata_raw))
if not actual_swhid:
raise InputError("A SWHID must be provided for a metadata-only deposit")
diff --git a/swh/deposit/tests/api/test_checks.py b/swh/deposit/tests/api/test_checks.py
--- a/swh/deposit/tests/api/test_checks.py
+++ b/swh/deposit/tests/api/test_checks.py
@@ -3,7 +3,9 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import textwrap
from typing import Any, Dict
+from xml.etree import ElementTree
import pytest
@@ -19,42 +21,77 @@
}
}
+XMLNS = """xmlns="http://www.w3.org/2005/Atom"
+ xmlns:swh="https://www.softwareheritage.org/schema/2018/deposit"
+ xmlns:codemeta="https://doi.org/10.5063/SCHEMA/CODEMETA-2.0"
+ xmlns:schema="http://schema.org/"
+"""
+
+PROVENANCE_XML = """
+ <swh:deposit>
+ <swh:metadata-provenance>
+ <schema:url>some-metadata-provenance-url</schema:url>
+ </swh:metadata-provenance>
+ </swh:deposit>
+"""
+
+_parameters1 = [
+ textwrap.dedent(metadata_ok)
+ for (metadata_ok,) in [
+ (
+ f"""
+ <entry {XMLNS}>
+ <url>something</url>
+ <external_identifier>something-else</external_identifier>
+ <name>foo</name>
+ <author>someone</author>
+ {PROVENANCE_XML}
+ </entry>
+ """,
+ ),
+ (
+ f"""
+ <entry {XMLNS}>
+ <url>something</url>
+ <external_identifier>something-else</external_identifier>
+ <name>foo</name>
+ <author>no one</author>
+ {PROVENANCE_XML}
+ </entry>
+ """,
+ ),
+ (
+ f"""
+ <entry {XMLNS}>
+ <url>some url</url>
+ <codemeta:name>bar</codemeta:name>
+ <codemeta:author>no one</codemeta:author>
+ {PROVENANCE_XML}
+ </entry>
+ """,
+ ),
+ (
+ f"""
+ <entry {XMLNS}>
+ <url>some url</url>
+ <external_identifier>some id</external_identifier>
+ <name>nar</name>
+ <author>no one</author>
+ <codemeta:datePublished>2020-12-21</codemeta:datePublished>
+ <codemeta:dateCreated>2020-12-21</codemeta:dateCreated>
+ {PROVENANCE_XML}
+ </entry>
+ """,
+ ),
+ ]
+]
+
@pytest.mark.parametrize(
- "metadata_ok",
- [
- {
- "atom:url": "something",
- "atom:external_identifier": "something-else",
- "atom:name": "foo",
- "atom:author": "someone",
- **METADATA_PROVENANCE_DICT,
- },
- {
- "atom:url": "some url",
- "atom:external_identifier": "some id",
- "atom:title": "bar",
- "atom:author": "no one",
- **METADATA_PROVENANCE_DICT,
- },
- {
- "atom:url": "some url",
- "codemeta:name": "bar",
- "codemeta:author": "no one",
- **METADATA_PROVENANCE_DICT,
- },
- {
- "atom:url": "some url",
- "atom:external_identifier": "some id",
- "atom:title": "bar",
- "atom:author": "no one",
- "codemeta:datePublished": "2020-12-21",
- "codemeta:dateCreated": "2020-12-21",
- },
- ],
+ "metadata_ok", _parameters1,
)
def test_api_checks_check_metadata_ok(metadata_ok, swh_checks_deposit):
- actual_check, detail = check_metadata(metadata_ok)
+ actual_check, detail = check_metadata(ElementTree.fromstring(metadata_ok))
assert actual_check is True, f"Unexpected result: {detail}"
if "swh:deposit" in metadata_ok:
# no missing suggested field
@@ -71,117 +108,143 @@
}
-@pytest.mark.parametrize(
- "metadata_ko,expected_summary",
- [
+_parameters2 = [
+ (textwrap.dedent(metadata_ko), expected_summary)
+ for (metadata_ko, expected_summary) in [
(
- {
- "atom:url": "something",
- "atom:external_identifier": "something-else",
- "atom:author": "someone",
- **METADATA_PROVENANCE_DICT,
- },
+ f"""
+ <entry {XMLNS}>
+ <url>something</url>
+ <external_identifier>something-else</external_identifier>
+ <author>someone</author>
+ {PROVENANCE_XML}
+ </entry>
+ """,
{
"summary": "Mandatory fields are missing",
"fields": ["atom:name or atom:title or codemeta:name"],
},
),
(
- {
- "atom:url": "something",
- "atom:external_identifier": "something-else",
- "atom:title": "foobar",
- **METADATA_PROVENANCE_DICT,
- },
+ f"""
+ <entry {XMLNS}>
+ <url>something</url>
+ <external_identifier>something-else</external_identifier>
+ <title>foobar</title>
+ {PROVENANCE_XML}
+ </entry>
+ """,
{
"summary": "Mandatory fields are missing",
"fields": ["atom:author or codemeta:author"],
},
),
(
- {
- "atom:url": "something",
- "atom:external_identifier": "something-else",
- "codemeta:title": "bar",
- "atom:author": "someone",
- **METADATA_PROVENANCE_DICT,
- },
+ f"""
+ <entry {XMLNS}>
+ <url>something</url>
+ <external_identifier>something-else</external_identifier>
+ <codemeta:title>bar</codemeta:title>
+ <author>someone</author>
+ {PROVENANCE_XML}
+ </entry>
+ """,
{
"summary": "Mandatory fields are missing",
"fields": ["atom:name or atom:title or codemeta:name"],
},
),
(
- {
- "atom:url": "something",
- "atom:external_identifier": "something-else",
- "atom:title": "foobar",
- "author": "foo",
- **METADATA_PROVENANCE_DICT,
- },
+ f"""
+ <entry xmlns:atom="http://www.w3.org/2005/Atom"
+ xmlns:swh="https://www.softwareheritage.org/schema/2018/deposit"
+ xmlns:codemeta="https://doi.org/10.5063/SCHEMA/CODEMETA-2.0"
+ xmlns:schema="http://schema.org/">
+ <atom:url>something</atom:url>
+ <atom:external_identifier>something-else</atom:external_identifier>
+ <atom:title>foobar</atom:title>
+ <author>foo</author>
+ {PROVENANCE_XML}
+ </entry>
+ """,
{
"summary": "Mandatory fields are missing",
"fields": ["atom:author or codemeta:author"],
},
),
(
- {
- "atom:url": "something",
- "atom:external_identifier": "something-else",
- "atom:title": "foobar",
- "atom:authorblahblah": "foo",
- **METADATA_PROVENANCE_DICT,
- },
+ f"""
+ <entry {XMLNS}>
+ <url>something</url>
+ <external_identifier>something-else</external_identifier>
+ <title>bar</title>
+ <authorblahblah>someone</authorblahblah>
+ {PROVENANCE_XML}
+ </entry>
+ """,
{
"summary": "Mandatory fields are missing",
"fields": ["atom:author or codemeta:author"],
},
),
(
+ f"""
+ <entry {XMLNS}>
+ <url>something</url>
+ <external_identifier>something-else</external_identifier>
+ <title>bar</title>
+ <author>someone</author>
+ <codemeta:datePublished>2020-aa-21</codemeta:datePublished>
+ <codemeta:dateCreated>2020-12-bb</codemeta:dateCreated>
+ {PROVENANCE_XML}
+ </entry>
+ """,
{
- "atom:url": "something",
- "atom:external_identifier": "something-else",
- "atom:author": "someone",
- **METADATA_PROVENANCE_DICT,
- },
- {
- "summary": "Mandatory fields are missing",
- "fields": ["atom:name or atom:title or codemeta:name"],
+ "summary": "Invalid date format",
+ "fields": ["codemeta:datePublished", "codemeta:dateCreated"],
},
),
- ],
-)
+ ]
+]
+
+
+@pytest.mark.parametrize("metadata_ko,expected_summary", _parameters2)
def test_api_checks_check_metadata_ko(
metadata_ko, expected_summary, swh_checks_deposit
):
- actual_check, error_detail = check_metadata(metadata_ko)
+ actual_check, error_detail = check_metadata(ElementTree.fromstring(metadata_ko))
assert actual_check is False
assert error_detail == {"metadata": [expected_summary]}
-@pytest.mark.parametrize(
- "metadata_ko,expected_invalid_summary",
- [
+_parameters3 = [
+ (textwrap.dedent(metadata_ko), expected_summary)
+ for (metadata_ko, expected_summary) in [
(
- {
- "atom:url": "some url",
- "atom:external_identifier": "some id",
- "atom:title": "bar",
- "atom:author": "no one",
- "codemeta:datePublished": "2020-aa-21",
- "codemeta:dateCreated": "2020-12-bb",
- },
+ f"""
+ <entry {XMLNS}>
+ <url>some url</url>
+ <external_identifier>someid</external_identifier>
+ <title>bar</title>
+ <author>no one</author>
+ <codemeta:datePublished>2020-aa-21</codemeta:datePublished>
+ <codemeta:dateCreated>2020-12-bb</codemeta:dateCreated>
+ </entry>
+ """,
{
"summary": "Invalid date format",
"fields": ["codemeta:datePublished", "codemeta:dateCreated"],
},
),
- ],
-)
+ ]
+]
+
+
+@pytest.mark.parametrize("metadata_ko,expected_invalid_summary", _parameters3)
def test_api_checks_check_metadata_fields_ko_and_missing_suggested_fields(
metadata_ko, expected_invalid_summary, swh_checks_deposit
):
- actual_check, error_detail = check_metadata(metadata_ko)
+ actual_check, error_detail = check_metadata(ElementTree.fromstring(metadata_ko))
assert actual_check is False
assert error_detail == {
"metadata": [expected_invalid_summary]
diff --git a/swh/deposit/tests/api/test_collection_add_to_origin.py b/swh/deposit/tests/api/test_collection_add_to_origin.py
--- a/swh/deposit/tests/api/test_collection_add_to_origin.py
+++ b/swh/deposit/tests/api/test_collection_add_to_origin.py
@@ -39,7 +39,7 @@
data=atom_dataset["entry-data-with-add-to-origin"] % origin_url,
)
- assert response.status_code == status.HTTP_201_CREATED
+ assert response.status_code == status.HTTP_201_CREATED, response.content.decode()
response_content = parse_xml(BytesIO(response.content))
deposit_id = response_content["swh:deposit_id"]
@@ -105,7 +105,7 @@
data=atom_dataset["entry-data0"] % origin_url,
)
- assert response.status_code == status.HTTP_403_FORBIDDEN
+ assert response.status_code == status.HTTP_403_FORBIDDEN, response.content.decode()
assert b"must start with" in response.content
diff --git a/swh/deposit/tests/api/test_collection_post_multipart.py b/swh/deposit/tests/api/test_collection_post_multipart.py
--- a/swh/deposit/tests/api/test_collection_post_multipart.py
+++ b/swh/deposit/tests/api/test_collection_post_multipart.py
@@ -42,7 +42,6 @@
HTTP_IN_PROGRESS="false",
)
- print(response.content.decode())
assert response.status_code == status.HTTP_201_CREATED
response_content = parse_xml(BytesIO(response.content))
deposit_id = response_content["swh:deposit_id"]
diff --git a/swh/deposit/tests/api/test_deposit_private_check.py b/swh/deposit/tests/api/test_deposit_private_check.py
--- a/swh/deposit/tests/api/test_deposit_private_check.py
+++ b/swh/deposit/tests/api/test_deposit_private_check.py
@@ -7,11 +7,7 @@
import pytest
from rest_framework import status
-from swh.deposit.api.checks import (
- MANDATORY_FIELDS_MISSING,
- METADATA_PROVENANCE_KEY,
- SUGGESTED_FIELDS_MISSING,
-)
+from swh.deposit.api.checks import METADATA_PROVENANCE_KEY, SUGGESTED_FIELDS_MISSING
from swh.deposit.api.private.deposit_check import (
MANDATORY_ARCHIVE_INVALID,
MANDATORY_ARCHIVE_MISSING,
@@ -142,18 +138,9 @@
assert len(details["archive"]) == 1
assert details["archive"][0]["summary"] == MANDATORY_ARCHIVE_UNSUPPORTED
# metadata check failure
- assert len(details["metadata"]) == 2
+ assert len(details["metadata"]) == 1
mandatory = details["metadata"][0]
- assert mandatory["summary"] == MANDATORY_FIELDS_MISSING
- assert set(mandatory["fields"]) == set(
- [
- "atom:author or codemeta:author",
- "atom:name or atom:title or codemeta:name",
- ]
- )
- suggested = details["metadata"][1]
- assert suggested["summary"] == SUGGESTED_FIELDS_MISSING
- assert set(suggested["fields"]) == set([METADATA_PROVENANCE_KEY])
+ assert mandatory["summary"] == "Missing Atom document"
deposit = Deposit.objects.get(pk=deposit.id)
assert deposit.status == DEPOSIT_STATUS_REJECTED
diff --git a/swh/deposit/tests/cli/test_client.py b/swh/deposit/tests/cli/test_client.py
--- a/swh/deposit/tests/cli/test_client.py
+++ b/swh/deposit/tests/cli/test_client.py
@@ -10,6 +10,7 @@
import logging
import os
from unittest.mock import MagicMock
+from xml.etree import ElementTree
import pytest
import yaml
@@ -196,7 +197,7 @@
== "origin-url"
)
- checks_ok, detail = check_metadata(actual_metadata)
+ checks_ok, detail = check_metadata(ElementTree.fromstring(actual_metadata_xml))
assert checks_ok is True
# FIXME: Open the flag to suggest the provenance metadata url in the cli
@@ -227,7 +228,7 @@
assert actual_metadata.get("codemeta:identifier") is None
assert actual_metadata.get("swh:deposit") is None
- checks_ok, detail = check_metadata(actual_metadata)
+ checks_ok, detail = check_metadata(ElementTree.fromstring(actual_metadata_xml))
assert checks_ok is True
# FIXME: Open the flag to suggest the provenance metadata url in the cli
diff --git a/swh/deposit/tests/test_utils.py b/swh/deposit/tests/test_utils.py
--- a/swh/deposit/tests/test_utils.py
+++ b/swh/deposit/tests/test_utils.py
@@ -3,10 +3,11 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from xml.etree import ElementTree
+
import pytest
from swh.deposit import utils
-from swh.deposit.parsers import parse_xml
from swh.model.exceptions import ValidationError
from swh.model.swhids import CoreSWHID, QualifiedSWHID
@@ -103,7 +104,7 @@
def test_parse_swh_reference_origin(xml_with_origin_reference):
url = "https://url"
xml_data = xml_with_origin_reference.format(url=url)
- metadata = parse_xml(xml_data)
+ metadata = ElementTree.fromstring(xml_data)
actual_origin = utils.parse_swh_reference(metadata)
assert actual_origin == url
@@ -133,7 +134,7 @@
)
def test_parse_swh_reference_empty(xml_swh_deposit_template, xml_ref):
xml_body = xml_swh_deposit_template.format(swh_deposit=xml_ref)
- metadata = utils.parse_xml(xml_body)
+ metadata = ElementTree.fromstring(xml_body)
assert utils.parse_swh_reference(metadata) is None
@@ -156,7 +157,7 @@
)
def test_parse_swh_reference_swhid(swhid, xml_with_swhid):
xml_data = xml_with_swhid.format(swhid=swhid)
- metadata = utils.parse_xml(xml_data)
+ metadata = ElementTree.fromstring(xml_data)
actual_swhid = utils.parse_swh_reference(metadata)
assert actual_swhid is not None
@@ -182,7 +183,7 @@
"""
xml_invalid_swhid = xml_with_swhid.format(swhid=invalid_swhid)
- metadata = utils.parse_xml(xml_invalid_swhid)
+ metadata = ElementTree.fromstring(xml_invalid_swhid)
with pytest.raises(ValidationError):
utils.parse_swh_reference(metadata)
@@ -198,7 +199,7 @@
)
def test_parse_swh_metatada_provenance_empty(xml_swh_deposit_template, xml_ref):
xml_body = xml_swh_deposit_template.format(swh_deposit=xml_ref)
- metadata = utils.parse_xml(xml_body)
+ metadata = ElementTree.fromstring(xml_body)
assert utils.parse_swh_metadata_provenance(metadata) is None
@@ -210,7 +211,7 @@
def test_parse_swh_metadata_provenance2(xml_with_metadata_provenance):
xml_data = xml_with_metadata_provenance.format(url="https://url.org/metadata/url")
- metadata = utils.parse_xml(xml_data)
+ metadata = ElementTree.fromstring(xml_data)
actual_url = utils.parse_swh_metadata_provenance(metadata)
diff --git a/swh/deposit/utils.py b/swh/deposit/utils.py
--- a/swh/deposit/utils.py
+++ b/swh/deposit/utils.py
@@ -5,6 +5,7 @@
import logging
from typing import Any, Dict, Optional, Union
+from xml.etree import ElementTree
import iso8601
import xmltodict
@@ -16,21 +17,22 @@
logger = logging.getLogger(__name__)
-def parse_xml(stream, encoding="utf-8"):
- namespaces = {
- "http://www.w3.org/2005/Atom": "atom",
- "http://www.w3.org/2007/app": "app",
- "http://purl.org/dc/terms/": "dc",
- "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0": "codemeta",
- "http://purl.org/net/sword/terms/": "sword",
- "https://www.softwareheritage.org/schema/2018/deposit": "swh",
- "http://schema.org/": "schema",
- }
+NAMESPACES = {
+ "atom": "http://www.w3.org/2005/Atom",
+ "app": "http://www.w3.org/2007/app",
+ "dc": "http://purl.org/dc/terms/",
+ "codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0",
+ "sword": "http://purl.org/net/sword/terms/",
+ "swh": "https://www.softwareheritage.org/schema/2018/deposit",
+ "schema": "http://schema.org/",
+}
+
+def parse_xml(stream, encoding="utf-8"):
data = xmltodict.parse(
stream,
encoding=encoding,
- namespaces=namespaces,
+ namespaces={uri: prefix for (prefix, uri) in NAMESPACES.items()},
process_namespaces=True,
dict_constructor=dict,
)
@@ -98,7 +100,7 @@
def parse_swh_metadata_provenance(
- metadata: Dict,
+ metadata: ElementTree.Element,
) -> Optional[Union[QualifiedSWHID, str]]:
"""Parse swh metadata-provenance within the metadata dict reference if found, None
otherwise.
@@ -121,19 +123,17 @@
Either the metadata provenance url if any or None otherwise
"""
-
- swh_deposit = metadata.get("swh:deposit")
- if not swh_deposit:
- return None
-
- swh_metadata_provenance = swh_deposit.get("swh:metadata-provenance")
- if not swh_metadata_provenance:
- return None
-
- return swh_metadata_provenance.get("schema:url")
+ url_element = metadata.find(
+ "swh:deposit/swh:metadata-provenance/schema:url", namespaces=NAMESPACES
+ )
+ if url_element is not None:
+ return url_element.text
+ return None
-def parse_swh_reference(metadata: Dict,) -> Optional[Union[QualifiedSWHID, str]]:
+def parse_swh_reference(
+ metadata: ElementTree.Element,
+) -> Optional[Union[QualifiedSWHID, str]]:
"""Parse swh reference within the metadata dict (or origin) reference if found,
None otherwise.
@@ -155,7 +155,7 @@
</swh:deposit>
Args:
- metadata: result of parsing an Atom document with :func:`parse_xml`
+ metadata: result of parsing an Atom document
Raises:
ValidationError in case the swhid referenced (if any) is invalid
@@ -164,27 +164,21 @@
Either swhid or origin reference if any. None otherwise.
""" # noqa
- swh_deposit = metadata.get("swh:deposit")
- if not swh_deposit:
- return None
-
- swh_reference = swh_deposit.get("swh:reference")
- if not swh_reference:
- return None
-
- swh_origin = swh_reference.get("swh:origin")
- if swh_origin:
- url = swh_origin.get("@url")
- if url:
- return url
+ ref_origin = metadata.find(
+ "swh:deposit/swh:reference/swh:origin[@url]", namespaces=NAMESPACES
+ )
+ if ref_origin is not None:
+ return ref_origin.attrib["url"]
- swh_object = swh_reference.get("swh:object")
- if not swh_object:
+ ref_object = metadata.find(
+ "swh:deposit/swh:reference/swh:object[@swhid]", namespaces=NAMESPACES
+ )
+ if ref_object is None:
return None
-
- swhid = swh_object.get("@swhid")
+ swhid = ref_object.attrib["swhid"]
if not swhid:
return None
+
swhid_reference = QualifiedSWHID.from_string(swhid)
if swhid_reference.qualifiers():
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Wed, Dec 18, 7:07 AM (2 d, 1 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3229896
Attached To
D7215: server: Use xml.etree.ElementTree instead of nested dicts internally
Event Timeline
Log In to Comment