Page MenuHomeSoftware Heritage

D7215.id26160.diff
No OneTemporary

D7215.id26160.diff

diff --git a/swh/deposit/api/checks.py b/swh/deposit/api/checks.py
--- a/swh/deposit/api/checks.py
+++ b/swh/deposit/api/checks.py
@@ -15,10 +15,11 @@
"""
from typing import Dict, Optional, Tuple
+from xml.etree import ElementTree
import iso8601
-from swh.deposit.utils import normalize_date, parse_swh_metadata_provenance
+from swh.deposit.utils import NAMESPACES, normalize_date, parse_swh_metadata_provenance
MANDATORY_FIELDS_MISSING = "Mandatory fields are missing"
INVALID_DATE_FORMAT = "Invalid date format"
@@ -27,7 +28,7 @@
METADATA_PROVENANCE_KEY = "swh:metadata-provenance"
-def check_metadata(metadata: Dict) -> Tuple[bool, Optional[Dict]]:
+def check_metadata(metadata: ElementTree.Element) -> Tuple[bool, Optional[Dict]]:
"""Check metadata for mandatory field presence and date format.
Args:
@@ -47,9 +48,9 @@
("atom:author", "codemeta:author"): False,
}
- for field, value in metadata.items():
- for possible_names in alternate_fields:
- if field in possible_names:
+ for possible_names in alternate_fields:
+ for possible_name in possible_names:
+ if metadata.find(possible_name, namespaces=NAMESPACES) is not None:
alternate_fields[possible_names] = True
continue
@@ -68,18 +69,17 @@
fields = []
- commit_date = metadata.get("codemeta:datePublished")
- author_date = metadata.get("codemeta:dateCreated")
-
- if commit_date:
+ for commit_date in metadata.findall(
+ "codemeta:datePublished", namespaces=NAMESPACES
+ ):
try:
- normalize_date(commit_date)
+ normalize_date(commit_date.text)
except iso8601.iso8601.ParseError:
fields.append("codemeta:datePublished")
- if author_date:
+ for author_date in metadata.findall("codemeta:dateCreated", namespaces=NAMESPACES):
try:
- normalize_date(author_date)
+ normalize_date(author_date.text)
except iso8601.iso8601.ParseError:
fields.append("codemeta:dateCreated")
diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py
--- a/swh/deposit/api/common.py
+++ b/swh/deposit/api/common.py
@@ -9,6 +9,7 @@
import json
from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union
import uuid
+from xml.etree import ElementTree
import attr
from django.core.files.uploadedfile import UploadedFile
@@ -27,7 +28,7 @@
from swh.deposit.api.converters import convert_status_detail
from swh.deposit.auth import HasDepositPermission, KeycloakBasicAuthentication
from swh.deposit.models import Deposit
-from swh.deposit.utils import compute_metadata_context
+from swh.deposit.utils import NAMESPACES, compute_metadata_context
from swh.model import hashutil
from swh.model.model import (
MetadataAuthority,
@@ -316,6 +317,8 @@
metadata = deposit_request_data.get(METADATA_KEY)
if metadata:
+ # TODO: remove non-raw metadata? we don't use these anymore except in
+ # manual queries to the deposit DB
raw_metadata = deposit_request_data[RAW_METADATA_KEY]
deposit_request = DepositRequest(
type=METADATA_TYPE,
@@ -498,14 +501,23 @@
archive=filehandler.name,
)
- def _read_metadata(self, metadata_stream) -> Tuple[bytes, Dict[str, Any]]:
- """Given a metadata stream, reads the metadata and returns both the
- parsed and the raw metadata.
+ def _read_metadata(
+ self, metadata_stream
+ ) -> Tuple[bytes, Dict[str, Any], ElementTree.Element]:
+ """
+ Given a metadata stream, reads the metadata and returns the metadata in three
+ forms:
+ * verbatim (as raw bytes), for archival in long-term storage
+ * parsed as a Python dict, for archival in postgresql's jsonb type
+ * parsed as ElementTree, to extract information immediately
"""
raw_metadata = metadata_stream.read()
- metadata = parse_xml(raw_metadata)
- return raw_metadata, metadata
+ metadata_dict = parse_xml(raw_metadata)
+ metadata_tree = ElementTree.fromstring(raw_metadata)
+ # TODO: remove metadata_dict? we don't use it anymore, except in manual
+ # queries to the deposit DB
+ return raw_metadata, metadata_dict, metadata_tree
def _multipart_upload(
self,
@@ -592,7 +604,9 @@
self._check_file_md5sum(filehandler, headers.content_md5sum)
try:
- raw_metadata, metadata = self._read_metadata(data["application/atom+xml"])
+ raw_metadata, metadata_dict, metadata_tree = self._read_metadata(
+ data["application/atom+xml"]
+ )
except ParserError:
raise DepositError(
PARSING_ERROR,
@@ -601,7 +615,7 @@
"Please ensure your metadata file is correctly formatted.",
)
- self._set_deposit_origin_from_metadata(deposit, metadata, headers)
+ self._set_deposit_origin_from_metadata(deposit, metadata_tree, headers)
# actual storage of data
self._deposit_put(
@@ -609,7 +623,7 @@
)
deposit_request_data = {
ARCHIVE_KEY: filehandler,
- METADATA_KEY: metadata,
+ METADATA_KEY: metadata_dict,
RAW_METADATA_KEY: raw_metadata,
}
self._deposit_request_put(
@@ -628,7 +642,8 @@
self,
deposit: Deposit,
swhid_reference: Union[str, QualifiedSWHID],
- metadata: Dict,
+ metadata_dict: Dict,
+ metadata_tree: ElementTree.Element,
raw_metadata: bytes,
deposit_origin: Optional[str] = None,
) -> Tuple[ExtendedSWHID, Deposit, DepositRequest]:
@@ -644,8 +659,10 @@
Args:
deposit: Deposit reference
swhid_reference: The swhid or the origin to attach metadata information to
- metadata: Full dict of metadata to check for validity (parsed out of
- raw_metadata)
+ metadata_dict: Full dict of metadata for storage in the deposit DB as jsonb
+ (parsed out of raw_metadata)
+ metadata_tree: Full element tree of metadata to check for validity
+ (parsed out of raw_metadata)
raw_metadata: The actual raw metadata to send in the storage metadata
deposit_origin: Optional deposit origin url to use if any (e.g. deposit
update scenario provides one)
@@ -658,7 +675,7 @@
Tuple of target swhid, deposit, and deposit request
"""
- metadata_ok, error_details = check_metadata(metadata)
+ metadata_ok, error_details = check_metadata(metadata_tree)
if not metadata_ok:
assert error_details, "Details should be set when a failure occurs"
raise DepositError(
@@ -675,7 +692,7 @@
# replace metadata within the deposit backend
deposit_request_data = {
- METADATA_KEY: metadata,
+ METADATA_KEY: metadata_dict,
RAW_METADATA_KEY: raw_metadata,
}
@@ -818,7 +835,9 @@
)
try:
- raw_metadata, metadata = self._read_metadata(metadata_stream)
+ raw_metadata, metadata_dict, metadata_tree = self._read_metadata(
+ metadata_stream
+ )
except ParserError:
raise DepositError(
BAD_REQUEST,
@@ -827,16 +846,16 @@
"Please ensure your metadata file is correctly formatted.",
)
- if metadata is None:
+ if metadata_dict is None:
raise DepositError(
BAD_REQUEST, empty_atom_entry_summary, empty_atom_entry_desc
)
- self._set_deposit_origin_from_metadata(deposit, metadata, headers)
+ self._set_deposit_origin_from_metadata(deposit, metadata_tree, headers)
# Determine if we are in the metadata-only deposit case
try:
- swhid_ref = parse_swh_reference(metadata)
+ swhid_ref = parse_swh_reference(metadata_tree)
except ValidationError as e:
raise DepositError(
PARSING_ERROR, "Invalid SWHID reference", str(e),
@@ -855,7 +874,7 @@
if swhid_ref is not None:
deposit.save() # We need a deposit id
target_swhid, depo, depo_request = self._store_metadata_deposit(
- deposit, swhid_ref, metadata, raw_metadata
+ deposit, swhid_ref, metadata_dict, metadata_tree, raw_metadata
)
deposit.status = DEPOSIT_STATUS_LOAD_SUCCESS
@@ -879,7 +898,7 @@
self._deposit_request_put(
deposit,
- {METADATA_KEY: metadata, RAW_METADATA_KEY: raw_metadata},
+ {METADATA_KEY: metadata_dict, RAW_METADATA_KEY: raw_metadata},
replace_metadata,
replace_archives,
)
@@ -892,10 +911,14 @@
)
def _set_deposit_origin_from_metadata(self, deposit, metadata, headers):
- create_origin = metadata.get("swh:deposit", {}).get("swh:create_origin")
- add_to_origin = metadata.get("swh:deposit", {}).get("swh:add_to_origin")
+ create_origin = metadata.find(
+ "swh:deposit/swh:create_origin/swh:origin", namespaces=NAMESPACES
+ )
+ add_to_origin = metadata.find(
+ "swh:deposit/swh:add_to_origin/swh:origin", namespaces=NAMESPACES
+ )
- if create_origin and add_to_origin:
+ if create_origin is not None and add_to_origin is not None:
raise DepositError(
BAD_REQUEST,
"<swh:create_origin> and <swh:add_to_origin> are mutually exclusive, "
@@ -903,13 +926,13 @@
"origin.",
)
- if create_origin:
- origin_url = create_origin["swh:origin"]["@url"]
+ if create_origin is not None:
+ origin_url = create_origin.attrib["url"]
check_client_origin(deposit.client, origin_url)
deposit.origin_url = origin_url
- if add_to_origin:
- origin_url = add_to_origin["swh:origin"]["@url"]
+ if add_to_origin is not None:
+ origin_url = add_to_origin.attrib["url"]
check_client_origin(deposit.client, origin_url)
deposit.parent = (
Deposit.objects.filter(
@@ -922,7 +945,10 @@
)
deposit.origin_url = origin_url
- if "atom:external_identifier" in metadata:
+ external_identifier_element = metadata.find(
+ "atom:external_identifier", namespaces=NAMESPACES
+ )
+ if external_identifier_element is not None:
# Deprecated tag.
# When clients stopped using it, this should raise an error
# unconditionally
@@ -934,7 +960,7 @@
"<swh:create_origin> and <swh:add_to_origin> from now on.",
)
- if headers.slug and metadata["atom:external_identifier"] != headers.slug:
+ if headers.slug and external_identifier_element.text != headers.slug:
raise DepositError(
BAD_REQUEST,
"The <external_identifier> tag and Slug header are deprecated, "
diff --git a/swh/deposit/api/edit.py b/swh/deposit/api/edit.py
--- a/swh/deposit/api/edit.py
+++ b/swh/deposit/api/edit.py
@@ -108,7 +108,9 @@
)
try:
- raw_metadata, metadata = self._read_metadata(request.data)
+ raw_metadata, metadata_dict, metadata_tree = self._read_metadata(
+ request.data
+ )
except ParserError:
raise DepositError(
BAD_REQUEST,
@@ -117,7 +119,7 @@
"Please ensure your metadata file is correctly formatted.",
)
- if not metadata:
+ if not metadata_dict:
raise DepositError(
BAD_REQUEST,
"Empty body request is not supported",
@@ -128,7 +130,8 @@
_, deposit, deposit_request = self._store_metadata_deposit(
deposit,
QualifiedSWHID.from_string(swhid),
- metadata,
+ metadata_dict,
+ metadata_tree,
raw_metadata,
deposit.origin_url,
)
diff --git a/swh/deposit/api/private/__init__.py b/swh/deposit/api/private/__init__.py
--- a/swh/deposit/api/private/__init__.py
+++ b/swh/deposit/api/private/__init__.py
@@ -3,7 +3,7 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from typing import Any, Dict, Optional, Tuple
+from typing import Optional
from rest_framework.permissions import AllowAny
from rest_framework.views import APIView
@@ -36,24 +36,20 @@
for deposit_request in deposit_requests:
yield deposit_request
- def _metadata_get(self, deposit: Deposit) -> Tuple[Dict[str, Any], Optional[bytes]]:
- """Given a deposit, retrieve all metadata requests into one Dict and returns both that
- aggregated metadata dict and the list of raw_metdadata.
+ def _metadata_get(self, deposit: Deposit) -> Optional[bytes]:
+ """Retrieve the last non-empty raw metadata object for that deposit, if any
Args:
deposit: The deposit instance to extract metadata from
- Returns:
- Tuple of last metadata dict and last raw_metadata
-
"""
for deposit_request in self._deposit_requests(
deposit, request_type=METADATA_TYPE
):
if deposit_request.raw_metadata is not None:
- return (deposit_request.metadata, deposit_request.raw_metadata)
+ return deposit_request.raw_metadata
- return ({}, None)
+ return None
class APIPrivateView(APIConfig, APIView):
diff --git a/swh/deposit/api/private/deposit_check.py b/swh/deposit/api/private/deposit_check.py
--- a/swh/deposit/api/private/deposit_check.py
+++ b/swh/deposit/api/private/deposit_check.py
@@ -8,6 +8,7 @@
from shutil import get_unpack_formats
import tarfile
from typing import Dict, Optional, Tuple
+from xml.etree import ElementTree
import zipfile
from rest_framework import status
@@ -151,7 +152,7 @@
Tuple (status, json response, content-type)
"""
- metadata, _ = self._metadata_get(deposit)
+ raw_metadata = self._metadata_get(deposit)
details_dict: Dict = {}
# will check each deposit's associated request (both of type
# archive and metadata) for errors
@@ -160,11 +161,18 @@
assert details is not None
details_dict.update(details)
- metadata_status_ok, details = check_metadata(metadata)
- # Ensure in case of error, we do have the rejection details
- assert metadata_status_ok or (not metadata_status_ok and details is not None)
- # we can have warnings even if checks are ok (e.g. missing suggested field)
- details_dict.update(details or {})
+ if raw_metadata is None:
+ metadata_status_ok = False
+ details_dict["metadata"] = [{"summary": "Missing Atom document"}]
+ else:
+ metadata_tree = ElementTree.fromstring(raw_metadata)
+ metadata_status_ok, details = check_metadata(metadata_tree)
+ # Ensure in case of error, we do have the rejection details
+ assert metadata_status_ok or (
+ not metadata_status_ok and details is not None
+ )
+ # we can have warnings even if checks are ok (e.g. missing suggested field)
+ details_dict.update(details or {})
deposit_status_ok = archives_status_ok and metadata_status_ok
# if any details_dict arose, the deposit is rejected
diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py
--- a/swh/deposit/api/private/deposit_read.py
+++ b/swh/deposit/api/private/deposit_read.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2020 The Software Heritage developers
+# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -7,12 +7,13 @@
import os
import shutil
import tempfile
-from typing import Any, Dict, Tuple
+from typing import Any, Dict, Optional, Tuple
+from xml.etree import ElementTree
from rest_framework import status
from swh.core import tarball
-from swh.deposit.utils import normalize_date
+from swh.deposit.utils import NAMESPACES, normalize_date
from swh.model.hashutil import hash_to_hex
from swh.model.model import MetadataAuthorityType
from swh.model.swhids import CoreSWHID
@@ -20,6 +21,7 @@
from . import APIPrivateView, DepositReadMixin
from ...config import ARCHIVE_TYPE, SWH_PERSON
from ...models import Deposit
+from ...utils import parse_xml
from ..common import APIGet
@@ -105,31 +107,33 @@
"""
- def _normalize_dates(self, deposit, metadata):
+ def _parse_dates(
+ self, deposit: Deposit, metadata: ElementTree.Element
+ ) -> Tuple[dict, dict]:
"""Normalize the date to use as a tuple of author date, committer date
from the incoming metadata.
- Args:
- deposit (Deposit): Deposit model representation
- metadata (Dict): Metadata dict representation
-
Returns:
Tuple of author date, committer date. Those dates are
swh normalized.
"""
- commit_date = metadata.get("codemeta:datePublished")
- author_date = metadata.get("codemeta:dateCreated")
-
- if author_date and commit_date:
- pass
- elif commit_date:
- author_date = commit_date
- elif author_date:
- commit_date = author_date
+ commit_date_elt = metadata.find("codemeta:datePublished", namespaces=NAMESPACES)
+ author_date_elt = metadata.find("codemeta:dateCreated", namespaces=NAMESPACES)
+
+ author_date: Any
+ commit_date: Any
+
+ if author_date_elt is None and commit_date_elt is None:
+ author_date = commit_date = deposit.complete_date
+ elif commit_date_elt is None:
+ author_date = commit_date = author_date_elt.text # type: ignore
+ elif author_date_elt is None:
+ author_date = commit_date = commit_date_elt.text
else:
- author_date = deposit.complete_date
- commit_date = deposit.complete_date
+ author_date = author_date_elt.text
+ commit_date = commit_date_elt.text
+
return (normalize_date(author_date), normalize_date(commit_date))
def metadata_read(self, deposit: Deposit) -> Dict[str, Any]:
@@ -158,8 +162,14 @@
(author_date, committer_date, etc...)
"""
- metadata, raw_metadata = self._metadata_get(deposit)
- author_date, commit_date = self._normalize_dates(deposit, metadata)
+ raw_metadata = self._metadata_get(deposit)
+ author_date: Optional[dict]
+ commit_date: Optional[dict]
+ if raw_metadata:
+ metadata_tree = ElementTree.fromstring(raw_metadata)
+ author_date, commit_date = self._parse_dates(deposit, metadata_tree)
+ else:
+ author_date = commit_date = None
if deposit.parent:
parent_swhid = deposit.parent.swhid
@@ -170,10 +180,15 @@
else:
parents = []
- release_notes = metadata.get("codemeta:releaseNotes")
- if isinstance(release_notes, list):
- release_notes = "\n\n".join(release_notes)
- if not release_notes:
+ release_notes_elements = metadata_tree.findall(
+ "codemeta:releaseNotes", namespaces=NAMESPACES
+ )
+ release_notes: Optional[str]
+ if release_notes_elements:
+ release_notes = "\n\n".join(
+ element.text for element in release_notes_elements if element.text
+ )
+ else:
release_notes = None
return {
@@ -186,7 +201,7 @@
},
"tool": self.tool,
"metadata_raw": raw_metadata,
- "metadata_dict": metadata,
+ "metadata_dict": parse_xml(raw_metadata),
"deposit": {
"id": deposit.id,
"client": deposit.client.username,
diff --git a/swh/deposit/cli/client.py b/swh/deposit/cli/client.py
--- a/swh/deposit/cli/client.py
+++ b/swh/deposit/cli/client.py
@@ -556,13 +556,15 @@
"""Deposit metadata only upload
"""
+ from xml.etree import ElementTree
+
from swh.deposit.client import PublicApiDepositClient
- from swh.deposit.utils import parse_swh_reference, parse_xml
+ from swh.deposit.utils import parse_swh_reference
# Parse to check for a swhid presence within the metadata file
with open(metadata_path, "r") as f:
metadata_raw = f.read()
- actual_swhid = parse_swh_reference(parse_xml(metadata_raw))
+ actual_swhid = parse_swh_reference(ElementTree.fromstring(metadata_raw))
if not actual_swhid:
raise InputError("A SWHID must be provided for a metadata-only deposit")
diff --git a/swh/deposit/tests/api/test_checks.py b/swh/deposit/tests/api/test_checks.py
--- a/swh/deposit/tests/api/test_checks.py
+++ b/swh/deposit/tests/api/test_checks.py
@@ -3,7 +3,9 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import textwrap
from typing import Any, Dict
+from xml.etree import ElementTree
import pytest
@@ -19,42 +21,77 @@
}
}
+XMLNS = """xmlns="http://www.w3.org/2005/Atom"
+ xmlns:swh="https://www.softwareheritage.org/schema/2018/deposit"
+ xmlns:codemeta="https://doi.org/10.5063/SCHEMA/CODEMETA-2.0"
+ xmlns:schema="http://schema.org/"
+"""
+
+PROVENANCE_XML = """
+ <swh:deposit>
+ <swh:metadata-provenance>
+ <schema:url>some-metadata-provenance-url</schema:url>
+ </swh:metadata-provenance>
+ </swh:deposit>
+"""
+
+_parameters1 = [
+ textwrap.dedent(metadata_ok)
+ for (metadata_ok,) in [
+ (
+ f"""
+ <entry {XMLNS}>
+ <url>something</url>
+ <external_identifier>something-else</external_identifier>
+ <name>foo</name>
+ <author>someone</author>
+ {PROVENANCE_XML}
+ </entry>
+ """,
+ ),
+ (
+ f"""
+ <entry {XMLNS}>
+ <url>something</url>
+ <external_identifier>something-else</external_identifier>
+ <name>foo</name>
+ <author>no one</author>
+ {PROVENANCE_XML}
+ </entry>
+ """,
+ ),
+ (
+ f"""
+ <entry {XMLNS}>
+ <url>some url</url>
+ <codemeta:name>bar</codemeta:name>
+ <codemeta:author>no one</codemeta:author>
+ {PROVENANCE_XML}
+ </entry>
+ """,
+ ),
+ (
+ f"""
+ <entry {XMLNS}>
+ <url>some url</url>
+ <external_identifier>some id</external_identifier>
+ <name>nar</name>
+ <author>no one</author>
+ <codemeta:datePublished>2020-12-21</codemeta:datePublished>
+ <codemeta:dateCreated>2020-12-21</codemeta:dateCreated>
+ {PROVENANCE_XML}
+ </entry>
+ """,
+ ),
+ ]
+]
+
@pytest.mark.parametrize(
- "metadata_ok",
- [
- {
- "atom:url": "something",
- "atom:external_identifier": "something-else",
- "atom:name": "foo",
- "atom:author": "someone",
- **METADATA_PROVENANCE_DICT,
- },
- {
- "atom:url": "some url",
- "atom:external_identifier": "some id",
- "atom:title": "bar",
- "atom:author": "no one",
- **METADATA_PROVENANCE_DICT,
- },
- {
- "atom:url": "some url",
- "codemeta:name": "bar",
- "codemeta:author": "no one",
- **METADATA_PROVENANCE_DICT,
- },
- {
- "atom:url": "some url",
- "atom:external_identifier": "some id",
- "atom:title": "bar",
- "atom:author": "no one",
- "codemeta:datePublished": "2020-12-21",
- "codemeta:dateCreated": "2020-12-21",
- },
- ],
+ "metadata_ok", _parameters1,
)
def test_api_checks_check_metadata_ok(metadata_ok, swh_checks_deposit):
- actual_check, detail = check_metadata(metadata_ok)
+ actual_check, detail = check_metadata(ElementTree.fromstring(metadata_ok))
assert actual_check is True, f"Unexpected result: {detail}"
if "swh:deposit" in metadata_ok:
# no missing suggested field
@@ -71,117 +108,143 @@
}
-@pytest.mark.parametrize(
- "metadata_ko,expected_summary",
- [
+_parameters2 = [
+ (textwrap.dedent(metadata_ko), expected_summary)
+ for (metadata_ko, expected_summary) in [
(
- {
- "atom:url": "something",
- "atom:external_identifier": "something-else",
- "atom:author": "someone",
- **METADATA_PROVENANCE_DICT,
- },
+ f"""
+ <entry {XMLNS}>
+ <url>something</url>
+ <external_identifier>something-else</external_identifier>
+ <author>someone</author>
+ {PROVENANCE_XML}
+ </entry>
+ """,
{
"summary": "Mandatory fields are missing",
"fields": ["atom:name or atom:title or codemeta:name"],
},
),
(
- {
- "atom:url": "something",
- "atom:external_identifier": "something-else",
- "atom:title": "foobar",
- **METADATA_PROVENANCE_DICT,
- },
+ f"""
+ <entry {XMLNS}>
+ <url>something</url>
+ <external_identifier>something-else</external_identifier>
+ <title>foobar</title>
+ {PROVENANCE_XML}
+ </entry>
+ """,
{
"summary": "Mandatory fields are missing",
"fields": ["atom:author or codemeta:author"],
},
),
(
- {
- "atom:url": "something",
- "atom:external_identifier": "something-else",
- "codemeta:title": "bar",
- "atom:author": "someone",
- **METADATA_PROVENANCE_DICT,
- },
+ f"""
+ <entry {XMLNS}>
+ <url>something</url>
+ <external_identifier>something-else</external_identifier>
+ <codemeta:title>bar</codemeta:title>
+ <author>someone</author>
+ {PROVENANCE_XML}
+ </entry>
+ """,
{
"summary": "Mandatory fields are missing",
"fields": ["atom:name or atom:title or codemeta:name"],
},
),
(
- {
- "atom:url": "something",
- "atom:external_identifier": "something-else",
- "atom:title": "foobar",
- "author": "foo",
- **METADATA_PROVENANCE_DICT,
- },
+ f"""
+ <entry xmlns:atom="http://www.w3.org/2005/Atom"
+ xmlns:swh="https://www.softwareheritage.org/schema/2018/deposit"
+ xmlns:codemeta="https://doi.org/10.5063/SCHEMA/CODEMETA-2.0"
+ xmlns:schema="http://schema.org/">
+ <atom:url>something</atom:url>
+ <atom:external_identifier>something-else</atom:external_identifier>
+ <atom:title>foobar</atom:title>
+ <author>foo</author>
+ {PROVENANCE_XML}
+ </entry>
+ """,
{
"summary": "Mandatory fields are missing",
"fields": ["atom:author or codemeta:author"],
},
),
(
- {
- "atom:url": "something",
- "atom:external_identifier": "something-else",
- "atom:title": "foobar",
- "atom:authorblahblah": "foo",
- **METADATA_PROVENANCE_DICT,
- },
+ f"""
+ <entry {XMLNS}>
+ <url>something</url>
+ <external_identifier>something-else</external_identifier>
+ <title>bar</title>
+ <authorblahblah>someone</authorblahblah>
+ {PROVENANCE_XML}
+ </entry>
+ """,
{
"summary": "Mandatory fields are missing",
"fields": ["atom:author or codemeta:author"],
},
),
(
+ f"""
+ <entry {XMLNS}>
+ <url>something</url>
+ <external_identifier>something-else</external_identifier>
+ <title>bar</title>
+ <author>someone</author>
+ <codemeta:datePublished>2020-aa-21</codemeta:datePublished>
+ <codemeta:dateCreated>2020-12-bb</codemeta:dateCreated>
+ {PROVENANCE_XML}
+ </entry>
+ """,
{
- "atom:url": "something",
- "atom:external_identifier": "something-else",
- "atom:author": "someone",
- **METADATA_PROVENANCE_DICT,
- },
- {
- "summary": "Mandatory fields are missing",
- "fields": ["atom:name or atom:title or codemeta:name"],
+ "summary": "Invalid date format",
+ "fields": ["codemeta:datePublished", "codemeta:dateCreated"],
},
),
- ],
-)
+ ]
+]
+
+
+@pytest.mark.parametrize("metadata_ko,expected_summary", _parameters2)
def test_api_checks_check_metadata_ko(
metadata_ko, expected_summary, swh_checks_deposit
):
- actual_check, error_detail = check_metadata(metadata_ko)
+ actual_check, error_detail = check_metadata(ElementTree.fromstring(metadata_ko))
assert actual_check is False
assert error_detail == {"metadata": [expected_summary]}
-@pytest.mark.parametrize(
- "metadata_ko,expected_invalid_summary",
- [
+_parameters3 = [
+ (textwrap.dedent(metadata_ko), expected_summary)
+ for (metadata_ko, expected_summary) in [
(
- {
- "atom:url": "some url",
- "atom:external_identifier": "some id",
- "atom:title": "bar",
- "atom:author": "no one",
- "codemeta:datePublished": "2020-aa-21",
- "codemeta:dateCreated": "2020-12-bb",
- },
+ f"""
+ <entry {XMLNS}>
+ <url>some url</url>
+ <external_identifier>someid</external_identifier>
+ <title>bar</title>
+ <author>no one</author>
+ <codemeta:datePublished>2020-aa-21</codemeta:datePublished>
+ <codemeta:dateCreated>2020-12-bb</codemeta:dateCreated>
+ </entry>
+ """,
{
"summary": "Invalid date format",
"fields": ["codemeta:datePublished", "codemeta:dateCreated"],
},
),
- ],
-)
+ ]
+]
+
+
+@pytest.mark.parametrize("metadata_ko,expected_invalid_summary", _parameters3)
def test_api_checks_check_metadata_fields_ko_and_missing_suggested_fields(
metadata_ko, expected_invalid_summary, swh_checks_deposit
):
- actual_check, error_detail = check_metadata(metadata_ko)
+ actual_check, error_detail = check_metadata(ElementTree.fromstring(metadata_ko))
assert actual_check is False
assert error_detail == {
"metadata": [expected_invalid_summary]
diff --git a/swh/deposit/tests/api/test_collection_add_to_origin.py b/swh/deposit/tests/api/test_collection_add_to_origin.py
--- a/swh/deposit/tests/api/test_collection_add_to_origin.py
+++ b/swh/deposit/tests/api/test_collection_add_to_origin.py
@@ -39,7 +39,7 @@
data=atom_dataset["entry-data-with-add-to-origin"] % origin_url,
)
- assert response.status_code == status.HTTP_201_CREATED
+ assert response.status_code == status.HTTP_201_CREATED, response.content.decode()
response_content = parse_xml(BytesIO(response.content))
deposit_id = response_content["swh:deposit_id"]
@@ -105,7 +105,7 @@
data=atom_dataset["entry-data0"] % origin_url,
)
- assert response.status_code == status.HTTP_403_FORBIDDEN
+ assert response.status_code == status.HTTP_403_FORBIDDEN, response.content.decode()
assert b"must start with" in response.content
diff --git a/swh/deposit/tests/api/test_collection_post_multipart.py b/swh/deposit/tests/api/test_collection_post_multipart.py
--- a/swh/deposit/tests/api/test_collection_post_multipart.py
+++ b/swh/deposit/tests/api/test_collection_post_multipart.py
@@ -42,7 +42,6 @@
HTTP_IN_PROGRESS="false",
)
- print(response.content.decode())
assert response.status_code == status.HTTP_201_CREATED
response_content = parse_xml(BytesIO(response.content))
deposit_id = response_content["swh:deposit_id"]
diff --git a/swh/deposit/tests/api/test_deposit_private_check.py b/swh/deposit/tests/api/test_deposit_private_check.py
--- a/swh/deposit/tests/api/test_deposit_private_check.py
+++ b/swh/deposit/tests/api/test_deposit_private_check.py
@@ -7,11 +7,7 @@
import pytest
from rest_framework import status
-from swh.deposit.api.checks import (
- MANDATORY_FIELDS_MISSING,
- METADATA_PROVENANCE_KEY,
- SUGGESTED_FIELDS_MISSING,
-)
+from swh.deposit.api.checks import METADATA_PROVENANCE_KEY, SUGGESTED_FIELDS_MISSING
from swh.deposit.api.private.deposit_check import (
MANDATORY_ARCHIVE_INVALID,
MANDATORY_ARCHIVE_MISSING,
@@ -142,18 +138,9 @@
assert len(details["archive"]) == 1
assert details["archive"][0]["summary"] == MANDATORY_ARCHIVE_UNSUPPORTED
# metadata check failure
- assert len(details["metadata"]) == 2
+ assert len(details["metadata"]) == 1
mandatory = details["metadata"][0]
- assert mandatory["summary"] == MANDATORY_FIELDS_MISSING
- assert set(mandatory["fields"]) == set(
- [
- "atom:author or codemeta:author",
- "atom:name or atom:title or codemeta:name",
- ]
- )
- suggested = details["metadata"][1]
- assert suggested["summary"] == SUGGESTED_FIELDS_MISSING
- assert set(suggested["fields"]) == set([METADATA_PROVENANCE_KEY])
+ assert mandatory["summary"] == "Missing Atom document"
deposit = Deposit.objects.get(pk=deposit.id)
assert deposit.status == DEPOSIT_STATUS_REJECTED
diff --git a/swh/deposit/tests/cli/test_client.py b/swh/deposit/tests/cli/test_client.py
--- a/swh/deposit/tests/cli/test_client.py
+++ b/swh/deposit/tests/cli/test_client.py
@@ -10,6 +10,7 @@
import logging
import os
from unittest.mock import MagicMock
+from xml.etree import ElementTree
import pytest
import yaml
@@ -196,7 +197,7 @@
== "origin-url"
)
- checks_ok, detail = check_metadata(actual_metadata)
+ checks_ok, detail = check_metadata(ElementTree.fromstring(actual_metadata_xml))
assert checks_ok is True
# FIXME: Open the flag to suggest the provenance metadata url in the cli
@@ -227,7 +228,7 @@
assert actual_metadata.get("codemeta:identifier") is None
assert actual_metadata.get("swh:deposit") is None
- checks_ok, detail = check_metadata(actual_metadata)
+ checks_ok, detail = check_metadata(ElementTree.fromstring(actual_metadata_xml))
assert checks_ok is True
# FIXME: Open the flag to suggest the provenance metadata url in the cli
diff --git a/swh/deposit/tests/test_utils.py b/swh/deposit/tests/test_utils.py
--- a/swh/deposit/tests/test_utils.py
+++ b/swh/deposit/tests/test_utils.py
@@ -3,10 +3,11 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from xml.etree import ElementTree
+
import pytest
from swh.deposit import utils
-from swh.deposit.parsers import parse_xml
from swh.model.exceptions import ValidationError
from swh.model.swhids import CoreSWHID, QualifiedSWHID
@@ -103,7 +104,7 @@
def test_parse_swh_reference_origin(xml_with_origin_reference):
url = "https://url"
xml_data = xml_with_origin_reference.format(url=url)
- metadata = parse_xml(xml_data)
+ metadata = ElementTree.fromstring(xml_data)
actual_origin = utils.parse_swh_reference(metadata)
assert actual_origin == url
@@ -133,7 +134,7 @@
)
def test_parse_swh_reference_empty(xml_swh_deposit_template, xml_ref):
xml_body = xml_swh_deposit_template.format(swh_deposit=xml_ref)
- metadata = utils.parse_xml(xml_body)
+ metadata = ElementTree.fromstring(xml_body)
assert utils.parse_swh_reference(metadata) is None
@@ -156,7 +157,7 @@
)
def test_parse_swh_reference_swhid(swhid, xml_with_swhid):
xml_data = xml_with_swhid.format(swhid=swhid)
- metadata = utils.parse_xml(xml_data)
+ metadata = ElementTree.fromstring(xml_data)
actual_swhid = utils.parse_swh_reference(metadata)
assert actual_swhid is not None
@@ -182,7 +183,7 @@
"""
xml_invalid_swhid = xml_with_swhid.format(swhid=invalid_swhid)
- metadata = utils.parse_xml(xml_invalid_swhid)
+ metadata = ElementTree.fromstring(xml_invalid_swhid)
with pytest.raises(ValidationError):
utils.parse_swh_reference(metadata)
@@ -198,7 +199,7 @@
)
def test_parse_swh_metatada_provenance_empty(xml_swh_deposit_template, xml_ref):
xml_body = xml_swh_deposit_template.format(swh_deposit=xml_ref)
- metadata = utils.parse_xml(xml_body)
+ metadata = ElementTree.fromstring(xml_body)
assert utils.parse_swh_metadata_provenance(metadata) is None
@@ -210,7 +211,7 @@
def test_parse_swh_metadata_provenance2(xml_with_metadata_provenance):
xml_data = xml_with_metadata_provenance.format(url="https://url.org/metadata/url")
- metadata = utils.parse_xml(xml_data)
+ metadata = ElementTree.fromstring(xml_data)
actual_url = utils.parse_swh_metadata_provenance(metadata)
diff --git a/swh/deposit/utils.py b/swh/deposit/utils.py
--- a/swh/deposit/utils.py
+++ b/swh/deposit/utils.py
@@ -5,6 +5,7 @@
import logging
from typing import Any, Dict, Optional, Union
+from xml.etree import ElementTree
import iso8601
import xmltodict
@@ -16,21 +17,22 @@
logger = logging.getLogger(__name__)
-def parse_xml(stream, encoding="utf-8"):
- namespaces = {
- "http://www.w3.org/2005/Atom": "atom",
- "http://www.w3.org/2007/app": "app",
- "http://purl.org/dc/terms/": "dc",
- "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0": "codemeta",
- "http://purl.org/net/sword/terms/": "sword",
- "https://www.softwareheritage.org/schema/2018/deposit": "swh",
- "http://schema.org/": "schema",
- }
+NAMESPACES = {
+ "atom": "http://www.w3.org/2005/Atom",
+ "app": "http://www.w3.org/2007/app",
+ "dc": "http://purl.org/dc/terms/",
+ "codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0",
+ "sword": "http://purl.org/net/sword/terms/",
+ "swh": "https://www.softwareheritage.org/schema/2018/deposit",
+ "schema": "http://schema.org/",
+}
+
+def parse_xml(stream, encoding="utf-8"):
data = xmltodict.parse(
stream,
encoding=encoding,
- namespaces=namespaces,
+ namespaces={uri: prefix for (prefix, uri) in NAMESPACES.items()},
process_namespaces=True,
dict_constructor=dict,
)
@@ -98,7 +100,7 @@
def parse_swh_metadata_provenance(
- metadata: Dict,
+ metadata: ElementTree.Element,
) -> Optional[Union[QualifiedSWHID, str]]:
"""Parse swh metadata-provenance within the metadata dict reference if found, None
otherwise.
@@ -121,19 +123,17 @@
Either the metadata provenance url if any or None otherwise
"""
-
- swh_deposit = metadata.get("swh:deposit")
- if not swh_deposit:
- return None
-
- swh_metadata_provenance = swh_deposit.get("swh:metadata-provenance")
- if not swh_metadata_provenance:
- return None
-
- return swh_metadata_provenance.get("schema:url")
+ url_element = metadata.find(
+ "swh:deposit/swh:metadata-provenance/schema:url", namespaces=NAMESPACES
+ )
+ if url_element is not None:
+ return url_element.text
+ return None
-def parse_swh_reference(metadata: Dict,) -> Optional[Union[QualifiedSWHID, str]]:
+def parse_swh_reference(
+ metadata: ElementTree.Element,
+) -> Optional[Union[QualifiedSWHID, str]]:
"""Parse swh reference within the metadata dict (or origin) reference if found,
None otherwise.
@@ -155,7 +155,7 @@
</swh:deposit>
Args:
- metadata: result of parsing an Atom document with :func:`parse_xml`
+ metadata: result of parsing an Atom document
Raises:
ValidationError in case the swhid referenced (if any) is invalid
@@ -164,27 +164,21 @@
Either swhid or origin reference if any. None otherwise.
""" # noqa
- swh_deposit = metadata.get("swh:deposit")
- if not swh_deposit:
- return None
-
- swh_reference = swh_deposit.get("swh:reference")
- if not swh_reference:
- return None
-
- swh_origin = swh_reference.get("swh:origin")
- if swh_origin:
- url = swh_origin.get("@url")
- if url:
- return url
+ ref_origin = metadata.find(
+ "swh:deposit/swh:reference/swh:origin[@url]", namespaces=NAMESPACES
+ )
+ if ref_origin is not None:
+ return ref_origin.attrib["url"]
- swh_object = swh_reference.get("swh:object")
- if not swh_object:
+ ref_object = metadata.find(
+ "swh:deposit/swh:reference/swh:object[@swhid]", namespaces=NAMESPACES
+ )
+ if ref_object is None:
return None
-
- swhid = swh_object.get("@swhid")
+ swhid = ref_object.attrib["swhid"]
if not swhid:
return None
+
swhid_reference = QualifiedSWHID.from_string(swhid)
if swhid_reference.qualifiers():

File Metadata

Mime Type
text/plain
Expires
Wed, Dec 18, 7:07 AM (1 d, 18 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3229896

Event Timeline