Page MenuHomeSoftware Heritage

D7243.id26258.diff
No OneTemporary

D7243.id26258.diff

diff --git a/MANIFEST.in b/MANIFEST.in
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -6,6 +6,7 @@
recursive-include swh/deposit/templates *
recursive-include swh/deposit/tests/data *
recursive-include swh/deposit/tests/*/data *
+recursive-include swh/deposit/xsd *
recursive-include swh py.typed
include tox.ini
include pytest.ini
diff --git a/docs/specs/protocol-reference.rst b/docs/specs/protocol-reference.rst
--- a/docs/specs/protocol-reference.rst
+++ b/docs/specs/protocol-reference.rst
@@ -351,7 +351,7 @@
Here is an XML schema to summarize the syntax described in this document:
-.. literalinclude:: swh.xsd
+.. literalinclude:: ../swh/deposit/xsd/swh.xsd
:language: xml
diff --git a/mypy.ini b/mypy.ini
--- a/mypy.ini
+++ b/mypy.ini
@@ -33,6 +33,9 @@
[mypy-rest_framework.*]
ignore_missing_imports = True
+[mypy-xmlschema.*]
+ignore_missing_imports = True
+
[mypy-xmltodict.*]
ignore_missing_imports = True
diff --git a/requirements-server.txt b/requirements-server.txt
--- a/requirements-server.txt
+++ b/requirements-server.txt
@@ -2,3 +2,4 @@
djangorestframework
psycopg2 < 2.9
setuptools
+xmlschema
diff --git a/swh/deposit/api/checks.py b/swh/deposit/api/checks.py
--- a/swh/deposit/api/checks.py
+++ b/swh/deposit/api/checks.py
@@ -14,10 +14,14 @@
"""
+import dataclasses
+import functools
from typing import Dict, Optional, Tuple
from xml.etree import ElementTree
import iso8601
+import pkg_resources
+import xmlschema
from swh.deposit.utils import NAMESPACES, normalize_date, parse_swh_metadata_provenance
@@ -28,6 +32,21 @@
METADATA_PROVENANCE_KEY = "swh:metadata-provenance"
+@dataclasses.dataclass
+class Schemas:
+ swh: xmlschema.XMLSchema
+
+
+@functools.lru_cache(1)
+def schemas() -> Schemas:
+ def load_xsd(name) -> xmlschema.XMLSchema11:
+ return xmlschema.XMLSchema11(
+ pkg_resources.resource_string("swh.deposit", f"xsd/{name}.xsd").decode()
+ )
+
+ return Schemas(swh=load_xsd("swh"))
+
+
def check_metadata(metadata: ElementTree.Element) -> Tuple[bool, Optional[Dict]]:
"""Check metadata for mandatory field presence and date format.
@@ -67,6 +86,13 @@
detail = [{"summary": MANDATORY_FIELDS_MISSING, "fields": mandatory_result}]
return False, {"metadata": detail + suggested_fields}
+ deposit_elt = metadata.find("swh:deposit", namespaces=NAMESPACES)
+ if deposit_elt:
+ try:
+ schemas().swh.validate(deposit_elt)
+ except xmlschema.exceptions.XMLSchemaException as e:
+ return False, {"metadata": [{"fields": ["swh:deposit"], "summary": str(e)}]}
+
fields = []
for commit_date in metadata.findall(
@@ -89,4 +115,5 @@
if suggested_fields: # it's fine but warn about missing suggested fields
return True, {"metadata": suggested_fields}
+
return True, None
diff --git a/swh/deposit/tests/api/test_checks.py b/swh/deposit/tests/api/test_checks.py
--- a/swh/deposit/tests/api/test_checks.py
+++ b/swh/deposit/tests/api/test_checks.py
@@ -3,6 +3,10 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+# disable flake8 on this file because of line length
+# flake8: noqa
+
+import re
import textwrap
from typing import Any, Dict
from xml.etree import ElementTree
@@ -81,6 +85,70 @@
</entry>
""",
),
+ (
+ f"""\
+ <entry {XMLNS}>
+ <url>something</url>
+ <external_identifier>something-else</external_identifier>
+ <title>bar</title>
+ <author>someone</author>
+ </entry>
+ """,
+ ),
+ (
+ f"""\
+ <entry {XMLNS}>
+ <url>something</url>
+ <external_identifier>something-else</external_identifier>
+ <title>bar</title>
+ <author>someone</author>
+ <swh:deposit>
+ <swh:add_to_origin>
+ <swh:origin url="http://example.org" />
+ </swh:add_to_origin>
+ <swh:metadata-provenance>
+ <schema:url>some-metadata-provenance-url</schema:url>
+ </swh:metadata-provenance>
+ </swh:deposit>
+ </entry>
+ """,
+ ),
+ (
+ f"""\
+ <entry {XMLNS}>
+ <url>something</url>
+ <external_identifier>something-else</external_identifier>
+ <title>bar</title>
+ <author>someone</author>
+ <swh:deposit>
+ <swh:reference>
+ <swh:origin url="http://example.org" />
+ </swh:reference>
+ <swh:metadata-provenance>
+ <schema:url>some-metadata-provenance-url</schema:url>
+ </swh:metadata-provenance>
+ </swh:deposit>
+ </entry>
+ """,
+ ),
+ (
+ f"""\
+ <entry {XMLNS}>
+ <url>something</url>
+ <external_identifier>something-else</external_identifier>
+ <title>bar</title>
+ <author>someone</author>
+ <swh:deposit>
+ <swh:reference>
+ <swh:object swhid="swh:1:dir:0000000000000000000000000000000000000000" />
+ </swh:reference>
+ <swh:metadata-provenance>
+ <schema:url>some-metadata-provenance-url</schema:url>
+ </swh:metadata-provenance>
+ </swh:deposit>
+ </entry>
+ """,
+ ),
]
]
@@ -248,3 +316,131 @@
"metadata": [expected_invalid_summary]
+ [{"fields": [METADATA_PROVENANCE_KEY], "summary": SUGGESTED_FIELDS_MISSING,}]
}
+
+
+_parameters4 = [
+ (textwrap.dedent(metadata_ko), expected_summary)
+ for (metadata_ko, expected_summary) in [
+ (
+ f"""\
+ <entry {XMLNS}>
+ <url>something</url>
+ <external_identifier>something-else</external_identifier>
+ <title>bar</title>
+ <author>someone</author>
+ <swh:deposit>
+ <swh:invalid>
+ <swh:origin url="http://example.org" />
+ </swh:invalid>
+ </swh:deposit>
+ </entry>
+ """,
+ {
+ "summary": (
+ r".*Reason: Unexpected child with tag 'swh:invalid'.*"
+ r"Instance:.*swh:invalid.*"
+ ),
+ "fields": ["swh:deposit"],
+ },
+ ),
+ (
+ f"""\
+ <entry {XMLNS}>
+ <url>something</url>
+ <external_identifier>something-else</external_identifier>
+ <title>bar</title>
+ <author>someone</author>
+ <swh:deposit>
+ <swh:add_to_origin>
+ <swh:origin url="http://example.org" />
+ </swh:add_to_origin>
+ <swh:add_to_origin>
+ <swh:origin url="http://example.org" />
+ </swh:add_to_origin>
+ </swh:deposit>
+ </entry>
+ """,
+ {
+ "summary": (
+ r".*Reason: Unexpected child with tag 'swh:add_to_origin'.*"
+ ),
+ "fields": ["swh:deposit"],
+ },
+ ),
+ (
+ f"""\
+ <entry {XMLNS}>
+ <url>something</url>
+ <external_identifier>something-else</external_identifier>
+ <title>bar</title>
+ <author>someone</author>
+ <swh:deposit>
+ <swh:reference>
+ <swh:object swhid="swh:1:dir:0000000000000000000000000000000000000000" />
+ <swh:origin url="http://example.org" />
+ </swh:reference>
+ </swh:deposit>
+ </entry>
+ """,
+ {
+ "summary": r".*Reason: Unexpected child with tag 'swh:origin'.*",
+ "fields": ["swh:deposit"],
+ },
+ ),
+ (
+ f"""\
+ <entry {XMLNS}>
+ <url>something</url>
+ <external_identifier>something-else</external_identifier>
+ <title>bar</title>
+ <author>someone</author>
+ <swh:deposit>
+ <swh:reference>
+ <swh:origin url="http://example.org" />
+ <swh:origin url="http://example.org" />
+ </swh:reference>
+ </swh:deposit>
+ </entry>
+ """,
+ {
+ "summary": r".*Reason: Unexpected child with tag 'swh:origin'.*",
+ "fields": ["swh:deposit"],
+ },
+ ),
+ (
+ f"""\
+ <entry {XMLNS}>
+ <url>something</url>
+ <external_identifier>something-else</external_identifier>
+ <title>bar</title>
+ <author>someone</author>
+ <swh:deposit>
+ <swh:reference>
+ <swh:origin url="http://example.org" />
+ <swh:object swhid="swh:1:dir:0000000000000000000000000000000000000000" />
+ </swh:reference>
+ </swh:deposit>
+ </entry>
+ """,
+ {
+ "summary": r".*Reason: Unexpected child with tag 'swh:object'.*",
+ "fields": ["swh:deposit"],
+ },
+ ),
+ ]
+]
+
+
+@pytest.mark.parametrize("metadata_ko,expected_summary", _parameters4)
+def test_api_checks_check_metadata_ko_schema(
+ metadata_ko, expected_summary, swh_checks_deposit
+):
+ actual_check, error_detail = check_metadata(ElementTree.fromstring(metadata_ko))
+ assert actual_check is False
+ assert len(error_detail["metadata"]) == 1, error_detail["metadata"]
+ assert error_detail["metadata"][0]["fields"] == expected_summary["fields"]
+
+ # xmlschema returns very detailed errors, we cannot reasonably test them
+ # for equality
+ summary = error_detail["metadata"][0]["summary"]
+ assert re.match(expected_summary["summary"], summary, re.DOTALL), summary
diff --git a/docs/specs/swh.xsd b/swh/deposit/xsd/swh.xsd
rename from docs/specs/swh.xsd
rename to swh/deposit/xsd/swh.xsd

File Metadata

Mime Type
text/plain
Expires
Wed, Dec 18, 11:29 PM (21 h, 43 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3233557

Event Timeline