diff --git a/MANIFEST.in b/MANIFEST.in --- a/MANIFEST.in +++ b/MANIFEST.in @@ -6,6 +6,7 @@ recursive-include swh/deposit/templates * recursive-include swh/deposit/tests/data * recursive-include swh/deposit/tests/*/data * +recursive-include swh/deposit/xsd * recursive-include swh py.typed include tox.ini include pytest.ini diff --git a/docs/specs/protocol-reference.rst b/docs/specs/protocol-reference.rst --- a/docs/specs/protocol-reference.rst +++ b/docs/specs/protocol-reference.rst @@ -351,7 +351,7 @@ Here is an XML schema to summarize the syntax described in this document: -.. literalinclude:: swh.xsd +.. literalinclude:: ../swh/deposit/xsd/swh.xsd :language: xml diff --git a/mypy.ini b/mypy.ini --- a/mypy.ini +++ b/mypy.ini @@ -33,6 +33,9 @@ [mypy-rest_framework.*] ignore_missing_imports = True +[mypy-xmlschema.*] +ignore_missing_imports = True + [mypy-xmltodict.*] ignore_missing_imports = True diff --git a/requirements-server.txt b/requirements-server.txt --- a/requirements-server.txt +++ b/requirements-server.txt @@ -2,3 +2,4 @@ djangorestframework psycopg2 < 2.9 setuptools +xmlschema diff --git a/swh/deposit/api/checks.py b/swh/deposit/api/checks.py --- a/swh/deposit/api/checks.py +++ b/swh/deposit/api/checks.py @@ -14,10 +14,14 @@ """ +import dataclasses +import functools from typing import Dict, Optional, Tuple from xml.etree import ElementTree import iso8601 +import pkg_resources +import xmlschema from swh.deposit.utils import NAMESPACES, normalize_date, parse_swh_metadata_provenance @@ -28,6 +32,21 @@ METADATA_PROVENANCE_KEY = "swh:metadata-provenance" +@dataclasses.dataclass +class Schemas: + swh: xmlschema.XMLSchema + + +@functools.lru_cache(1) +def schemas() -> Schemas: + def load_xsd(name) -> xmlschema.XMLSchema: + return xmlschema.XMLSchema11( + pkg_resources.resource_string("swh.deposit", f"xsd/{name}.xsd").decode() + ) + + return Schemas(swh=load_xsd("swh")) + + def check_metadata(metadata: ElementTree.Element) -> Tuple[bool, Optional[Dict]]: """Check metadata for mandatory field presence and date format. @@ -67,6 +86,13 @@ detail = [{"summary": MANDATORY_FIELDS_MISSING, "fields": mandatory_result}] return False, {"metadata": detail + suggested_fields} + deposit_elt = metadata.find("swh:deposit", namespaces=NAMESPACES) + if deposit_elt: + try: + schemas().swh.validate(deposit_elt) + except xmlschema.exceptions.XMLSchemaException as e: + return False, {"metadata": [{"fields": ["swh:deposit"], "summary": str(e)}]} + fields = [] for commit_date in metadata.findall( @@ -89,4 +115,5 @@ if suggested_fields: # it's fine but warn about missing suggested fields return True, {"metadata": suggested_fields} + return True, None diff --git a/swh/deposit/tests/api/test_checks.py b/swh/deposit/tests/api/test_checks.py --- a/swh/deposit/tests/api/test_checks.py +++ b/swh/deposit/tests/api/test_checks.py @@ -3,6 +3,10 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +# disable flake8 on this file because of line length +# flake8: noqa + +import re import textwrap from typing import Any, Dict from xml.etree import ElementTree @@ -81,6 +85,70 @@ """, ), + ( + f"""\ + + something + something-else + bar + someone + + """, + ), + ( + f"""\ + + something + something-else + bar + someone + + + + + + some-metadata-provenance-url + + + + """, + ), + ( + f"""\ + + something + something-else + bar + someone + + + + + + some-metadata-provenance-url + + + + """, + ), + ( + f"""\ + + something + something-else + bar + someone + + + + + + some-metadata-provenance-url + + + + """, + ), ] ] @@ -248,3 +316,131 @@ "metadata": [expected_invalid_summary] + [{"fields": [METADATA_PROVENANCE_KEY], "summary": SUGGESTED_FIELDS_MISSING,}] } + + +_parameters4 = [ + (textwrap.dedent(metadata_ko), expected_summary) + for (metadata_ko, expected_summary) in [ + ( + f"""\ + + something + something-else + bar + someone + + + + + + + """, + { + "summary": ( + r".*Reason: Unexpected child with tag 'swh:invalid'.*" + r"Instance:.*swh:invalid.*" + ), + "fields": ["swh:deposit"], + }, + ), + ( + f"""\ + + something + something-else + bar + someone + + + + + + + + + + """, + { + "summary": ( + r".*Reason: Unexpected child with tag 'swh:add_to_origin'.*" + ), + "fields": ["swh:deposit"], + }, + ), + ( + f"""\ + + something + something-else + bar + someone + + + + + + + + """, + { + "summary": r".*Reason: Unexpected child with tag 'swh:origin'.*", + "fields": ["swh:deposit"], + }, + ), + ( + f"""\ + + something + something-else + bar + someone + + + + + + + + """, + { + "summary": r".*Reason: Unexpected child with tag 'swh:origin'.*", + "fields": ["swh:deposit"], + }, + ), + ( + f"""\ + + something + something-else + bar + someone + + + + + + + + """, + { + "summary": r".*Reason: Unexpected child with tag 'swh:object'.*", + "fields": ["swh:deposit"], + }, + ), + ] +] + + +@pytest.mark.parametrize("metadata_ko,expected_summary", _parameters4) +def test_api_checks_check_metadata_ko_schema( + metadata_ko, expected_summary, swh_checks_deposit +): + actual_check, error_detail = check_metadata(ElementTree.fromstring(metadata_ko)) + assert actual_check is False + assert len(error_detail["metadata"]) == 1, error_detail["metadata"] + assert error_detail["metadata"][0]["fields"] == expected_summary["fields"] + + # xmlschema returns very detailed errors, we cannot reasonably test them + # for equality + summary = error_detail["metadata"][0]["summary"] + assert re.match(expected_summary["summary"], summary, re.DOTALL), summary diff --git a/docs/specs/swh.xsd b/swh/deposit/xsd/swh.xsd rename from docs/specs/swh.xsd rename to swh/deposit/xsd/swh.xsd