def extra_validator(element, xsd_element):
    """Performs extra checks on Atom elements that cannot be implemented purely
    within XML Schema.

    For now, this only checks URIs are absolute.

    Args:
        element: the XML element being validated; only its ``text`` is read.
        xsd_element: the schema element ``element`` is validated against; its
            ``type.name`` decides whether the URI check applies.

    Raises:
        xmlschema.XMLSchemaValidationError: if the element is typed as
            ``xs:anyURI`` and its text cannot be parsed as a URI, lacks a
            scheme or a network location, or has a space in its network
            location.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not load
    # ``urllib.parse``, so ``urllib.parse.urlparse`` would only work when some
    # other module happened to import it first.
    from urllib.parse import urlparse

    if xsd_element.type.name != "{http://www.w3.org/2001/XMLSchema}anyURI":
        return

    # Check the URI is absolute.
    # This could technically be implemented in the schema itself; however,
    # this would give an unreadable error, so we implement it here in Python
    # instead.
    text = element.text
    try:
        # An empty element has ``text`` set to None; parse it as an empty
        # string so it is reported below as "not an absolute URI".
        url = urlparse(text or "")
    except ValueError:
        raise xmlschema.XMLSchemaValidationError(
            xsd_element, element, f"{text!r} is not a valid URI",
        ) from None

    if not url.scheme or not url.netloc:
        raise xmlschema.XMLSchemaValidationError(
            xsd_element, element, f"{text!r} is not an absolute URI",
        )
    if " " in url.netloc:
        # urllib is a little too permissive...
        raise xmlschema.XMLSchemaValidationError(
            xsd_element, element, f"{text!r} is not a valid URI",
        )
expected.*", + "fields": ["codemeta:contributor"], + }, + ], + ), + ( + "id-is-not-url", + f"""\ + + some url + bar + + The Author + http://not a url/ + + {PROVENANCE_XML} + + """, + [ + { + "summary": ".*Reason: 'http://not a url/' is not a valid URI.*", + "fields": ["codemeta:author"], + }, + ], + ), + ( + "identifier-is-invalid-url", + f"""\ + + some url + bar + + The Author + http://[invalid-url/ + + {PROVENANCE_XML} + + """, + [ + { + "summary": ( + r".*Reason: 'http://\[invalid-url/' is not a valid URI.*" + ), + "fields": ["codemeta:author"], + }, + ], + ), + ( + "identifier-is-not-url", + f"""\ + + some url + bar + + The Author + http://not a url/ + + {PROVENANCE_XML} + + """, + [ + { + "summary": ".*Reason: 'http://not a url/' is not a valid URI.*", + "fields": ["codemeta:author"], + }, + ], + ), + ( + "identifier-is-not-url2", + f"""\ + + some url + bar + + The Author + not a url + + {PROVENANCE_XML} + + """, + [ + { + "summary": ".*Reason: 'not a url' is not an absolute URI.*", + "fields": ["codemeta:author"], + }, + ], + ), ( "invalid-dates", f"""\ @@ -494,6 +638,25 @@ }, ], ), + ( + "invalid-embargoDate", + f"""\ + + some url + someid + bar + no one + 2022-02-28T12:00:00 + {PROVENANCE_XML} + + """, + [ + { + "summary": ".*Invalid datetime string '2022-02-28T12:00:00'.*", + "fields": ["codemeta:embargoDate"], + }, + ], + ), ( "error-and-missing-provenance", f"""\ diff --git a/swh/deposit/tests/cli/test_client.py b/swh/deposit/tests/cli/test_client.py --- a/swh/deposit/tests/cli/test_client.py +++ b/swh/deposit/tests/cli/test_client.py @@ -179,7 +179,7 @@ "deposit-client", "project-name", authors=["some", "authors"], - external_id="external-id", + external_id="http://example.org/external-id", create_origin="origin-url", metadata_provenance_url="meta-prov-url", ) @@ -199,7 +199,7 @@ ) assert ( actual_metadata.findtext("codemeta:identifier", namespaces=NAMESPACES) - == "external-id" + == "http://example.org/external-id" ) authors = actual_metadata.findall( 
"codemeta:author/codemeta:name", namespaces=NAMESPACES diff --git a/swh/deposit/xsd/codemeta.xsd b/swh/deposit/xsd/codemeta.xsd --- a/swh/deposit/xsd/codemeta.xsd +++ b/swh/deposit/xsd/codemeta.xsd @@ -7,11 +7,14 @@ + + + @@ -38,4 +41,20 @@ + + + + + + + + + + + + + + + +