diff --git a/swh/deposit/api/checks.py b/swh/deposit/api/checks.py
--- a/swh/deposit/api/checks.py
+++ b/swh/deposit/api/checks.py
@@ -17,6 +17,7 @@
import dataclasses
import functools
from typing import Dict, Optional, Tuple
+import urllib
from xml.etree import ElementTree
import pkg_resources
@@ -32,6 +33,41 @@
METADATA_PROVENANCE_KEY = "swh:metadata-provenance"
+def extra_validator(element, xsd_element):
+    """Performs extra checks on Atom elements that cannot be implemented purely
+    within XML Schema.
+
+    For now, this only checks URIs are absolute.
+
+    Args:
+        element: the XML element being validated; only its ``text`` is read.
+        xsd_element: the schema component ``element`` is validated against;
+            only its ``type.name`` is read.
+
+    Raises:
+        xmlschema.XMLSchemaValidationError: if ``xsd_element`` is typed as
+            ``xsd:anyURI`` and the text of ``element`` cannot be parsed as a
+            URI, lacks a scheme or a network location, or has a space in its
+            network location.
+    """
+    # Imported explicitly here: a bare ``import urllib`` does not guarantee
+    # that the ``urllib.parse`` submodule is loaded.
+    import urllib.parse
+
+    type_name = xsd_element.type.name
+    if type_name != "{http://www.w3.org/2001/XMLSchema}anyURI":
+        # Only URI-typed elements get extra checks for now.
+        return
+
+    # Check their URI is absolute.
+    # This could technically be implemented in the schema, with something
+    # along these lines (illustrative; it only enforces a leading scheme):
+    #
+    #     <xsd:simpleType name="URL">
+    #       <xsd:restriction base="xsd:anyURI">
+    #         <xsd:pattern value="[a-zA-Z][a-zA-Z0-9+.-]*:.+" />
+    #       </xsd:restriction>
+    #     </xsd:simpleType>
+    #
+    # However, this would give an unreadable error, so we implement it here
+    # in Python instead.
+    try:
+        url = urllib.parse.urlparse(element.text)
+    except ValueError:
+        # ``from None`` hides the urllib traceback; the message is enough.
+        raise xmlschema.XMLSchemaValidationError(
+            xsd_element, element, f"{element.text!r} is not a valid URI",
+        ) from None
+
+    if not url.scheme or not url.netloc:
+        raise xmlschema.XMLSchemaValidationError(
+            xsd_element, element, f"{element.text!r} is not an absolute URI",
+        )
+    if " " in url.netloc:
+        # urllib is a little too permissive...
+        raise xmlschema.XMLSchemaValidationError(
+            xsd_element, element, f"{element.text!r} is not a valid URI",
+        )
+
+
@dataclasses.dataclass
class Schemas:
swh: xmlschema.XMLSchema11
@@ -90,7 +126,7 @@
deposit_elt = metadata.find("swh:deposit", namespaces=NAMESPACES)
if deposit_elt:
try:
- schemas().swh.validate(deposit_elt)
+ schemas().swh.validate(deposit_elt, extra_validator=extra_validator)
except xmlschema.exceptions.XMLSchemaException as e:
return False, {"metadata": [{"fields": ["swh:deposit"], "summary": str(e)}]}
@@ -103,7 +139,7 @@
# Tag is not specified in the schema, don't validate it
continue
try:
- schemas().codemeta.validate(child)
+ schemas().codemeta.validate(child, extra_validator=extra_validator)
except xmlschema.exceptions.XMLSchemaException as e:
detail.append({"fields": [schema_element.prefixed_name], "summary": str(e)})
diff --git a/swh/deposit/tests/api/test_checks.py b/swh/deposit/tests/api/test_checks.py
--- a/swh/deposit/tests/api/test_checks.py
+++ b/swh/deposit/tests/api/test_checks.py
@@ -157,6 +157,7 @@
2020-12-21
2020-12-21
2020-12-25
+ 2020-12-25
{PROVENANCE_XML}
""",
@@ -251,6 +252,45 @@
""",
),
+ (
+ # a full example with every tag we know
+ "codemeta-full",
+ f"""\
+
+ something
+ foo
+ someone
+
+ The Author
+ http://example.org/~theauthor/
+ author@example.org
+
+ University 1
+
+ https://sandbox.orcid.org/0000-0002-9227-8514
+
+
+ A Contributor
+
+ University 2
+
+
+ something
+ something else
+ http://example.org/
+ Blah blah
+ 1.0.0
+ 1.0.0
+ kw1
+ kw2
+ Blah blah
+ http://example.org/
+ http://example.org/
+ http://example.org/
+ {PROVENANCE_XML}
+
+ """,
+ ),
]
]
@@ -451,6 +491,110 @@
},
],
),
+ (
+ "contributor-with-no-name",
+ f"""\
+
+ some url
+ bar
+
+ should allow anything here
+
+
+ abc
+
+ {PROVENANCE_XML}
+
+ """,
+ [
+ {
+ "summary": ".*Tag '?codemeta:name'? expected.*",
+ "fields": ["codemeta:contributor"],
+ },
+ ],
+ ),
+ (
+ "id-is-not-url",
+ f"""\
+
+ some url
+ bar
+
+ The Author
+ http://not a url/
+
+ {PROVENANCE_XML}
+
+ """,
+ [
+ {
+ "summary": ".*Reason: 'http://not a url/' is not a valid URI.*",
+ "fields": ["codemeta:author"],
+ },
+ ],
+ ),
+ (
+ "identifier-is-invalid-url",
+ f"""\
+
+ some url
+ bar
+
+ The Author
+ http://[invalid-url/
+
+ {PROVENANCE_XML}
+
+ """,
+ [
+ {
+ "summary": (
+ r".*Reason: 'http://\[invalid-url/' is not a valid URI.*"
+ ),
+ "fields": ["codemeta:author"],
+ },
+ ],
+ ),
+ (
+ "identifier-is-not-url",
+ f"""\
+
+ some url
+ bar
+
+ The Author
+ http://not a url/
+
+ {PROVENANCE_XML}
+
+ """,
+ [
+ {
+ "summary": ".*Reason: 'http://not a url/' is not a valid URI.*",
+ "fields": ["codemeta:author"],
+ },
+ ],
+ ),
+ (
+ "identifier-is-not-url2",
+ f"""\
+
+ some url
+ bar
+
+ The Author
+ not a url
+
+ {PROVENANCE_XML}
+
+ """,
+ [
+ {
+ "summary": ".*Reason: 'not a url' is not an absolute URI.*",
+ "fields": ["codemeta:author"],
+ },
+ ],
+ ),
(
"invalid-dates",
f"""\
@@ -494,6 +638,25 @@
},
],
),
+ (
+ "invalid-embargoDate",
+ f"""\
+
+ some url
+ someid
+ bar
+ no one
+ 2022-02-28T12:00:00
+ {PROVENANCE_XML}
+
+ """,
+ [
+ {
+ "summary": ".*Invalid datetime string '2022-02-28T12:00:00'.*",
+ "fields": ["codemeta:embargoDate"],
+ },
+ ],
+ ),
(
"error-and-missing-provenance",
f"""\
diff --git a/swh/deposit/tests/cli/test_client.py b/swh/deposit/tests/cli/test_client.py
--- a/swh/deposit/tests/cli/test_client.py
+++ b/swh/deposit/tests/cli/test_client.py
@@ -179,7 +179,7 @@
"deposit-client",
"project-name",
authors=["some", "authors"],
- external_id="external-id",
+ external_id="http://example.org/external-id",
create_origin="origin-url",
metadata_provenance_url="meta-prov-url",
)
@@ -199,7 +199,7 @@
)
assert (
actual_metadata.findtext("codemeta:identifier", namespaces=NAMESPACES)
- == "external-id"
+ == "http://example.org/external-id"
)
authors = actual_metadata.findall(
"codemeta:author/codemeta:name", namespaces=NAMESPACES
diff --git a/swh/deposit/xsd/codemeta.xsd b/swh/deposit/xsd/codemeta.xsd
--- a/swh/deposit/xsd/codemeta.xsd
+++ b/swh/deposit/xsd/codemeta.xsd
@@ -7,11 +7,14 @@
+
+
+
@@ -38,4 +41,20 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+