Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7122788
D7317.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
12 KB
Subscribers
None
D7317.diff
View Options
diff --git a/swh/deposit/api/checks.py b/swh/deposit/api/checks.py
--- a/swh/deposit/api/checks.py
+++ b/swh/deposit/api/checks.py
@@ -17,6 +17,7 @@
import dataclasses
import functools
from typing import Dict, Optional, Tuple
+import urllib
from xml.etree import ElementTree
import pkg_resources
@@ -32,6 +33,41 @@
METADATA_PROVENANCE_KEY = "swh:metadata-provenance"
+def extra_validator(element, xsd_element):
+ """Performs extra checks on Atom elements that cannot be implemented purely
+ within XML Schema.
+
+ For now, this only checks that the text of elements whose schema type is
+ ``xsd:anyURI`` is an absolute URI; elements of any other type are not
+ checked here.
+
+ Args:
+ element: the XML element being validated; only its ``text`` is read.
+ xsd_element: the schema element matched against ``element``; only its
+ ``type.name`` is read.
+
+ Raises:
+ xmlschema.XMLSchemaValidationError: if the text cannot be parsed as a
+ URI, has no scheme or no network location, or has a space in its
+ network location.
+ """
+ type_name = xsd_element.type.name
+ if type_name == "{http://www.w3.org/2001/XMLSchema}anyURI":
+ # Check that the URI is absolute.
+ # This could technically be implemented in the schema like this:
+ # <xsd:simpleType name="URL">
+ # <xsd:restriction base="xsd:anyURI">
+ # <!-- https://datatracker.ietf.org/doc/html/rfc2396#section-3.1 -->
+ # <xsd:pattern value="[a-zA-Z][a-zA-Z0-9+.-]*:.+" />
+ # </xsd:restriction>
+ # </xsd:simpleType>
+ # However, this would give an unreadable error, so we implement it here
+ # in Python instead.
+ # NOTE(review): this accesses ``urllib.parse`` as an attribute of
+ # ``urllib``; confirm the module is guaranteed to have imported
+ # ``urllib.parse`` by the time this runs.
+ try:
+ url = urllib.parse.urlparse(element.text)
+ except ValueError:
+ # The text could not be parsed at all; "from None" drops the parser's
+ # own exception so only the validation error is reported.
+ raise xmlschema.XMLSchemaValidationError(
+ xsd_element, element, f"{element.text!r} is not a valid URI",
+ ) from None
+ else:
+ # A URI missing either its scheme or its network location is
+ # reported as not absolute.
+ # NOTE(review): this also rejects scheme-only URIs with no network
+ # location (presumably e.g. "mailto:" or "urn:" forms) -- confirm
+ # that this is intended.
+ if not url.scheme or not url.netloc:
+ raise xmlschema.XMLSchemaValidationError(
+ xsd_element, element, f"{element.text!r} is not an absolute URI",
+ )
+ elif " " in url.netloc:
+ # urllib is a little too permissive: it parses a network location
+ # containing a space without raising, so reject it explicitly.
+ raise xmlschema.XMLSchemaValidationError(
+ xsd_element, element, f"{element.text!r} is not a valid URI",
+ )
+
+
@dataclasses.dataclass
class Schemas:
swh: xmlschema.XMLSchema11
@@ -90,7 +126,7 @@
deposit_elt = metadata.find("swh:deposit", namespaces=NAMESPACES)
if deposit_elt:
try:
- schemas().swh.validate(deposit_elt)
+ schemas().swh.validate(deposit_elt, extra_validator=extra_validator)
except xmlschema.exceptions.XMLSchemaException as e:
return False, {"metadata": [{"fields": ["swh:deposit"], "summary": str(e)}]}
@@ -103,7 +139,7 @@
# Tag is not specified in the schema, don't validate it
continue
try:
- schemas().codemeta.validate(child)
+ schemas().codemeta.validate(child, extra_validator=extra_validator)
except xmlschema.exceptions.XMLSchemaException as e:
detail.append({"fields": [schema_element.prefixed_name], "summary": str(e)})
diff --git a/swh/deposit/tests/api/test_checks.py b/swh/deposit/tests/api/test_checks.py
--- a/swh/deposit/tests/api/test_checks.py
+++ b/swh/deposit/tests/api/test_checks.py
@@ -157,6 +157,7 @@
<codemeta:datePublished>2020-12-21</codemeta:datePublished>
<codemeta:dateCreated>2020-12-21</codemeta:dateCreated>
<codemeta:dateModified>2020-12-25</codemeta:dateModified>
+ <codemeta:embargoDate>2020-12-25</codemeta:embargoDate>
{PROVENANCE_XML}
</entry>
""",
@@ -251,6 +252,45 @@
</entry>
""",
),
+ (
+ # a full example with every tag we know
+ "codemeta-full",
+ f"""\
+ <entry {XMLNS}>
+ <url>something</url>
+ <name>foo</name>
+ <author>someone</author>
+ <codemeta:author>
+ <codemeta:name>The Author</codemeta:name>
+ <codemeta:id>http://example.org/~theauthor/</codemeta:id>
+ <codemeta:email>author@example.org</codemeta:email>
+ <codemeta:affiliation>
+ <codemeta:name>University 1</codemeta:name>
+ </codemeta:affiliation>
+ <codemeta:identifier>https://sandbox.orcid.org/0000-0002-9227-8514</codemeta:identifier>
+ </codemeta:author>
+ <codemeta:contributor>
+ <codemeta:name>A Contributor</codemeta:name>
+ <codemeta:affiliation>
+ <codemeta:name>University 2</codemeta:name>
+ </codemeta:affiliation>
+ </codemeta:contributor>
+ <codemeta:applicationCategory>something</codemeta:applicationCategory>
+ <codemeta:applicationSubCategory>something else</codemeta:applicationSubCategory>
+ <codemeta:installUrl>http://example.org/</codemeta:installUrl>
+ <codemeta:releaseNotes>Blah blah</codemeta:releaseNotes>
+ <codemeta:softwareVersion>1.0.0</codemeta:softwareVersion>
+ <codemeta:version>1.0.0</codemeta:version>
+ <codemeta:keywords>kw1</codemeta:keywords>
+ <codemeta:keywords>kw2</codemeta:keywords>
+ <codemeta:description>Blah blah</codemeta:description>
+ <codemeta:url>http://example.org/</codemeta:url>
+ <codemeta:issueTracker>http://example.org/</codemeta:issueTracker>
+ <codemeta:readme>http://example.org/</codemeta:readme>
+ {PROVENANCE_XML}
+ </entry>
+ """,
+ ),
]
]
@@ -451,6 +491,110 @@
},
],
),
+ (
+ "contributor-with-no-name",
+ f"""\
+ <entry {XMLNS}>
+ <url>some url</url>
+ <codemeta:name>bar</codemeta:name>
+ <codemeta:author>
+ <codemeta:name>should allow anything here</codemeta:name>
+ </codemeta:author>
+ <codemeta:contributor>
+ <schema:unknown-tag>abc</schema:unknown-tag>
+ </codemeta:contributor>
+ {PROVENANCE_XML}
+ </entry>
+ """,
+ [
+ {
+ "summary": ".*Tag '?codemeta:name'? expected.*",
+ "fields": ["codemeta:contributor"],
+ },
+ ],
+ ),
+ (
+ "id-is-not-url",
+ f"""\
+ <entry {XMLNS}>
+ <url>some url</url>
+ <codemeta:name>bar</codemeta:name>
+ <codemeta:author>
+ <codemeta:name>The Author</codemeta:name>
+ <codemeta:id>http://not a url/</codemeta:id>
+ </codemeta:author>
+ {PROVENANCE_XML}
+ </entry>
+ """,
+ [
+ {
+ "summary": ".*Reason: 'http://not a url/' is not a valid URI.*",
+ "fields": ["codemeta:author"],
+ },
+ ],
+ ),
+ (
+ "identifier-is-invalid-url",
+ f"""\
+ <entry {XMLNS}>
+ <url>some url</url>
+ <codemeta:name>bar</codemeta:name>
+ <codemeta:author>
+ <codemeta:name>The Author</codemeta:name>
+ <codemeta:identifier>http://[invalid-url/</codemeta:identifier>
+ </codemeta:author>
+ {PROVENANCE_XML}
+ </entry>
+ """,
+ [
+ {
+ "summary": (
+ r".*Reason: 'http://\[invalid-url/' is not a valid URI.*"
+ ),
+ "fields": ["codemeta:author"],
+ },
+ ],
+ ),
+ (
+ "identifier-is-not-url",
+ f"""\
+ <entry {XMLNS}>
+ <url>some url</url>
+ <codemeta:name>bar</codemeta:name>
+ <codemeta:author>
+ <codemeta:name>The Author</codemeta:name>
+ <codemeta:identifier>http://not a url/</codemeta:identifier>
+ </codemeta:author>
+ {PROVENANCE_XML}
+ </entry>
+ """,
+ [
+ {
+ "summary": ".*Reason: 'http://not a url/' is not a valid URI.*",
+ "fields": ["codemeta:author"],
+ },
+ ],
+ ),
+ (
+ "identifier-is-not-url2",
+ f"""\
+ <entry {XMLNS}>
+ <url>some url</url>
+ <codemeta:name>bar</codemeta:name>
+ <codemeta:author>
+ <codemeta:name>The Author</codemeta:name>
+ <codemeta:identifier>not a url</codemeta:identifier>
+ </codemeta:author>
+ {PROVENANCE_XML}
+ </entry>
+ """,
+ [
+ {
+ "summary": ".*Reason: 'not a url' is not an absolute URI.*",
+ "fields": ["codemeta:author"],
+ },
+ ],
+ ),
(
"invalid-dates",
f"""\
@@ -494,6 +638,25 @@
},
],
),
+ (
+ "invalid-embargoDate",
+ f"""\
+ <entry {XMLNS}>
+ <url>some url</url>
+ <external_identifier>someid</external_identifier>
+ <title>bar</title>
+ <author>no one</author>
+ <codemeta:embargoDate>2022-02-28T12:00:00</codemeta:embargoDate>
+ {PROVENANCE_XML}
+ </entry>
+ """,
+ [
+ {
+ "summary": ".*Invalid datetime string '2022-02-28T12:00:00'.*",
+ "fields": ["codemeta:embargoDate"],
+ },
+ ],
+ ),
(
"error-and-missing-provenance",
f"""\
diff --git a/swh/deposit/tests/cli/test_client.py b/swh/deposit/tests/cli/test_client.py
--- a/swh/deposit/tests/cli/test_client.py
+++ b/swh/deposit/tests/cli/test_client.py
@@ -179,7 +179,7 @@
"deposit-client",
"project-name",
authors=["some", "authors"],
- external_id="external-id",
+ external_id="http://example.org/external-id",
create_origin="origin-url",
metadata_provenance_url="meta-prov-url",
)
@@ -199,7 +199,7 @@
)
assert (
actual_metadata.findtext("codemeta:identifier", namespaces=NAMESPACES)
- == "external-id"
+ == "http://example.org/external-id"
)
authors = actual_metadata.findall(
"codemeta:author/codemeta:name", namespaces=NAMESPACES
diff --git a/swh/deposit/xsd/codemeta.xsd b/swh/deposit/xsd/codemeta.xsd
--- a/swh/deposit/xsd/codemeta.xsd
+++ b/swh/deposit/xsd/codemeta.xsd
@@ -7,11 +7,14 @@
<!-- This is a non-exhaustive schema used to find likely errors in XML-encoded
Codemeta documents -->
+ <xsd:element name="id" type="xsd:anyURI" />
+
<!-- technically, only Date is allowed for datePublished; but we allow DateTime
for backward compatibility with old swh-deposit versions -->
<xsd:element name="datePublished" type="codemeta:DateOrDateTime" />
<xsd:element name="dateCreated" type="codemeta:DateOrDateTime" />
<xsd:element name="dateModified" type="codemeta:DateOrDateTime" />
+ <xsd:element name="embargoDate" type="xsd:date" />
<xsd:simpleType name="DateOrDateTime">
<xsd:union memberTypes="xsd:date xsd:dateTime"/>
@@ -38,4 +41,20 @@
<xsd:element name="affiliation" type="codemeta:PersonOrOrganization" />
<xsd:element name="name" type="xsd:string" />
+ <xsd:element name="givenName" type="xsd:string" />
+ <xsd:element name="familyName" type="xsd:string" />
+ <xsd:element name="email" type="xsd:string" />
+ <xsd:element name="identifier" type="xsd:anyURI" />
+
+ <xsd:element name="applicationCategory" type="xsd:string" />
+ <xsd:element name="applicationSubCategory" type="xsd:string" />
+ <xsd:element name="installUrl" type="xsd:anyURI" />
+ <xsd:element name="releaseNotes" type="xsd:string" />
+ <xsd:element name="softwareVersion" type="xsd:string" />
+ <xsd:element name="version" type="xsd:string" />
+ <xsd:element name="keywords" type="xsd:string" />
+ <xsd:element name="description" type="xsd:string" />
+ <xsd:element name="url" type="xsd:anyURI" />
+ <xsd:element name="issueTracker" type="xsd:anyURI" />
+ <xsd:element name="readme" type="xsd:anyURI" />
</xsd:schema>
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Tue, Dec 17, 12:31 AM (2 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3215569
Attached To
D7317: Add more simpletypes to the schema.
Event Timeline
Log In to Comment