Page Menu | Home | Software Heritage

D7317.diff
No One | Temporary

D7317.diff

diff --git a/swh/deposit/api/checks.py b/swh/deposit/api/checks.py
--- a/swh/deposit/api/checks.py
+++ b/swh/deposit/api/checks.py
@@ -17,6 +17,7 @@
import dataclasses
import functools
from typing import Dict, Optional, Tuple
+import urllib.parse
from xml.etree import ElementTree
import pkg_resources
@@ -32,6 +33,41 @@
METADATA_PROVENANCE_KEY = "swh:metadata-provenance"
+def extra_validator(element, xsd_element):
+    """Performs extra checks on Atom elements that cannot be implemented purely
+    within XML Schema.
+
+    For now, this only checks that elements typed ``xsd:anyURI`` hold an
+    absolute URI (non-empty scheme and non-empty network location); elements
+    of any other type are accepted without further checks.
+
+    Args:
+        element: the XML element being validated; only its ``text`` is read
+        xsd_element: the schema element it is validated against; only its
+            ``type.name`` is read
+
+    Raises:
+        xmlschema.XMLSchemaValidationError: if the text cannot be parsed as a
+            URI, lacks a scheme or a network location, or contains whitespace
+            in its network location
+    """
+    if xsd_element.type.name != "{http://www.w3.org/2001/XMLSchema}anyURI":
+        return
+
+    # Check their URI is absolute.
+    # This could technically be implemented in the schema like this:
+    # <xsd:simpleType name="URL">
+    #   <xsd:restriction base="xsd:anyURI">
+    #     <!-- https://datatracker.ietf.org/doc/html/rfc2396#section-3.1 -->
+    #     <xsd:pattern value="[a-zA-Z][a-zA-Z0-9+.-]*:.+" />
+    #   </xsd:restriction>
+    # </xsd:simpleType>
+    # However, this would give an unreadable error, so we implement it here
+    # in Python instead.
+    # NOTE: the checks below are stricter than that pattern, as they also
+    # require a network location (so scheme-only URIs like "urn:..." fail).
+    text = element.text
+    try:
+        url = urllib.parse.urlparse(text)
+    except ValueError:
+        raise xmlschema.XMLSchemaValidationError(
+            xsd_element, element, f"{text!r} is not a valid URI",
+        ) from None
+
+    if not url.scheme or not url.netloc:
+        raise xmlschema.XMLSchemaValidationError(
+            xsd_element, element, f"{text!r} is not an absolute URI",
+        )
+
+    # urllib is a little too permissive: it lets whitespace through in the
+    # network location. Reject any whitespace character, not only U+0020.
+    if any(char.isspace() for char in url.netloc):
+        raise xmlschema.XMLSchemaValidationError(
+            xsd_element, element, f"{text!r} is not a valid URI",
+        )
+
+
@dataclasses.dataclass
class Schemas:
swh: xmlschema.XMLSchema11
@@ -90,7 +126,7 @@
deposit_elt = metadata.find("swh:deposit", namespaces=NAMESPACES)
if deposit_elt:
try:
- schemas().swh.validate(deposit_elt)
+ schemas().swh.validate(deposit_elt, extra_validator=extra_validator)
except xmlschema.exceptions.XMLSchemaException as e:
return False, {"metadata": [{"fields": ["swh:deposit"], "summary": str(e)}]}
@@ -103,7 +139,7 @@
# Tag is not specified in the schema, don't validate it
continue
try:
- schemas().codemeta.validate(child)
+ schemas().codemeta.validate(child, extra_validator=extra_validator)
except xmlschema.exceptions.XMLSchemaException as e:
detail.append({"fields": [schema_element.prefixed_name], "summary": str(e)})
diff --git a/swh/deposit/tests/api/test_checks.py b/swh/deposit/tests/api/test_checks.py
--- a/swh/deposit/tests/api/test_checks.py
+++ b/swh/deposit/tests/api/test_checks.py
@@ -157,6 +157,7 @@
<codemeta:datePublished>2020-12-21</codemeta:datePublished>
<codemeta:dateCreated>2020-12-21</codemeta:dateCreated>
<codemeta:dateModified>2020-12-25</codemeta:dateModified>
+ <codemeta:embargoDate>2020-12-25</codemeta:embargoDate>
{PROVENANCE_XML}
</entry>
""",
@@ -251,6 +252,45 @@
</entry>
""",
),
+ (
+ # a full example with every tag we know
+ "codemeta-full",
+ f"""\
+ <entry {XMLNS}>
+ <url>something</url>
+ <name>foo</name>
+ <author>someone</author>
+ <codemeta:author>
+ <codemeta:name>The Author</codemeta:name>
+ <codemeta:id>http://example.org/~theauthor/</codemeta:id>
+ <codemeta:email>author@example.org</codemeta:email>
+ <codemeta:affiliation>
+ <codemeta:name>University 1</codemeta:name>
+ </codemeta:affiliation>
+ <codemeta:identifier>https://sandbox.orcid.org/0000-0002-9227-8514</codemeta:identifier>
+ </codemeta:author>
+ <codemeta:contributor>
+ <codemeta:name>A Contributor</codemeta:name>
+ <codemeta:affiliation>
+ <codemeta:name>University 2</codemeta:name>
+ </codemeta:affiliation>
+ </codemeta:contributor>
+ <codemeta:applicationCategory>something</codemeta:applicationCategory>
+ <codemeta:applicationSubCategory>something else</codemeta:applicationSubCategory>
+ <codemeta:installUrl>http://example.org/</codemeta:installUrl>
+ <codemeta:releaseNotes>Blah blah</codemeta:releaseNotes>
+ <codemeta:softwareVersion>1.0.0</codemeta:softwareVersion>
+ <codemeta:version>1.0.0</codemeta:version>
+ <codemeta:keywords>kw1</codemeta:keywords>
+ <codemeta:keywords>kw2</codemeta:keywords>
+ <codemeta:description>Blah blah</codemeta:description>
+ <codemeta:url>http://example.org/</codemeta:url>
+ <codemeta:issueTracker>http://example.org/</codemeta:issueTracker>
+ <codemeta:readme>http://example.org/</codemeta:readme>
+ {PROVENANCE_XML}
+ </entry>
+ """,
+ ),
]
]
@@ -451,6 +491,110 @@
},
],
),
+ (
+ "contributor-with-no-name",
+ f"""\
+ <entry {XMLNS}>
+ <url>some url</url>
+ <codemeta:name>bar</codemeta:name>
+ <codemeta:author>
+ <codemeta:name>should allow anything here</codemeta:name>
+ </codemeta:author>
+ <codemeta:contributor>
+ <schema:unknown-tag>abc</schema:unknown-tag>
+ </codemeta:contributor>
+ {PROVENANCE_XML}
+ </entry>
+ """,
+ [
+ {
+ "summary": ".*Tag '?codemeta:name'? expected.*",
+ "fields": ["codemeta:contributor"],
+ },
+ ],
+ ),
+ (
+ "id-is-not-url",
+ f"""\
+ <entry {XMLNS}>
+ <url>some url</url>
+ <codemeta:name>bar</codemeta:name>
+ <codemeta:author>
+ <codemeta:name>The Author</codemeta:name>
+ <codemeta:id>http://not a url/</codemeta:id>
+ </codemeta:author>
+ {PROVENANCE_XML}
+ </entry>
+ """,
+ [
+ {
+ "summary": ".*Reason: 'http://not a url/' is not a valid URI.*",
+ "fields": ["codemeta:author"],
+ },
+ ],
+ ),
+ (
+ "identifier-is-invalid-url",
+ f"""\
+ <entry {XMLNS}>
+ <url>some url</url>
+ <codemeta:name>bar</codemeta:name>
+ <codemeta:author>
+ <codemeta:name>The Author</codemeta:name>
+ <codemeta:identifier>http://[invalid-url/</codemeta:identifier>
+ </codemeta:author>
+ {PROVENANCE_XML}
+ </entry>
+ """,
+ [
+ {
+ "summary": (
+ r".*Reason: 'http://\[invalid-url/' is not a valid URI.*"
+ ),
+ "fields": ["codemeta:author"],
+ },
+ ],
+ ),
+ (
+ "identifier-is-not-url",
+ f"""\
+ <entry {XMLNS}>
+ <url>some url</url>
+ <codemeta:name>bar</codemeta:name>
+ <codemeta:author>
+ <codemeta:name>The Author</codemeta:name>
+ <codemeta:identifier>http://not a url/</codemeta:identifier>
+ </codemeta:author>
+ {PROVENANCE_XML}
+ </entry>
+ """,
+ [
+ {
+ "summary": ".*Reason: 'http://not a url/' is not a valid URI.*",
+ "fields": ["codemeta:author"],
+ },
+ ],
+ ),
+ (
+ "identifier-is-not-url2",
+ f"""\
+ <entry {XMLNS}>
+ <url>some url</url>
+ <codemeta:name>bar</codemeta:name>
+ <codemeta:author>
+ <codemeta:name>The Author</codemeta:name>
+ <codemeta:identifier>not a url</codemeta:identifier>
+ </codemeta:author>
+ {PROVENANCE_XML}
+ </entry>
+ """,
+ [
+ {
+ "summary": ".*Reason: 'not a url' is not an absolute URI.*",
+ "fields": ["codemeta:author"],
+ },
+ ],
+ ),
(
"invalid-dates",
f"""\
@@ -494,6 +638,25 @@
},
],
),
+ (
+ "invalid-embargoDate",
+ f"""\
+ <entry {XMLNS}>
+ <url>some url</url>
+ <external_identifier>someid</external_identifier>
+ <title>bar</title>
+ <author>no one</author>
+ <codemeta:embargoDate>2022-02-28T12:00:00</codemeta:embargoDate>
+ {PROVENANCE_XML}
+ </entry>
+ """,
+ [
+ {
+ "summary": ".*Invalid datetime string '2022-02-28T12:00:00'.*",
+ "fields": ["codemeta:embargoDate"],
+ },
+ ],
+ ),
(
"error-and-missing-provenance",
f"""\
diff --git a/swh/deposit/tests/cli/test_client.py b/swh/deposit/tests/cli/test_client.py
--- a/swh/deposit/tests/cli/test_client.py
+++ b/swh/deposit/tests/cli/test_client.py
@@ -179,7 +179,7 @@
"deposit-client",
"project-name",
authors=["some", "authors"],
- external_id="external-id",
+ external_id="http://example.org/external-id",
create_origin="origin-url",
metadata_provenance_url="meta-prov-url",
)
@@ -199,7 +199,7 @@
)
assert (
actual_metadata.findtext("codemeta:identifier", namespaces=NAMESPACES)
- == "external-id"
+ == "http://example.org/external-id"
)
authors = actual_metadata.findall(
"codemeta:author/codemeta:name", namespaces=NAMESPACES
diff --git a/swh/deposit/xsd/codemeta.xsd b/swh/deposit/xsd/codemeta.xsd
--- a/swh/deposit/xsd/codemeta.xsd
+++ b/swh/deposit/xsd/codemeta.xsd
@@ -7,11 +7,14 @@
<!-- This is a non-exhaustive schema used to find likely errors in XML-encoded
Codemeta documents -->
+ <xsd:element name="id" type="xsd:anyURI" />
+
<!-- technically, only Date is allowed for datePublished; but we allow DateTime
for backward compatibility with old swh-deposit versions -->
<xsd:element name="datePublished" type="codemeta:DateOrDateTime" />
<xsd:element name="dateCreated" type="codemeta:DateOrDateTime" />
<xsd:element name="dateModified" type="codemeta:DateOrDateTime" />
+ <xsd:element name="embargoDate" type="xsd:date" />
<xsd:simpleType name="DateOrDateTime">
<xsd:union memberTypes="xsd:date xsd:dateTime"/>
@@ -38,4 +41,20 @@
<xsd:element name="affiliation" type="codemeta:PersonOrOrganization" />
<xsd:element name="name" type="xsd:string" />
+ <xsd:element name="givenName" type="xsd:string" />
+ <xsd:element name="familyName" type="xsd:string" />
+ <xsd:element name="email" type="xsd:string" />
+ <xsd:element name="identifier" type="xsd:anyURI" />
+
+ <xsd:element name="applicationCategory" type="xsd:string" />
+ <xsd:element name="applicationSubCategory" type="xsd:string" />
+ <xsd:element name="installUrl" type="xsd:anyURI" />
+ <xsd:element name="releaseNotes" type="xsd:string" />
+ <xsd:element name="softwareVersion" type="xsd:string" />
+ <xsd:element name="version" type="xsd:string" />
+ <xsd:element name="keywords" type="xsd:string" />
+ <xsd:element name="description" type="xsd:string" />
+ <xsd:element name="url" type="xsd:anyURI" />
+ <xsd:element name="issueTracker" type="xsd:anyURI" />
+ <xsd:element name="readme" type="xsd:anyURI" />
</xsd:schema>

File Metadata

Mime Type
text/plain
Expires
Tue, Dec 17, 12:31 AM (2 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3215569

Event Timeline