diff --git a/swh/deposit/api/checks.py b/swh/deposit/api/checks.py index 3fb990c4..1d82a28e 100644 --- a/swh/deposit/api/checks.py +++ b/swh/deposit/api/checks.py @@ -1,228 +1,240 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Functional Metadata checks: Mandatory fields: - 'author' - 'name' or 'title' Suggested fields: - metadata-provenance """ import dataclasses import functools +import re from typing import Dict, Iterator, Optional, Tuple, cast import urllib from xml.etree import ElementTree import pkg_resources import xmlschema from swh.deposit.errors import FORBIDDEN, DepositError from swh.deposit.utils import NAMESPACES, parse_swh_metadata_provenance MANDATORY_FIELDS_MISSING = "Mandatory fields are missing" INVALID_DATE_FORMAT = "Invalid date format" SUGGESTED_FIELDS_MISSING = "Suggested fields are missing" METADATA_PROVENANCE_KEY = "swh:metadata-provenance" AFFILIATION_NO_NAME = "Reason: affiliation does not have a element" def extra_validator( element: ElementTree.Element, xsd_element: xmlschema.validators.elements.Xsd11Element, ) -> Optional[Iterator[xmlschema.XMLSchemaValidationError]]: """Performs extra checks on Atom elements that cannot be implemented purely within XML Schema. For now, this only checks URIs are absolute.""" type_name = xsd_element.type.name if type_name == "{http://www.w3.org/2001/XMLSchema}anyURI": # Check their URI is absolute. # This could technically be implemented in the schema like this: # # # # # # # However, this would give an unreadable error, so we implement it here # in Python instead. - try: - url = urllib.parse.urlparse(element.text) - except ValueError: + yield from absolute_uri_validator(element, xsd_element) + elif type_name == "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}identifierType": + # Made-up type, that allows both absolute URIs and HAL-IDs + if not re.match("hal-[0-9]+", element.text or ""): + yield from absolute_uri_validator(element, xsd_element) + + +def absolute_uri_validator( + element: ElementTree.Element, + xsd_element: xmlschema.validators.elements.Xsd11Element, +) -> Iterator[xmlschema.XMLSchemaValidationError]: + try: + url = urllib.parse.urlparse(element.text) + except ValueError: + yield xmlschema.XMLSchemaValidationError( + xsd_element, element, f"{element.text!r} is not a valid URI", + ) + else: + if not url.scheme or not url.netloc: + yield xmlschema.XMLSchemaValidationError( + xsd_element, element, f"{element.text!r} is not an absolute URI", + ) + elif " " in url.netloc: + # urllib is a little too permissive... yield xmlschema.XMLSchemaValidationError( xsd_element, element, f"{element.text!r} is not a valid URI", ) - else: - if not url.scheme or not url.netloc: - yield xmlschema.XMLSchemaValidationError( - xsd_element, element, f"{element.text!r} is not an absolute URI", - ) - elif " " in url.netloc: - # urllib is a little too permissive... - yield xmlschema.XMLSchemaValidationError( - xsd_element, element, f"{element.text!r} is not a valid URI", - ) @dataclasses.dataclass class Schemas: swh: xmlschema.XMLSchema11 codemeta: xmlschema.XMLSchema11 @functools.lru_cache(1) def schemas() -> Schemas: def load_xsd(name) -> xmlschema.XMLSchema11: return xmlschema.XMLSchema11( pkg_resources.resource_string("swh.deposit", f"xsd/{name}.xsd").decode() ) return Schemas(swh=load_xsd("swh"), codemeta=load_xsd("codemeta")) def check_metadata(metadata: ElementTree.Element) -> Tuple[bool, Optional[Dict]]: """Check metadata for mandatory field presence and date format. Args: metadata: Metadata dictionary to check Returns: tuple (status, error_detail): - (True, None) if metadata are ok and suggested fields are also present - (True, ) if metadata are ok but some suggestions are missing - (False, ) otherwise. """ suggested_fields = [] # at least one value per couple below is mandatory alternate_fields = { ("atom:name", "atom:title", "codemeta:name"): False, ("atom:author", "codemeta:author"): False, } for possible_names in alternate_fields: for possible_name in possible_names: if metadata.find(possible_name, namespaces=NAMESPACES) is not None: alternate_fields[possible_names] = True continue mandatory_result = [" or ".join(k) for k, v in alternate_fields.items() if not v] # provenance metadata is optional provenance_meta = parse_swh_metadata_provenance(metadata) if provenance_meta is None: suggested_fields = [ {"summary": SUGGESTED_FIELDS_MISSING, "fields": [METADATA_PROVENANCE_KEY]} ] if mandatory_result: detail = [{"summary": MANDATORY_FIELDS_MISSING, "fields": mandatory_result}] return False, {"metadata": detail + suggested_fields} deposit_elt = metadata.find("swh:deposit", namespaces=NAMESPACES) if deposit_elt: try: schemas().swh.validate( deposit_elt, extra_validator=cast( # ExtraValidatorType is a callable with "SchemaType" as second # argument, but extra_validator() is actually passed Xsd11Element # as second argument # https://github.com/sissaschool/xmlschema/issues/291 xmlschema.aliases.ExtraValidatorType, extra_validator, ), ) except xmlschema.exceptions.XMLSchemaException as e: return False, {"metadata": [{"fields": ["swh:deposit"], "summary": str(e)}]} detail = [] for child in metadata: for schema_element in schemas().codemeta.root_elements: if child.tag in schema_element.name: break else: # Tag is not specified in the schema, don't validate it continue try: schemas().codemeta.validate( child, extra_validator=cast( # ExtraValidatorType is a callable with "SchemaType" as second # argument, but extra_validator() is actually passed Xsd11Element # as second argument # https://github.com/sissaschool/xmlschema/issues/291 xmlschema.aliases.ExtraValidatorType, extra_validator, ), ) except xmlschema.exceptions.XMLSchemaException as e: detail.append({"fields": [schema_element.prefixed_name], "summary": str(e)}) else: # Manually validate . Unfortunately, this cannot be # validated by codemeta.xsd, because Codemeta has conflicting requirements: # 1. https://codemeta.github.io/terms/ requires it to be Text (represented # by simple content), but # 2. https://doi.org/10.5063/SCHEMA/CODEMETA-2.0 requires it to be an # Organization (represented by complex content) # And this is (legitimately) not representable in XML Schema. # # See https://github.com/codemeta/codemeta/pull/239 for a discussion about # this issue. for affiliation in child.findall( "codemeta:affiliation", namespaces=NAMESPACES ): if len(affiliation) > 0: # This is a complex element (as required by # https://codemeta.github.io/terms/), then we want to make sure # there is at least a name. if not affiliation.findtext("codemeta:name", namespaces=NAMESPACES): detail.append( { "fields": [schema_element.prefixed_name], "summary": AFFILIATION_NO_NAME, } ) break else: # This is a simple element (as required by # https://doi.org/10.5063/SCHEMA/CODEMETA-2.0) if affiliation.text is None or not affiliation.text.strip(): # Completely empty element detail.append( { "fields": [schema_element.prefixed_name], "summary": AFFILIATION_NO_NAME, } ) break if detail: return False, {"metadata": detail + suggested_fields} if suggested_fields: # it's fine but warn about missing suggested fields return True, {"metadata": suggested_fields} return True, None def check_url_match_provider(url: str, provider_url: str) -> None: """Check url matches the provider url. Raises DepositError in case of mismatch """ provider_url = provider_url.rstrip("/") + "/" if not url.startswith(provider_url): raise DepositError( FORBIDDEN, f"URL mismatch: {url} must start with {provider_url}", ) diff --git a/swh/deposit/tests/api/test_checks.py b/swh/deposit/tests/api/test_checks.py index df6d8961..e3e55704 100644 --- a/swh/deposit/tests/api/test_checks.py +++ b/swh/deposit/tests/api/test_checks.py @@ -1,1055 +1,1087 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information # disable flake8 on this file because of line length # flake8: noqa import pprint import re import textwrap from typing import Any, Dict from xml.etree import ElementTree import pytest from swh.deposit.api.checks import ( METADATA_PROVENANCE_KEY, SUGGESTED_FIELDS_MISSING, check_metadata, ) METADATA_PROVENANCE_DICT: Dict[str, Any] = { "swh:deposit": { METADATA_PROVENANCE_KEY: {"schema:url": "some-metadata-provenance-url"} } } XMLNS = """xmlns="http://www.w3.org/2005/Atom" xmlns:swh="https://www.softwareheritage.org/schema/2018/deposit" xmlns:codemeta="https://doi.org/10.5063/SCHEMA/CODEMETA-2.0" xmlns:schema="http://schema.org/" """ PROVENANCE_XML = """ some-metadata-provenance-url """ _parameters1 = [ pytest.param(textwrap.dedent(metadata_ok), id=id_) for (id_, metadata_ok,) in [ ( "atom-only-with-name", f"""\ something something-else foo someone {PROVENANCE_XML} """, ), ( "atom-only-with-title", f"""\ something something-else bar someone """, ), ( "atom-only-and-external_identifier", f"""\ something something-else foo no one {PROVENANCE_XML} """, ), ( "atom-and-codemeta-minimal", f"""\ some url bar no one {PROVENANCE_XML} """, ), ( "unknown-codemeta-inner-element-after", f"""\ some url bar someone should allow anything here {PROVENANCE_XML} """, ), ( "unknown-schema-inner-element-after", f"""\ some url bar someone should allow anything here {PROVENANCE_XML} """, ), ( "unknown-schema-inner-element-before", f"""\ some url bar should allow anything here someone {PROVENANCE_XML} """, ), ( "unknown-schema-inner-element-before-and-after", f"""\ some url bar should allow anything here someone should allow anything here {PROVENANCE_XML} """, ), + ( + "identifier-is-halid", + f"""\ + + some url + bar + + The Author + + hal-12345 + {PROVENANCE_XML} + + """, + ), + ( + "identifier-is-propertyvalue", + f"""\ + + some url + bar + + The Author + + + schema:PropertyValue + HAL-ID + hal-02527911 + + {PROVENANCE_XML} + + """, + ), ( "codemeta-dates", f"""\ some url some id nar no one 2020-12-21 2020-12-21 2020-12-25 2020-12-25 {PROVENANCE_XML} """, ), ( "codemeta-date-month", # Allowed by ISO8601, therefore by schema:Date, but not by xsd:date f"""\ some url some id nar no one 2020-12 2020-12 2020-12 {PROVENANCE_XML} """, ), ( "codemeta-date-year", # Allowed by ISO8601, therefore by schema:Date, but not by xsd:date f"""\ some url some id nar no one 2020 2020 2020 {PROVENANCE_XML} """, ), ( "codemeta-datetimes", # technically, only Date is allowed for datePublished; but we allow DateTime # for backward compatibility with old swh-deposit versions f"""\ some url some id nar no one 2020-12-21T12:00:00 2020-12-21T12:00:00 2020-12-25T12:00:00 {PROVENANCE_XML} """, ), ( "author-two-names", f"""\ some url bar someone an alias {PROVENANCE_XML} """, ), ( # Required by codemeta.jsonld, but forbidden by # https://codemeta.github.io/terms/ "element-in--affiliation", f"""\ some url bar someone My Orga {PROVENANCE_XML} """, ), ( # Forbidden by codemeta.jsonld, but required by # https://codemeta.github.io/terms/ "chardata-in-affiliation", f"""\ some url bar someone My Orga {PROVENANCE_XML} """, ), ( "swh:add_to_origin", f"""\ something something-else bar someone some-metadata-provenance-url """, ), ( "swh:reference-origin", f"""\ something something-else bar someone some-metadata-provenance-url """, ), ( "swh:reference-object", f"""\ something something-else bar someone some-metadata-provenance-url """, ), ( # a full example with every tag we know "codemeta-full", f"""\ something foo someone The Author http://example.org/~theauthor/ author@example.org University 1 https://sandbox.orcid.org/0000-0002-9227-8514 A Contributor University 2 A Maintainer University 3 University 3 A Maintainer something something else http://example.org/ Blah blah 1.0.0 1.0.0 kw1 kw2 Blah blah http://example.org/ http://example.org/ http://example.org/ {PROVENANCE_XML} """, ), ] ] @pytest.mark.parametrize( "metadata_ok", _parameters1, ) def test_api_checks_check_metadata_ok(metadata_ok, swh_checks_deposit): actual_check, detail = check_metadata(ElementTree.fromstring(metadata_ok)) assert actual_check is True, f"Unexpected result:\n{pprint.pformat(detail)}" if "swh:deposit" in metadata_ok: # no missing suggested field assert detail is None else: # missing suggested field assert detail == { "metadata": [ { "fields": [METADATA_PROVENANCE_KEY], "summary": SUGGESTED_FIELDS_MISSING, } ] } _parameters2 = [ pytest.param(textwrap.dedent(metadata_ko), expected_summary, id=id_) for (id_, metadata_ko, expected_summary) in [ ( "no-name-or-title", f"""\ something something-else someone {PROVENANCE_XML} """, { "summary": "Mandatory fields are missing", "fields": ["atom:name or atom:title or codemeta:name"], }, ), ( "no-author", f"""\ something something-else foobar {PROVENANCE_XML} """, { "summary": "Mandatory fields are missing", "fields": ["atom:author or codemeta:author"], }, ), ( "wrong-title-namespace", f"""\ something something-else bar someone {PROVENANCE_XML} """, { "summary": "Mandatory fields are missing", "fields": ["atom:name or atom:title or codemeta:name"], }, ), ( "wrong-author-namespace", f"""\ something something-else foobar foo {PROVENANCE_XML} """, { "summary": "Mandatory fields are missing", "fields": ["atom:author or codemeta:author"], }, ), ( "wrong-author-tag", f"""\ something something-else bar someone {PROVENANCE_XML} """, { "summary": "Mandatory fields are missing", "fields": ["atom:author or codemeta:author"], }, ), ] ] @pytest.mark.parametrize("metadata_ko,expected_summary", _parameters2) def test_api_checks_check_metadata_ko( metadata_ko, expected_summary, swh_checks_deposit ): actual_check, error_detail = check_metadata(ElementTree.fromstring(metadata_ko)) assert actual_check is False assert error_detail == {"metadata": [expected_summary]} _parameters3 = [ pytest.param(textwrap.dedent(metadata_ko), expected_summary, id=id_) for (id_, metadata_ko, expected_summary) in [ ( "child-element-in-name", f"""\ some url bar no one {PROVENANCE_XML} """, [ { "summary": ".*Reason: a simple content element can't have child elements.*", "fields": ["codemeta:name"], }, ], ), ( "affiliation-with-no-name", f"""\ some url bar someone http://example.org {PROVENANCE_XML} """, [ { "summary": ".*Reason: affiliation does not have a element.*", "fields": ["codemeta:author"], }, ], ), ( "empty-affiliation", f"""\ some url bar someone {PROVENANCE_XML} """, [ { "summary": ".*Reason: affiliation does not have a element.*", "fields": ["codemeta:author"], }, ], ), ( "chardata-in-author", f"""\ some url bar no one {PROVENANCE_XML} """, [ { "summary": ".*Reason: character data between child elements.*", "fields": ["codemeta:author"], }, ], ), ( "author-with-no-name", f"""\ some url bar should allow anything here {PROVENANCE_XML} """, [ { "summary": ".*Tag '?codemeta:name'? expected.*", "fields": ["codemeta:author"], }, ], ), ( "contributor-with-no-name", f"""\ some url bar should allow anything here abc {PROVENANCE_XML} """, [ { "summary": ".*Tag '?codemeta:name'? expected.*", "fields": ["codemeta:contributor"], }, ], ), ( "maintainer-with-no-name", f"""\ some url bar should allow anything here abc {PROVENANCE_XML} """, [ { "summary": ".*Tag '?codemeta:name'? expected.*", "fields": ["codemeta:maintainer"], }, ], ), ( "id-is-not-url", f"""\ some url bar The Author http://not a url/ {PROVENANCE_XML} """, [ { "summary": ".*Reason: 'http://not a url/' is not a valid URI.*", "fields": ["codemeta:author"], }, ], ), ( "identifier-is-invalid-url", f"""\ some url bar The Author http://[invalid-url/ {PROVENANCE_XML} """, [ { "summary": ( r".*Reason: 'http://\[invalid-url/' is not a valid URI.*" ), "fields": ["codemeta:author"], }, ], ), ( "identifier-is-not-url", f"""\ some url bar The Author http://not a url/ {PROVENANCE_XML} """, [ { "summary": ".*Reason: 'http://not a url/' is not a valid URI.*", "fields": ["codemeta:author"], }, ], ), ( "identifier-is-not-url2", f"""\ some url bar The Author not a url {PROVENANCE_XML} """, [ { "summary": ".*Reason: 'not a url' is not an absolute URI.*", "fields": ["codemeta:author"], }, ], ), ( "invalid-dates", f"""\ something something-else bar someone 2020-aa-21 2020-12-bb {PROVENANCE_XML} """, [ { "summary": ".*Reason: invalid value '2020-aa-21'.*", "fields": ["codemeta:datePublished"], }, { "summary": ".*Reason: invalid value '2020-12-bb'.*", "fields": ["codemeta:dateCreated"], }, ], ), ( "invalid-dateModified", f"""\ some url someid bar no one 2020-12-aa {PROVENANCE_XML} """, [ { "summary": ".*Reason: invalid value '2020-12-aa'.*", "fields": ["codemeta:dateModified"], }, ], ), ( "invalid-embargoDate", f"""\ some url someid bar no one 2022-02-28T12:00:00 {PROVENANCE_XML} """, [ { "summary": ".*Invalid datetime string '2022-02-28T12:00:00'.*", "fields": ["codemeta:embargoDate"], }, ], ), ( "error-and-missing-provenance", f"""\ some url bar no one """, [ { "summary": ".*Reason: character data between child elements.*", "fields": ["codemeta:author"], }, { "summary": "Suggested fields are missing", "fields": ["swh:metadata-provenance"], }, ], ), ( "unknown-tag-in-swh-namespace", f"""\ something something-else bar someone some-metadata-provenance-url """, [ { "summary": ( r".*Reason: Unexpected child with tag 'swh:invalid'.*" r"Instance:.*swh:invalid.*" ), "fields": ["swh:deposit"], } ], ), ( "multiple-swh:add_to_origin", f"""\ something something-else bar someone some-metadata-provenance-url """, [ { "summary": ( r".*Reason: Unexpected child with tag 'swh:add_to_origin'.*" ), "fields": ["swh:deposit"], } ], ), ( "swh:add_to_origin-and-swh:create_origin", f"""\ something something-else bar someone some-metadata-provenance-url """, [ { "summary": ( r".*Reason: assertion test if false.*" r"Schema:\n*" r' *]+ id="swhdeposit-incompatible-create-and-add".*' ), "fields": ["swh:deposit"], } ], ), ( "swh:reference-and-swh:create_origin", f"""\ something something-else bar someone some-metadata-provenance-url """, [ { "summary": ( r".*Reason: assertion test if false.*" r"Schema:\n*" r' *]+ id="swhdeposit-incompatible-create-and-reference".*' ), "fields": ["swh:deposit"], } ], ), ( "swh:add_to_origin-and-swh:reference", f"""\ something something-else bar someone some-metadata-provenance-url """, [ { "summary": ( r".*Reason: assertion test if false.*" r"Schema:\n*" r' *]+ id="swhdeposit-incompatible-add-and-reference".*' ), "fields": ["swh:deposit"], } ], ), ( "swh:reference-two-children", f"""\ something something-else bar someone some-metadata-provenance-url """, [ { "summary": r".*Reason: Unexpected child with tag 'swh:origin'.*", "fields": ["swh:deposit"], }, ], ), ( "swh:reference-two-origins", f"""\ something something-else bar someone some-metadata-provenance-url """, [ { "summary": r".*Reason: Unexpected child with tag 'swh:origin'.*", "fields": ["swh:deposit"], }, ], ), ( "swh:reference-two-objects", f"""\ something something-else bar someone some-metadata-provenance-url """, [ { "summary": r".*Reason: Unexpected child with tag 'swh:object'.*", "fields": ["swh:deposit"], }, ], ), ] ] @pytest.mark.parametrize("metadata_ko,expected_summaries", _parameters3) def test_api_checks_check_metadata_ko_schema( metadata_ko, expected_summaries, swh_checks_deposit ): actual_check, error_detail = check_metadata(ElementTree.fromstring(metadata_ko)) assert actual_check is False assert len(error_detail["metadata"]) == len(expected_summaries), error_detail[ "metadata" ] for (detail, expected_summary) in zip(error_detail["metadata"], expected_summaries): assert detail["fields"] == expected_summary["fields"] # xmlschema returns very detailed errors, we cannot reasonably test them # for equality summary = detail["summary"] assert re.match( expected_summary["summary"], summary, re.DOTALL ), f"Failed to match {expected_summary['summary']!r} with:\n{summary}" diff --git a/swh/deposit/xsd/codemeta.xsd b/swh/deposit/xsd/codemeta.xsd index 64e5187c..474e4c9a 100644 --- a/swh/deposit/xsd/codemeta.xsd +++ b/swh/deposit/xsd/codemeta.xsd @@ -1,72 +1,84 @@ + + + + + + + + + + + + -