Changeset View
Changeset View
Standalone View
Standalone View
swh/deposit/utils.py
# Copyright (C) 2018-2020 The Software Heritage developers | # Copyright (C) 2018-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import logging | |||||
from types import GeneratorType | from types import GeneratorType | ||||
from typing import Any, Dict, Tuple, Union | from typing import Any, Dict, Optional, Tuple, Union | ||||
import iso8601 | import iso8601 | ||||
import xmltodict | import xmltodict | ||||
from swh.model.identifiers import SWHID, normalize_timestamp, parse_swhid | from swh.model.exceptions import ValidationError | ||||
from swh.model.identifiers import ( | |||||
DIRECTORY, | |||||
RELEASE, | |||||
REVISION, | |||||
SNAPSHOT, | |||||
SWHID, | |||||
normalize_timestamp, | |||||
parse_swhid, | |||||
) | |||||
from swh.model.model import MetadataTargetType | from swh.model.model import MetadataTargetType | ||||
logger = logging.getLogger(__name__) | |||||
def parse_xml(stream, encoding="utf-8"): | def parse_xml(stream, encoding="utf-8"): | ||||
namespaces = { | namespaces = { | ||||
"http://www.w3.org/2005/Atom": "atom", | "http://www.w3.org/2005/Atom": "atom", | ||||
"http://www.w3.org/2007/app": "app", | "http://www.w3.org/2007/app": "app", | ||||
"http://purl.org/dc/terms/": "dc", | "http://purl.org/dc/terms/": "dc", | ||||
"https://doi.org/10.5063/SCHEMA/CODEMETA-2.0": "codemeta", | "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0": "codemeta", | ||||
"http://purl.org/net/sword/terms/": "sword", | "http://purl.org/net/sword/terms/": "sword", | ||||
▲ Show 20 Lines • Show All 112 Lines • ▼ Show 20 Lines | if isinstance(swhid_reference, SWHID): | ||||
anchor = swhid_reference.metadata.get("anchor") | anchor = swhid_reference.metadata.get("anchor") | ||||
if anchor: | if anchor: | ||||
anchor_swhid = parse_swhid(anchor) | anchor_swhid = parse_swhid(anchor) | ||||
metadata_context[anchor_swhid.object_type] = anchor_swhid | metadata_context[anchor_swhid.object_type] = anchor_swhid | ||||
else: | else: | ||||
object_type = MetadataTargetType.ORIGIN | object_type = MetadataTargetType.ORIGIN | ||||
return object_type, metadata_context | return object_type, metadata_context | ||||
# Object types accepted as the target of an "anchor" qualifier.
ALLOWED_QUALIFIERS_NODE_TYPE = (SNAPSHOT, REVISION, RELEASE, DIRECTORY)


def parse_swh_reference(metadata: Dict) -> Optional[Union[str, SWHID]]:
    """Extract the origin url or the SWHID referenced in the deposit metadata.

    The reference is looked up under the "swh:deposit" / "swh:reference"
    keys of the metadata dict, which corresponds to either:

    <swh:deposit>
      <swh:reference>
        <swh:origin url='https://github.com/user/repo'/>
      </swh:reference>
    </swh:deposit>

    or:

    <swh:deposit>
      <swh:reference>
        <swh:object swhid="swh:1:dir:31b5c8cc985d190b5a7ef4878128ebfdc2358f49;origin=https://hal.archives-ouvertes.fr/hal-01243573;visit=swh:1:snp:4fc1e36fca86b2070204bedd51106014a614f321;anchor=swh:1:rev:9c5de20cfb54682370a398fcc733e829903c8cba;path=/moranegg-AffectationRO-df7f68b/"
        />
      </swh:reference>
    </swh:deposit>

    A non-empty origin url takes precedence over an object swhid.

    Args:
        metadata: the deposit metadata, as a dict keyed by (prefixed) element
          names, attributes being keyed with a leading "@"

    Raises:
        ValidationError in case the swhid referenced (if any) is invalid, i.e.:

        - its "anchor" qualifier does not target one of
          ALLOWED_QUALIFIERS_NODE_TYPE
        - its "visit" qualifier does not target a snapshot
        - both "anchor" and "visit" qualifiers are set and the anchor targets
          a snapshot

    Returns:
        Either swhid or origin reference if any. None otherwise.

    """  # noqa
    swh_deposit = metadata.get("swh:deposit")
    if not swh_deposit:
        return None

    swh_reference = swh_deposit.get("swh:reference")
    if not swh_reference:
        return None

    swh_origin = swh_reference.get("swh:origin")
    if swh_origin:
        url = swh_origin.get("@url")
        if url:
            return url
        # An origin element without url falls through to the object lookup.

    swh_object = swh_reference.get("swh:object")
    if not swh_object:
        return None

    swhid = swh_object.get("@swhid")
    if not swhid:
        return None

    swhid_reference = parse_swhid(swhid)

    qualifiers = swhid_reference.metadata
    if not qualifiers:
        # Core SWHID without any qualifier: nothing more to validate.
        return swhid_reference

    anchor_swhid = None
    anchor = qualifiers.get("anchor")
    if anchor:
        anchor_swhid = parse_swhid(anchor)
        if anchor_swhid.object_type not in ALLOWED_QUALIFIERS_NODE_TYPE:
            error_msg = (
                "anchor qualifier should be a core SWHID with type one of "
                f" {', '.join(ALLOWED_QUALIFIERS_NODE_TYPE)}"
            )
            raise ValidationError(error_msg)

    visit_swhid = None
    visit = qualifiers.get("visit")
    if visit:
        visit_swhid = parse_swhid(visit)
        if visit_swhid.object_type != SNAPSHOT:
            raise ValidationError(
                f"visit qualifier should be a core SWHID with type {SNAPSHOT}"
            )

    # When set, visit_swhid is already known to target a snapshot (checked
    # above), so only the anchor type needs to be tested here.
    if (
        visit_swhid is not None
        and anchor_swhid is not None
        and anchor_swhid.object_type == SNAPSHOT
    ):
        logger.warning(
            "SWHID use of both anchor and visit targeting a snapshot: %s",
            swhid_reference,
        )
        raise ValidationError(
            "'anchor=swh:1:snp:' is not supported when 'visit' is also provided."
        )

    return swhid_reference