diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py
index ae4a442e..0f0e52ee 100644
--- a/swh/deposit/api/common.py
+++ b/swh/deposit/api/common.py
@@ -1,1301 +1,1300 @@
# Copyright (C) 2017-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from abc import ABCMeta, abstractmethod
import datetime
import hashlib
import json
from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union
import uuid
from xml.etree import ElementTree
import attr
from django.core.files.uploadedfile import UploadedFile
from django.http import FileResponse, HttpResponse
from django.shortcuts import render
from django.template.loader import render_to_string
from django.urls import reverse
from django.utils import timezone
from rest_framework import status
from rest_framework.authentication import BaseAuthentication, BasicAuthentication
from rest_framework.permissions import BasePermission, IsAuthenticated
from rest_framework.request import Request
from rest_framework.views import APIView
from swh.deposit.api.checks import check_metadata
from swh.deposit.api.converters import convert_status_detail
from swh.deposit.auth import HasDepositPermission, KeycloakBasicAuthentication
from swh.deposit.models import Deposit
from swh.deposit.utils import NAMESPACES, compute_metadata_context
from swh.model import hashutil
from swh.model.model import (
MetadataAuthority,
MetadataAuthorityType,
Origin,
RawExtrinsicMetadata,
)
from swh.model.swhids import (
ExtendedObjectType,
ExtendedSWHID,
QualifiedSWHID,
ValidationError,
)
from swh.scheduler.utils import create_oneshot_task_dict
from ..config import (
ARCHIVE_KEY,
ARCHIVE_TYPE,
CONT_FILE_IRI,
DEPOSIT_STATUS_DEPOSITED,
DEPOSIT_STATUS_LOAD_SUCCESS,
DEPOSIT_STATUS_PARTIAL,
EDIT_IRI,
EM_IRI,
METADATA_KEY,
METADATA_TYPE,
RAW_METADATA_KEY,
SE_IRI,
STATE_IRI,
APIConfig,
)
from ..errors import (
BAD_REQUEST,
CHECKSUM_MISMATCH,
ERROR_CONTENT,
FORBIDDEN,
MAX_UPLOAD_SIZE_EXCEEDED,
MEDIATION_NOT_ALLOWED,
METHOD_NOT_ALLOWED,
NOT_FOUND,
PARSING_ERROR,
DepositError,
ParserError,
)
from ..models import DepositClient, DepositCollection, DepositRequest
from ..parsers import parse_xml
-from ..utils import extended_swhid_from_qualified, parse_swh_reference
+from ..utils import (
+ extended_swhid_from_qualified,
+ parse_swh_deposit_origin,
+ parse_swh_reference,
+)
ACCEPT_PACKAGINGS = ["http://purl.org/net/sword/package/SimpleZip"]
ACCEPT_ARCHIVE_CONTENT_TYPES = ["application/zip", "application/x-tar"]
@attr.s
class ParsedRequestHeaders:
    """HTTP headers of a SWORD request, extracted and normalized by
    `APIBase._read_headers` (attribute names map 1:1 to the headers read
    there; most are optional and default to None when absent)."""

    content_type = attr.ib(type=str)
    # parsed from CONTENT_LENGTH, normalized to int
    content_length = attr.ib(type=Optional[int])
    # parsed from In-Progress; False (final deposit) when not provided
    in_progress = attr.ib(type=bool)
    content_disposition = attr.ib(type=Optional[str])
    # parsed from Content-MD5, hex-decoded to the raw digest bytes
    content_md5sum = attr.ib(type=Optional[bytes])
    packaging = attr.ib(type=Optional[str])
    slug = attr.ib(type=Optional[str])
    on_behalf_of = attr.ib(type=Optional[str])
    metadata_relevant = attr.ib(type=Optional[str])
    # parsed from X-Check-SWHID
    swhid = attr.ib(type=Optional[str])
@attr.s
class Receipt:
    """Data computed while handling the request body that will be served in the
    Deposit Receipt."""

    # id of the deposit the request was applied to
    deposit_id = attr.ib(type=int)
    # date associated with the deposit (reception or completion, depending
    # on the endpoint that built the receipt)
    deposit_date = attr.ib(type=datetime.datetime)
    # deposit status after the request was processed
    status = attr.ib(type=str)
    # name of the uploaded archive, when the request carried one
    archive = attr.ib(type=Optional[str])
def _compute_md5(filehandler: UploadedFile) -> bytes:
    """Return the MD5 digest of the uploaded file's content, consuming it
    piece by piece rather than loading it whole."""
    digest = hashlib.md5()
    for piece in filehandler:
        digest.update(piece)  # type: ignore
    return digest.digest()
def get_deposit_by_id(
    deposit_id: int, collection_name: Optional[str] = None
) -> Deposit:
    """Fetch the deposit with the given id, or raise `DepositError` (NOT_FOUND).

    When ``collection_name`` is provided, additionally checks that the deposit
    belongs to that collection."""
    try:
        deposit = Deposit.objects.get(pk=deposit_id)
    except Deposit.DoesNotExist:
        raise DepositError(NOT_FOUND, f"Deposit {deposit_id} does not exist")

    if collection_name and collection_name != deposit.collection.name:
        # Raises NOT_FOUND first if the collection itself does not exist
        get_collection_by_name(collection_name)
        raise DepositError(
            NOT_FOUND,
            f"Deposit {deposit_id} does not belong to collection {collection_name}",
        )

    return deposit
def get_collection_by_name(collection_name: str):
    """Fetch the `DepositCollection` with the given name, or raise
    `DepositError` (NOT_FOUND)."""
    try:
        # objects.get either returns a collection or raises; never None
        return DepositCollection.objects.get(name=collection_name)
    except DepositCollection.DoesNotExist:
        raise DepositError(NOT_FOUND, f"Unknown collection name {collection_name}")
def guess_deposit_origin_url(deposit: Deposit):
    """Build a fallback origin URL for the deposit, from the client's provider
    URL and the deposit's external id.

    When the client provided neither an origin_url nor a slug, a random UUID
    is used as the path component (SWORD requires supporting this case)."""
    slug = deposit.external_id
    if not slug:
        slug = str(uuid.uuid4())
    base = deposit.client.provider_url.rstrip("/")
    return f"{base}/{slug}"
def check_client_origin(client: DepositClient, origin_url: str):
    """Raise a FORBIDDEN `DepositError` unless ``origin_url`` is located under
    the client's provider URL."""
    provider_url = client.provider_url.rstrip("/") + "/"
    if origin_url.startswith(provider_url):
        return
    raise DepositError(
        FORBIDDEN,
        f"Cannot create origin {origin_url}, it must start with {provider_url}",
    )
class APIBase(APIConfig, APIView, metaclass=ABCMeta):
"""Base deposit request class sharing multiple common behaviors.
"""
_client: Optional[DepositClient] = None
def __init__(self):
super().__init__()
auth_provider = self.config.get("authentication_provider")
if auth_provider == "basic":
self.authentication_classes: Sequence[Type[BaseAuthentication]] = (
BasicAuthentication,
)
self.permission_classes: Sequence[Type[BasePermission]] = (IsAuthenticated,)
elif auth_provider == "keycloak":
self.authentication_classes: Sequence[Type[BaseAuthentication]] = (
KeycloakBasicAuthentication,
)
self.permission_classes: Sequence[Type[BasePermission]] = (
IsAuthenticated,
HasDepositPermission,
)
else:
raise ValueError(
"Configuration key 'authentication_provider' should be provided with"
f"either 'basic' or 'keycloak' value not {auth_provider!r}."
)
    def _read_headers(self, request: Request) -> ParsedRequestHeaders:
        """Read and unify the necessary headers from the request (those are
        not stored in the same location or not properly formatted).

        Args:
            request: Input request

        Returns:
            A `ParsedRequestHeaders` (some attributes may be None):
            content-type, content-length, in-progress, content-disposition,
            content-md5sum, packaging, slug, on-behalf-of, metadata-relevant,
            swhid

        """
        meta = request._request.META

        # CONTENT_LENGTH may arrive as a string; normalize to int
        content_length = meta.get("CONTENT_LENGTH")
        if content_length and isinstance(content_length, str):
            content_length = int(content_length)

        # final deposit if not provided
        in_progress = meta.get("HTTP_IN_PROGRESS", False)
        if isinstance(in_progress, str):
            in_progress = in_progress.lower() == "true"

        # Content-MD5 is hex-encoded; decode to the raw digest bytes
        content_md5sum = meta.get("HTTP_CONTENT_MD5")
        if content_md5sum:
            content_md5sum = bytes.fromhex(content_md5sum)

        return ParsedRequestHeaders(
            content_type=request.content_type,
            content_length=content_length,
            in_progress=in_progress,
            content_disposition=meta.get("HTTP_CONTENT_DISPOSITION"),
            content_md5sum=content_md5sum,
            packaging=meta.get("HTTP_PACKAGING"),
            slug=meta.get("HTTP_SLUG"),
            on_behalf_of=meta.get("HTTP_ON_BEHALF_OF"),
            metadata_relevant=meta.get("HTTP_METADATA_RELEVANT"),
            swhid=meta.get("HTTP_X_CHECK_SWHID"),
        )
def _deposit_put(self, deposit: Deposit, in_progress: bool = False) -> None:
"""Save/Update a deposit in db.
Args:
deposit: deposit being updated/created
in_progress: deposit status
"""
if in_progress is False:
self._complete_deposit(deposit)
else:
deposit.status = DEPOSIT_STATUS_PARTIAL
deposit.save()
def _complete_deposit(self, deposit: Deposit) -> None:
"""Marks the deposit as 'deposited', then schedule a check task if configured
to do so."""
deposit.complete_date = timezone.now()
deposit.status = DEPOSIT_STATUS_DEPOSITED
deposit.save()
if not deposit.origin_url:
deposit.origin_url = guess_deposit_origin_url(deposit)
if self.config["checks"]:
scheduler = self.scheduler
if deposit.status == DEPOSIT_STATUS_DEPOSITED and not deposit.check_task_id:
task = create_oneshot_task_dict(
"check-deposit",
collection=deposit.collection.name,
deposit_id=deposit.id,
retries_left=3,
)
check_task_id = scheduler.create_tasks([task])[0]["id"]
deposit.check_task_id = check_task_id
deposit.save()
    def _deposit_request_put(
        self,
        deposit: Deposit,
        deposit_request_data: Dict[str, Any],
        replace_metadata: bool = False,
        replace_archives: bool = False,
    ) -> DepositRequest:
        """Save a deposit request with metadata attached to a deposit.

        Args:
            deposit: The deposit concerned by the request
            deposit_request_data: The dictionary with at most 2 deposit
                request types (archive, metadata) to associate to the deposit
            replace_metadata: Flag defining if we add or update
                existing metadata to the deposit
            replace_archives: Flag defining if we add or update
                archives to existing deposit

        Returns:
            the DepositRequest object stored in the backend

        """
        # Deletions happen first so the new requests below replace, rather
        # than accumulate on top of, the previous ones.
        if replace_metadata:
            DepositRequest.objects.filter(deposit=deposit, type=METADATA_TYPE).delete()

        if replace_archives:
            DepositRequest.objects.filter(deposit=deposit, type=ARCHIVE_TYPE).delete()

        deposit_request = None

        archive_file = deposit_request_data.get(ARCHIVE_KEY)
        if archive_file:
            deposit_request = DepositRequest(
                type=ARCHIVE_TYPE, deposit=deposit, archive=archive_file
            )
            deposit_request.save()

        metadata = deposit_request_data.get(METADATA_KEY)
        if metadata:
            # TODO: remove non-raw metadata? we don't use these anymore except in
            # manual queries to the deposit DB
            raw_metadata = deposit_request_data[RAW_METADATA_KEY]
            deposit_request = DepositRequest(
                type=METADATA_TYPE,
                deposit=deposit,
                metadata=metadata,
                raw_metadata=raw_metadata.decode("utf-8"),
            )
            deposit_request.save()

        # Callers always supply an archive and/or metadata, so at least one
        # request was created above.
        assert deposit_request is not None
        return deposit_request
def _delete_archives(self, collection_name: str, deposit: Deposit) -> Dict:
"""Delete archive references from the deposit id.
"""
DepositRequest.objects.filter(deposit=deposit, type=ARCHIVE_TYPE).delete()
return {}
def _delete_deposit(self, collection_name: str, deposit: Deposit) -> Dict:
"""Delete deposit reference.
Args:
collection_name: Client's collection
deposit: The deposit to delete
Returns
Empty dict when ok.
Dict with error key to describe the failure.
"""
if deposit.collection.name != collection_name:
summary = "Cannot delete a deposit from another collection"
description = "Deposit %s does not belong to the collection %s" % (
deposit.id,
collection_name,
)
raise DepositError(
BAD_REQUEST, summary=summary, verbose_description=description
)
DepositRequest.objects.filter(deposit=deposit).delete()
deposit.delete()
return {}
def _check_file_length(
self, filehandler: UploadedFile, content_length: Optional[int] = None,
) -> None:
"""Check the filehandler passed as argument has exactly the
expected content_length
Args:
filehandler: The file to check
content_length: the expected length if provided.
Raises:
DepositError if the actual length does not match
"""
max_upload_size = self.config["max_upload_size"]
if content_length:
length = filehandler.size
if length != content_length:
raise DepositError(status.HTTP_412_PRECONDITION_FAILED, "Wrong length")
if filehandler.size > max_upload_size:
raise DepositError(
MAX_UPLOAD_SIZE_EXCEEDED,
f"Upload size limit exceeded (max {max_upload_size} bytes)."
"Please consider sending the archive in multiple steps.",
)
def _check_file_md5sum(
self, filehandler: UploadedFile, md5sum: Optional[bytes],
) -> None:
"""Check the filehandler passed as argument has the expected md5sum
Args:
filehandler: The file to check
md5sum: md5 hash expected from the file's content
Raises:
DepositError if the md5sum does not match
"""
if md5sum:
_md5sum = _compute_md5(filehandler)
if _md5sum != md5sum:
raise DepositError(
CHECKSUM_MISMATCH,
"Wrong md5 hash",
f"The checksum sent {hashutil.hash_to_hex(md5sum)} and the actual "
f"checksum {hashutil.hash_to_hex(_md5sum)} does not match.",
)
    def _binary_upload(
        self,
        request: Request,
        headers: ParsedRequestHeaders,
        collection_name: str,
        deposit: Deposit,
        replace_metadata: bool = False,
        replace_archives: bool = False,
    ) -> Receipt:
        """Binary upload routine.

        Other than such a request, a 415 response is returned.

        Args:
            request: the request holding information to parse
                and inject in db
            headers: parsed request headers
            collection_name: the associated client
            deposit: deposit to be updated
            replace_metadata: 'Update or add' request to existing
                deposit. If False (default), this adds new metadata request to
                existing ones. Otherwise, this will replace existing metadata.
            replace_archives: 'Update or add' request to existing
                deposit. If False (default), this adds new archive request to
                existing ones. Otherwise, this will replace existing archives.

        Raises:
            - 400 (bad request) if the request is not providing an external
              identifier
            - 413 (request entity too large) if the length of the
              archive exceeds the max size configured
            - 412 (precondition failed) if the length or md5 hash provided
              mismatch the reality of the archive
            - 415 (unsupported media type) if a wrong media type is provided

        """
        content_length = headers.content_length
        if not content_length:
            raise DepositError(
                BAD_REQUEST,
                "CONTENT_LENGTH header is mandatory",
                "For archive deposit, the CONTENT_LENGTH header must be sent.",
            )

        content_disposition = headers.content_disposition
        if not content_disposition:
            raise DepositError(
                BAD_REQUEST,
                "CONTENT_DISPOSITION header is mandatory",
                "For archive deposit, the CONTENT_DISPOSITION header must be sent.",
            )

        packaging = headers.packaging
        if packaging and packaging not in ACCEPT_PACKAGINGS:
            raise DepositError(
                BAD_REQUEST,
                f"Only packaging {ACCEPT_PACKAGINGS} is supported",
                f"The packaging provided {packaging} is not supported",
            )

        filehandler = request.FILES["file"]
        assert isinstance(filehandler, UploadedFile), filehandler

        # size/checksum validation happens before touching the database
        self._check_file_length(filehandler, content_length)
        self._check_file_md5sum(filehandler, headers.content_md5sum)

        # actual storage of data
        archive_metadata = filehandler
        self._deposit_put(
            deposit=deposit, in_progress=headers.in_progress,
        )
        self._deposit_request_put(
            deposit,
            {ARCHIVE_KEY: archive_metadata},
            replace_metadata=replace_metadata,
            replace_archives=replace_archives,
        )

        return Receipt(
            deposit_id=deposit.id,
            deposit_date=deposit.reception_date,
            status=deposit.status,
            archive=filehandler.name,
        )
def _read_metadata(
self, metadata_stream
) -> Tuple[bytes, Dict[str, Any], ElementTree.Element]:
"""
Given a metadata stream, reads the metadata and returns the metadata in three
forms:
* verbatim (as raw bytes), for archival in long-term storage
* parsed as a Python dict, for archival in postgresql's jsonb type
* parsed as ElementTree, to extract information immediately
"""
raw_metadata = metadata_stream.read()
metadata_dict = parse_xml(raw_metadata)
metadata_tree = ElementTree.fromstring(raw_metadata)
# TODO: remove metadata_dict? we don't use it anymore, except in manual
# queries to the deposit DB
return raw_metadata, metadata_dict, metadata_tree
    def _multipart_upload(
        self,
        request: Request,
        headers: ParsedRequestHeaders,
        collection_name: str,
        deposit: Deposit,
        replace_metadata: bool = False,
        replace_archives: bool = False,
    ) -> Receipt:
        """Multipart upload supported with exactly:

        - 1 archive (zip or x-tar)
        - 1 atom entry

        Other than such a request, a 415 response is returned.

        Args:
            request: the request holding information to parse
                and inject in db
            headers: parsed request headers
            collection_name: the associated client
            deposit: deposit to be updated
            replace_metadata: 'Update or add' request to existing
                deposit. If False (default), this adds new metadata request to
                existing ones. Otherwise, this will replace existing metadata.
            replace_archives: 'Update or add' request to existing
                deposit. If False (default), this adds new archive request to
                existing ones. Otherwise, this will replace existing archives.

        Raises:
            - 400 (bad request) if the request is not providing an external
              identifier
            - 412 (precondition failed) if the potentially md5 hash provided
              mismatch the reality of the archive
            - 413 (request entity too large) if the length of the
              archive exceeds the max size configured
            - 415 (unsupported media type) if a wrong media type is provided

        """
        content_types_present = set()

        # one slot per accepted content type; filled from the request parts
        data: Dict[str, Optional[Any]] = {
            "application/zip": None,  # expected either zip
            "application/x-tar": None,  # or x-tar
            "application/atom+xml": None,
        }
        for key, value in request.FILES.items():
            fh = value
            content_type = fh.content_type
            # duplicate content types are rejected per the SWORD 2.0 spec
            if content_type in content_types_present:
                raise DepositError(
                    ERROR_CONTENT,
                    "Only 1 application/zip (or application/x-tar) archive "
                    "and 1 atom+xml entry is supported (as per sword2.0 "
                    "specification)",
                    "You provided more than 1 application/(zip|x-tar) "
                    "or more than 1 application/atom+xml content-disposition "
                    "header in the multipart deposit",
                )
            content_types_present.add(content_type)
            assert content_type is not None
            data[content_type] = fh

        # exactly one archive part plus one atom entry part
        if len(content_types_present) != 2:
            raise DepositError(
                ERROR_CONTENT,
                "You must provide both 1 application/zip (or "
                "application/x-tar) and 1 atom+xml entry for multipart "
                "deposit",
                "You need to provide only 1 application/(zip|x-tar) "
                "and 1 application/atom+xml content-disposition header "
                "in the multipart deposit",
            )

        filehandler = data["application/zip"]
        if not filehandler:
            filehandler = data["application/x-tar"]
        assert isinstance(filehandler, UploadedFile), filehandler

        self._check_file_length(filehandler)
        self._check_file_md5sum(filehandler, headers.content_md5sum)

        try:
            raw_metadata, metadata_dict, metadata_tree = self._read_metadata(
                data["application/atom+xml"]
            )
        except ParserError:
            raise DepositError(
                PARSING_ERROR,
                "Malformed xml metadata",
                "The xml received is malformed. "
                "Please ensure your metadata file is correctly formatted.",
            )

        self._set_deposit_origin_from_metadata(deposit, metadata_tree, headers)

        # actual storage of data
        self._deposit_put(
            deposit=deposit, in_progress=headers.in_progress,
        )
        deposit_request_data = {
            ARCHIVE_KEY: filehandler,
            METADATA_KEY: metadata_dict,
            RAW_METADATA_KEY: raw_metadata,
        }
        self._deposit_request_put(
            deposit, deposit_request_data, replace_metadata, replace_archives
        )

        assert filehandler is not None
        return Receipt(
            deposit_id=deposit.id,
            deposit_date=deposit.reception_date,
            archive=filehandler.name,
            status=deposit.status,
        )
    def _store_metadata_deposit(
        self,
        deposit: Deposit,
        swhid_reference: Union[str, QualifiedSWHID],
        metadata_dict: Dict,
        metadata_tree: ElementTree.Element,
        raw_metadata: bytes,
        deposit_origin: Optional[str] = None,
    ) -> Tuple[ExtendedSWHID, Deposit, DepositRequest]:
        """When all user inputs pass the checks, this associates the raw_metadata to the
        swhid_reference in the raw extrinsic metadata storage. In case of any issues,
        a bad request response is returned to the user with the details.

        Checks:

        - metadata are technically parsable
        - metadata pass the functional checks
        - SWHID (if any) is technically valid

        Args:
            deposit: Deposit reference
            swhid_reference: The swhid or the origin to attach metadata information to
            metadata_dict: Full dict of metadata for storage in the deposit DB as jsonb
                (parsed out of raw_metadata)
            metadata_tree: Full element tree of metadata to check for validity
                (parsed out of raw_metadata)
            raw_metadata: The actual raw metadata to send in the storage metadata
            deposit_origin: Optional deposit origin url to use if any (e.g. deposit
                update scenario provides one)

        Raises:
            DepositError in case of incorrect inputs from the deposit client
            (e.g. functionally invalid metadata, ...)

        Returns:
            Tuple of target swhid, deposit, and deposit request

        """
        metadata_ok, error_details = check_metadata(metadata_tree)
        if not metadata_ok:
            assert error_details, "Details should be set when a failure occurs"
            raise DepositError(
                BAD_REQUEST,
                "Functional metadata checks failure",
                convert_status_detail(error_details),
            )

        metadata_authority = MetadataAuthority(
            type=MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit.client.provider_url,
        )

        metadata_fetcher = self.swh_deposit_fetcher()

        # replace metadata within the deposit backend
        deposit_request_data = {
            METADATA_KEY: metadata_dict,
            RAW_METADATA_KEY: raw_metadata,
        }

        # actually add the metadata to the completed deposit
        deposit_request = self._deposit_request_put(deposit, deposit_request_data)

        target_swhid: ExtendedSWHID  # origin URL or CoreSWHID
        if isinstance(swhid_reference, str):
            # a plain string reference is an origin URL
            target_swhid = Origin(swhid_reference).swhid()
            metadata_context = {}
        else:
            metadata_context = compute_metadata_context(swhid_reference)
            if deposit_origin:  # metadata deposit update on completed deposit
                metadata_context["origin"] = deposit_origin

            target_swhid = extended_swhid_from_qualified(swhid_reference)

        # refuse to attach metadata to an object the archive does not know
        self._check_swhid_in_archive(target_swhid)

        # metadata deposited by the client
        metadata_object = RawExtrinsicMetadata(
            target=target_swhid,  # core swhid or origin
            discovery_date=deposit_request.date,
            authority=metadata_authority,
            fetcher=metadata_fetcher,
            format="sword-v2-atom-codemeta",
            metadata=raw_metadata,
            **metadata_context,
        )

        # metadata on the metadata object
        swh_deposit_authority = self.swh_deposit_authority()
        swh_deposit_fetcher = self.swh_deposit_fetcher()
        metametadata_object = RawExtrinsicMetadata(
            target=metadata_object.swhid(),
            discovery_date=deposit_request.date,
            authority=swh_deposit_authority,
            fetcher=swh_deposit_fetcher,
            format="xml-deposit-info",
            metadata=render_to_string(
                "deposit/deposit_info.xml", context={"deposit": deposit}
            ).encode(),
        )

        # write to metadata storage; authorities and fetchers must exist
        # before the raw extrinsic metadata referencing them is added
        self.storage_metadata.metadata_authority_add(
            [metadata_authority, swh_deposit_authority]
        )
        self.storage_metadata.metadata_fetcher_add(
            [metadata_fetcher, swh_deposit_fetcher]
        )
        self.storage_metadata.raw_extrinsic_metadata_add(
            [metadata_object, metametadata_object]
        )

        return (target_swhid, deposit, deposit_request)
    def _check_swhid_in_archive(self, target_swhid: ExtendedSWHID) -> None:
        """Check the target object already exists in the archive,
        and raises a BAD_REQUEST if it does not.

        Dispatches on the object type: contents, dir/rev/rel/snp, and origins
        each use a different storage lookup API.
        """
        if target_swhid.object_type in (ExtendedObjectType.CONTENT,):
            if list(
                self.storage.content_missing_per_sha1_git([target_swhid.object_id])
            ):
                raise DepositError(
                    BAD_REQUEST,
                    f"Cannot load metadata on {target_swhid}, this content "
                    f"object does not exist in the archive (yet?).",
                )
        elif target_swhid.object_type in (
            ExtendedObjectType.DIRECTORY,
            ExtendedObjectType.REVISION,
            ExtendedObjectType.RELEASE,
            ExtendedObjectType.SNAPSHOT,
        ):
            # storage exposes <type>_missing methods named after the object type
            target_type_name = target_swhid.object_type.name.lower()
            method = getattr(self.storage, target_type_name + "_missing")
            if list(method([target_swhid.object_id])):
                raise DepositError(
                    BAD_REQUEST,
                    f"Cannot load metadata on {target_swhid}, this {target_type_name} "
                    f"object does not exist in the archive (yet?).",
                )
        elif target_swhid.object_type in (ExtendedObjectType.ORIGIN,):
            if None in list(self.storage.origin_get_by_sha1([target_swhid.object_id])):
                raise DepositError(
                    BAD_REQUEST,
                    "Cannot load metadata on origin, it is not (yet?) known to the "
                    "archive.",
                )
        else:
            # This should not happen, because target_swhid is generated from either
            # a core swhid or an origin URL.
            # Let's just check it again so the "switch" is exhaustive.
            raise ValueError(
                f"_check_swhid_in_archive expected core SWHID or origin SWHID, "
                f"but got {target_swhid}."
            )
def _atom_entry(
self,
request: Request,
headers: ParsedRequestHeaders,
collection_name: str,
deposit: Deposit,
replace_metadata: bool = False,
replace_archives: bool = False,
) -> Receipt:
"""Atom entry deposit.
Args:
request: the request holding information to parse
and inject in db
headers: parsed request headers
collection_name: the associated client
deposit: deposit to be updated
replace_metadata: 'Update or add' request to existing
deposit. If False (default), this adds new metadata request to
existing ones. Otherwise, this will replace existing metadata.
replace_archives: 'Update or add' request to existing
deposit. If False (default), this adds new archive request to
existing ones. Otherwise, this will replace existing archives.
ones.
Raises:
- 400 (bad request) if the request is not providing an external
identifier
- 400 (bad request) if the request's body is empty
- 415 (unsupported media type) if a wrong media type is provided
"""
metadata_stream = request.data
empty_atom_entry_summary = "Empty body request is not supported."
empty_atom_entry_desc = (
"Atom entry request is about non-empty metadata deposit."
)
if not metadata_stream:
raise DepositError(
BAD_REQUEST, empty_atom_entry_summary, empty_atom_entry_desc
)
try:
raw_metadata, metadata_dict, metadata_tree = self._read_metadata(
metadata_stream
)
except ParserError:
raise DepositError(
BAD_REQUEST,
"Malformed xml metadata",
"The xml received is malformed. "
"Please ensure your metadata file is correctly formatted.",
)
if metadata_dict is None:
raise DepositError(
BAD_REQUEST, empty_atom_entry_summary, empty_atom_entry_desc
)
self._set_deposit_origin_from_metadata(deposit, metadata_tree, headers)
# Determine if we are in the metadata-only deposit case
try:
swhid_ref = parse_swh_reference(metadata_tree)
except ValidationError as e:
raise DepositError(
PARSING_ERROR, "Invalid SWHID reference", str(e),
)
if swhid_ref is not None and (
deposit.origin_url or deposit.parent or deposit.external_id
):
raise DepositError(
BAD_REQUEST,
" is for metadata-only deposits and "
" / / Slug are for "
"code deposits, only one may be used on a given deposit.",
)
if swhid_ref is not None:
deposit.save() # We need a deposit id
target_swhid, depo, depo_request = self._store_metadata_deposit(
deposit, swhid_ref, metadata_dict, metadata_tree, raw_metadata
)
deposit.status = DEPOSIT_STATUS_LOAD_SUCCESS
if isinstance(swhid_ref, QualifiedSWHID):
deposit.swhid = str(extended_swhid_from_qualified(swhid_ref))
deposit.swhid_context = str(swhid_ref)
deposit.complete_date = depo_request.date
deposit.reception_date = depo_request.date
deposit.save()
return Receipt(
deposit_id=deposit.id,
deposit_date=depo_request.date,
status=deposit.status,
archive=None,
)
self._deposit_put(
deposit=deposit, in_progress=headers.in_progress,
)
self._deposit_request_put(
deposit,
{METADATA_KEY: metadata_dict, RAW_METADATA_KEY: raw_metadata},
replace_metadata,
replace_archives,
)
return Receipt(
deposit_id=deposit.id,
deposit_date=deposit.reception_date,
status=deposit.status,
archive=None,
)
def _set_deposit_origin_from_metadata(self, deposit, metadata, headers):
- create_origin = metadata.find(
- "swh:deposit/swh:create_origin/swh:origin", namespaces=NAMESPACES
- )
- add_to_origin = metadata.find(
- "swh:deposit/swh:add_to_origin/swh:origin", namespaces=NAMESPACES
- )
+ (create_origin, add_to_origin) = parse_swh_deposit_origin(metadata)
- if create_origin is not None and add_to_origin is not None:
+ if create_origin and add_to_origin:
raise DepositError(
BAD_REQUEST,
" and are mutually exclusive, "
"as they respectively create a new origin and add to an existing "
"origin.",
)
- if create_origin is not None:
- origin_url = create_origin.attrib["url"]
+ if create_origin:
+ origin_url = create_origin
check_client_origin(deposit.client, origin_url)
deposit.origin_url = origin_url
- if add_to_origin is not None:
- origin_url = add_to_origin.attrib["url"]
+ if add_to_origin:
+ origin_url = add_to_origin
check_client_origin(deposit.client, origin_url)
deposit.parent = (
Deposit.objects.filter(
client=deposit.client,
origin_url=origin_url,
status=DEPOSIT_STATUS_LOAD_SUCCESS,
)
.order_by("-id")[0:1]
.get()
)
deposit.origin_url = origin_url
external_identifier_element = metadata.find(
"atom:external_identifier", namespaces=NAMESPACES
)
if external_identifier_element is not None:
# Deprecated tag.
# When clients stopped using it, this should raise an error
# unconditionally
if deposit.origin_url:
raise DepositError(
BAD_REQUEST,
" is deprecated, you should only use "
" and from now on.",
)
if headers.slug and external_identifier_element.text != headers.slug:
raise DepositError(
BAD_REQUEST,
"The tag and Slug header are deprecated, "
" or "
"should be used instead.",
)
def _empty_post(
self,
request: Request,
headers: ParsedRequestHeaders,
collection_name: str,
deposit: Deposit,
) -> Receipt:
"""Empty post to finalize a deposit.
Args:
request: the request holding information to parse
and inject in db
headers: parsed request headers
collection_name: the associated client
deposit: deposit to be finalized
"""
self._complete_deposit(deposit)
assert deposit.complete_date is not None
return Receipt(
deposit_id=deposit.id,
deposit_date=deposit.complete_date,
status=deposit.status,
archive=None,
)
    def additional_checks(
        self,
        request: Request,
        headers: ParsedRequestHeaders,
        collection_name: str,
        deposit: Optional[Deposit],
    ) -> Dict[str, Any]:
        """Permit the child class to enrich additional checks.

        The base implementation performs no extra check.

        Returns:
            dict with 'error' detailing the problem.
        """
        return {}
def get_client(self, request) -> DepositClient:
# This class depends on AuthenticatedAPIView, so request.user.username
# is always set
username = request.user.username
assert username is not None
if self._client is None:
try:
self._client = DepositClient.objects.get( # type: ignore
username=username
)
except DepositClient.DoesNotExist:
raise DepositError(NOT_FOUND, f"Unknown client name {username}")
assert self._client.username == username
return self._client
    def checks(
        self, request: Request, collection_name: str, deposit: Optional[Deposit] = None
    ) -> ParsedRequestHeaders:
        """Run the checks common to all endpoints: collection existence,
        client access rights, header parsing, deposit-status restrictions
        and mediation refusal.

        Returns:
            the parsed request headers

        Raises:
            DepositError: on any failed check
        """
        if deposit is None:
            collection = get_collection_by_name(collection_name)
        else:
            assert collection_name == deposit.collection.name
            collection = deposit.collection

        client = self.get_client(request)
        collection_id = collection.id
        collections = client.collections
        assert collections is not None
        if collection_id not in collections:
            raise DepositError(
                FORBIDDEN,
                f"Client {client.username} cannot access collection {collection_name}",
            )

        headers = self._read_headers(request)

        if deposit is not None:
            self.restrict_access(request, headers, deposit)

        # SWORD mediation (On-Behalf-Of) is not supported
        if headers.on_behalf_of:
            raise DepositError(MEDIATION_NOT_ALLOWED, "Mediation is not supported.")

        self.additional_checks(request, headers, collection_name, deposit)

        return headers
def restrict_access(
self, request: Request, headers: ParsedRequestHeaders, deposit: Deposit
) -> None:
"""Allow modifications on deposit with status 'partial' only, reject the rest.
"""
if request.method != "GET" and deposit.status != DEPOSIT_STATUS_PARTIAL:
summary = "You can only act on deposit with status '%s'" % (
DEPOSIT_STATUS_PARTIAL,
)
description = f"This deposit has status '{deposit.status}'"
raise DepositError(
BAD_REQUEST, summary=summary, verbose_description=description
)
    def _basic_not_allowed_method(self, request: Request, method: str):
        """Reject an HTTP method unsupported by this endpoint (405)."""
        raise DepositError(
            METHOD_NOT_ALLOWED, f"{method} method is not supported on this endpoint",
        )
    # Default HTTP verb handlers: every verb answers 405 unless a mixin
    # (APIGet, APIPost, ...) overrides it.

    def get(
        self, request: Request, *args, **kwargs
    ) -> Union[HttpResponse, FileResponse]:
        return self._basic_not_allowed_method(request, "GET")

    def post(self, request: Request, *args, **kwargs) -> HttpResponse:
        return self._basic_not_allowed_method(request, "POST")

    def put(self, request: Request, *args, **kwargs) -> HttpResponse:
        return self._basic_not_allowed_method(request, "PUT")

    def delete(self, request: Request, *args, **kwargs) -> HttpResponse:
        return self._basic_not_allowed_method(request, "DELETE")
class APIGet(APIBase, metaclass=ABCMeta):
    """Mixin adding GET support to deposit endpoints.
    """

    def get(  # type: ignore
        self, request: Request, collection_name: str, deposit_id: int
    ) -> Union[HttpResponse, FileResponse]:
        """Endpoint to create/add resources to deposit.

        Returns:
            200 response when no error during routine occurred
            400 if the deposit does not belong to the collection
            404 if the deposit or the collection does not exist
        """
        deposit = get_deposit_by_id(deposit_id, collection_name)
        self.checks(request, collection_name, deposit)

        status, content, content_type = self.process_get(
            request, collection_name, deposit
        )

        if content_type == "swh/generator":
            # content is used as a context manager yielding a filesystem path
            with content as path:
                return FileResponse(
                    open(path, "rb"), status=status, content_type="application/tar"
                )
        if content_type == "application/json":
            return HttpResponse(
                json.dumps(content), status=status, content_type=content_type
            )
        return HttpResponse(content, status=status, content_type=content_type)

    @abstractmethod
    def process_get(
        self, request: Request, collection_name: str, deposit: Deposit
    ) -> Tuple[int, Any, str]:
        """Routine to deal with the deposit's get processing.

        Returns:
            Tuple status, stream of content, content-type

        """
        pass
class APIPost(APIBase, metaclass=ABCMeta):
    """Mixin for class to support POST method.
    """

    def post(  # type: ignore
        self, request: Request, collection_name: str, deposit_id: Optional[int] = None
    ) -> HttpResponse:
        """Endpoint to create/add resources to deposit.

        Returns:
            204 response when no error during routine occurred.
            400 if the deposit does not belong to the collection
            404 if the deposit or the collection does not exist

        """
        # No deposit_id means this request creates a brand new deposit
        deposit = (
            None
            if deposit_id is None
            else get_deposit_by_id(deposit_id, collection_name)
        )

        headers = self.checks(request, collection_name, deposit)

        status_code, iri_key, receipt = self.process_post(
            request, headers, collection_name, deposit
        )

        return self._make_deposit_receipt(
            request, collection_name, status_code, iri_key, receipt,
        )

    def _make_deposit_receipt(
        self,
        request,
        collection_name: str,
        status: int,
        iri_key: str,
        receipt: Receipt,
    ) -> HttpResponse:
        """Returns an HttpResponse with a SWORD Deposit receipt as content."""

        # Build every IRI advertised in the receipt
        reverse_args = [collection_name, receipt.deposit_id]
        iris = {}
        for iri in (EM_IRI, EDIT_IRI, CONT_FILE_IRI, SE_IRI, STATE_IRI):
            iris[iri] = request.build_absolute_uri(reverse(iri, args=reverse_args))

        context = dict(attr.asdict(receipt))
        context.update(iris)
        context["packagings"] = ACCEPT_PACKAGINGS

        response = render(
            request,
            "deposit/deposit_receipt.xml",
            context=context,
            content_type="application/xml",
            status=status,
        )
        # SWORD requires the Location header to point at the requested IRI
        response["Location"] = iris[iri_key]
        return response

    @abstractmethod
    def process_post(
        self,
        request,
        headers: ParsedRequestHeaders,
        collection_name: str,
        deposit: Optional[Deposit] = None,
    ) -> Tuple[int, str, Receipt]:
        """Routine to deal with the deposit's processing.

        Returns
            Tuple of:
            - response status code (200, 201, etc...)
            - key iri (EM_IRI, EDIT_IRI, etc...)
            - Receipt

        """
        pass
class APIPut(APIBase, metaclass=ABCMeta):
    """Mixin for class to support PUT method.
    """

    def put(  # type: ignore
        self, request: Request, collection_name: str, deposit_id: int
    ) -> HttpResponse:
        """Endpoint to update deposit resources.

        Returns:
            204 response when no error during routine occurred.
            400 if the deposit does not belong to the collection
            404 if the deposit or the collection does not exist

        """
        # deposit_id is annotated as a plain int and PUT routes always supply
        # it; the previous `if deposit_id is None: deposit = None` branch was
        # dead code that could have handed a None deposit to process_put
        # (typed `Deposit`). Mirror APIDelete and fail loudly instead.
        assert deposit_id is not None
        deposit = get_deposit_by_id(deposit_id, collection_name)
        headers = self.checks(request, collection_name, deposit)
        self.process_put(request, headers, collection_name, deposit)

        return HttpResponse(status=status.HTTP_204_NO_CONTENT)

    @abstractmethod
    def process_put(
        self,
        request: Request,
        headers: ParsedRequestHeaders,
        collection_name: str,
        deposit: Deposit,
    ) -> None:
        """Routine to deal with updating a deposit in some way.

        Returns
            dictionary of the processing result

        """
        pass
class APIDelete(APIBase, metaclass=ABCMeta):
    """Mixin for class to support DELETE method.
    """

    def delete(  # type: ignore
        self, request: Request, collection_name: str, deposit_id: Optional[int] = None
    ) -> HttpResponse:
        """Endpoint to delete some deposit's resources (archives, deposit).

        Returns:
            204 response when no error during routine occurred.
            400 if the deposit does not belong to the collection
            404 if the deposit or the collection does not exist

        """
        # deposit_id is Optional only to keep the signature parallel with the
        # sibling mixins; DELETE routes are expected to always provide it.
        assert deposit_id is not None
        deposit = get_deposit_by_id(deposit_id, collection_name)
        self.checks(request, collection_name, deposit)
        self.process_delete(request, collection_name, deposit)
        return HttpResponse(status=status.HTTP_204_NO_CONTENT)

    @abstractmethod
    def process_delete(
        self, request: Request, collection_name: str, deposit: Deposit
    ) -> None:
        """Routine to delete a resource.

        This is mostly not allowed except for the
        EM_IRI (cf. .api.deposit_update.APIUpdateArchive)

        """
        pass
diff --git a/swh/deposit/cli/client.py b/swh/deposit/cli/client.py
index f2488713..6a58901f 100644
--- a/swh/deposit/cli/client.py
+++ b/swh/deposit/cli/client.py
@@ -1,646 +1,648 @@
# Copyright (C) 2017-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from __future__ import annotations
from contextlib import contextmanager
from datetime import datetime, timezone
import logging
# WARNING: do not import unnecessary things here to keep cli startup time under
# control
import os
import sys
from typing import TYPE_CHECKING, Any, Collection, Dict, List, Optional
import warnings
import click
from swh.deposit.cli import deposit
logger = logging.getLogger(__name__)
if TYPE_CHECKING:
from swh.deposit.client import PublicApiDepositClient
class InputError(ValueError):
    """Raised when the user supplies an invalid combination of cli options."""
@contextmanager
def trap_and_report_exceptions():
    """Trap and report exceptions (InputError, MaintenanceError) in a unified way.

    Both error types are logged, then the process exits with status 1.
    """
    from swh.deposit.client import MaintenanceError

    try:
        yield
    except InputError as exc:
        logger.error("Problem during parsing options: %s", exc)
        sys.exit(1)
    except MaintenanceError as exc:
        logger.error(exc)
        sys.exit(1)
def _url(url: str) -> str:
"""Force the /1 api version at the end of the url (avoiding confusing
issues without it).
Args:
url (str): api url used by cli users
Returns:
Top level api url to actually request
"""
if not url.endswith("/1"):
url = "%s/1" % url
return url
def generate_metadata(
    deposit_client: str,
    name: str,
    authors: List[str],
    external_id: Optional[str] = None,
    create_origin: Optional[str] = None,
    metadata_provenance_url: Optional[str] = None,
) -> str:
    """Generate sword compliant xml metadata with the minimum required metadata.

    The Atom spec, https://tools.ietf.org/html/rfc4287, says that:
    - atom:entry elements MUST contain one or more atom:author elements
    - atom:entry elements MUST contain exactly one atom:title element.
    - atom:entry elements MUST contain exactly one atom:updated element.

    However, we are also using CodeMeta, so we want some basic information to be
    mandatory: we additionally generate codemeta:name and codemeta:author (in
    addition to the Atom equivalents, even if they have somewhat the same
    meaning).

    Args:
        deposit_client: Deposit client username,
        name: Software name
        authors: List of author names
        external_id: Optional external identifier (deprecated slug)
        create_origin: Origin concerned by the deposit
        metadata_provenance_url: Provenance metadata url

    Returns:
        metadata xml string

    """
    import xmltodict

    # Mandatory fields first; dict insertion order is preserved, which fixes
    # the order of the serialized xml elements.
    entry: Dict[str, Any] = {
        "@xmlns:atom": "http://www.w3.org/2005/Atom",
        "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0",
        "@xmlns:schema": "http://schema.org/",
        "atom:updated": datetime.now(tz=timezone.utc),  # mandatory, cf. docstring
        "atom:author": deposit_client,  # mandatory, cf. docstring
        "atom:title": name,  # mandatory, cf. docstring
        "codemeta:name": name,  # mandatory, cf. docstring
        "codemeta:author": [  # mandatory, cf. docstring
            {"codemeta:name": author} for author in authors
        ],
    }
    if external_id:
        entry["codemeta:identifier"] = external_id

    swh_deposit: Dict = {}
    if create_origin or metadata_provenance_url:
        # the swh namespace is only declared when a swh:deposit element is needed
        entry["@xmlns:swh"] = "https://www.softwareheritage.org/schema/2018/deposit"
    if create_origin:
        swh_deposit["swh:create_origin"] = {"swh:origin": {"@url": create_origin}}
    if metadata_provenance_url:
        swh_deposit["swh:metadata-provenance"] = {
            "schema:url": metadata_provenance_url
        }
    if swh_deposit:
        entry["swh:deposit"] = swh_deposit

    document = {"atom:entry": entry}
    logging.debug("Atom entry dict to generate as xml: %s", document)
    return xmltodict.unparse(document, pretty=True)
def _collection(client: PublicApiDepositClient) -> str:
"""Retrieve the client's collection
"""
# retrieve user's collection
sd_content = client.service_document()
if "error" in sd_content:
msg = sd_content["error"]
raise InputError(f"Service document retrieval: {msg}")
collection = sd_content["app:service"]["app:workspace"]["app:collection"][
"sword:name"
]
return collection
def client_command_parse_input(
client,
username: str,
archive: Optional[str],
metadata: Optional[str],
collection: Optional[str],
slug: Optional[str],
create_origin: Optional[str],
metadata_provenance_url: Optional[str],
partial: bool,
deposit_id: Optional[int],
swhid: Optional[str],
replace: bool,
url: str,
name: Optional[str],
authors: List[str],
temp_dir: str,
) -> Dict[str, Any]:
"""Parse the client subcommand options and make sure the combination
is acceptable*. If not, an InputError exception is raised
explaining the issue.
By acceptable, we mean:
- A multipart deposit (create or update) requires:
- an existing software archive
- an existing metadata file or author(s) and name provided in
params
- A binary deposit (create/update) requires an existing software
archive
- A metadata deposit (create/update) requires an existing metadata
file or author(s) and name provided in params
- A deposit update requires a deposit_id
This will not prevent all failure cases though. The remaining
errors are already dealt with by the underlying api client.
Raises:
InputError explaining the user input related issue
MaintenanceError explaining the api status
Returns:
dict with the following keys:
"archive": the software archive to deposit
"username": username
"metadata": the metadata file to deposit
"collection": the user's collection under which to put the deposit
"create_origin": the origin concerned by the deposit
"metadata_provenance_url": the metadata provenance url
"in_progress": if the deposit is partial or not
"url": deposit's server main entry point
"deposit_id": optional deposit identifier
"swhid": optional deposit swhid
"replace": whether the given deposit is to be replaced or not
"""
if not metadata:
if name and authors:
metadata_path = os.path.join(temp_dir, "metadata.xml")
logging.debug("Temporary file: %s", metadata_path)
metadata_xml = generate_metadata(
username,
name,
authors,
external_id=slug,
create_origin=create_origin,
metadata_provenance_url=metadata_provenance_url,
)
logging.debug("Metadata xml generated: %s", metadata_xml)
with open(metadata_path, "w") as f:
f.write(metadata_xml)
metadata = metadata_path
elif archive is not None and not partial and not deposit_id:
# If we meet all the following conditions:
# * this is not an archive-only deposit request
# * it is not part of a multipart deposit (either create/update
# or finish)
# * it misses either name or authors
raise InputError(
"For metadata deposit request, either a metadata file with "
"--metadata or both --author and --name must be provided. "
)
elif name or authors:
# If we are generating metadata, then all mandatory metadata
# must be present
raise InputError(
"For metadata deposit request, either a metadata file with "
"--metadata or both --author and --name must be provided."
)
else:
# TODO: this is a multipart deposit, we might want to check that
# metadata are deposited at some point
pass
elif name or authors or create_origin:
raise InputError(
"Using --metadata flag is incompatible with "
"--author and --name and --create-origin (those are used to generate one "
"metadata file)."
)
if not archive and not metadata:
raise InputError(
"Please provide an actionable command. See --help for more information"
)
if metadata:
from xml.etree import ElementTree
- from swh.deposit.utils import parse_swh_metadata_provenance, parse_xml
-
- metadata_raw = open(metadata, "r").read()
- metadata_dict = parse_xml(metadata_raw)
- metadata_swh = metadata_dict.get("swh:deposit", {})
- if (
- "swh:create_origin" not in metadata_swh
- and "swh:add_to_origin" not in metadata_swh
- ):
+ from swh.deposit.utils import (
+ parse_swh_deposit_origin,
+ parse_swh_metadata_provenance,
+ )
+
+ metadata_tree = ElementTree.fromstring(open(metadata).read())
+ (create_origin, add_to_origin) = parse_swh_deposit_origin(metadata_tree)
+ if create_origin and add_to_origin:
+ logger.error(
+ "The metadata file provided must not contain both "
+ '"<swh:create_origin>" and "<swh:add_to_origin>" tags',
+ )
+ elif not create_origin and not add_to_origin:
logger.warning(
"The metadata file provided should contain "
'"<swh:create_origin>" or "<swh:add_to_origin>" tag',
)
- meta_prov_url = parse_swh_metadata_provenance(
- ElementTree.fromstring(metadata_raw)
- )
+ meta_prov_url = parse_swh_metadata_provenance(metadata_tree)
if not meta_prov_url:
logger.warning(
"The metadata file provided should contain "
'"<swh:metadata-provenance>" tag'
)
if replace and not deposit_id:
raise InputError("To update an existing deposit, you must provide its id")
if not collection:
collection = _collection(client)
return {
"archive": archive,
"username": username,
"metadata": metadata,
"collection": collection,
"slug": slug,
"in_progress": partial,
"url": url,
"deposit_id": deposit_id,
"swhid": swhid,
"replace": replace,
}
def _subdict(d: Dict[str, Any], keys: Collection[str]) -> Dict[str, Any]:
"return a dict from d with only given keys"
return {k: v for k, v in d.items() if k in keys}
def credentials_decorator(f):
    """Add default --url, --username and --password flag to cli.
    """
    # applied in the same order as the previous nested form: password,
    # username, then url
    options = [
        click.option(
            "--password", required=True, help="(Mandatory) User's associated password"
        ),
        click.option("--username", required=True, help="(Mandatory) User's name"),
        click.option(
            "--url",
            default="https://deposit.softwareheritage.org",
            help=(
                "(Optional) Deposit server api endpoint. By default, "
                "https://deposit.softwareheritage.org/1"
            ),
        ),
    ]
    for option in options:
        f = option(f)
    return f
def output_format_decorator(f):
    """Add --format output flag decorator to cli.
    """
    option = click.option(
        "-f",
        "--format",
        "output_format",
        default="logging",
        type=click.Choice(["logging", "yaml", "json"]),
        help="Output format results.",
    )
    return option(f)
@deposit.command()
@credentials_decorator
@click.option(
"--archive",
type=click.Path(exists=True),
help="(Optional) Software archive to deposit",
)
@click.option(
"--metadata",
type=click.Path(exists=True),
help=(
"(Optional) Path to xml metadata file. If not provided, "
"this will use a file named .metadata.xml"
),
)
@click.option(
"--archive-deposit/--no-archive-deposit",
default=False,
help="Deprecated (ignored)",
)
@click.option(
"--metadata-deposit/--no-metadata-deposit",
default=False,
help="Deprecated (ignored)",
)
@click.option(
"--collection",
help="(Optional) User's collection. If not provided, this will be fetched.",
)
@click.option(
"--slug",
help=(
"(Deprecated) (Optional) External system information identifier. "
"If not provided, it will be generated"
),
)
@click.option(
"--create-origin",
help=(
"(Optional) Origin url to attach information to. To be used alongside "
"--name and --author. This will be generated alongside the metadata to "
"provide to the deposit server."
),
)
@click.option(
"--metadata-provenance-url",
help=(
"(Optional) Provenance metadata url to indicate from where the metadata is "
"coming from."
),
)
@click.option(
"--partial/--no-partial",
default=False,
help=(
"(Optional) The deposit will be partial, other deposits "
"will have to take place to finalize it."
),
)
@click.option(
"--deposit-id",
default=None,
help="(Optional) Update an existing partial deposit with its identifier",
)
@click.option(
"--swhid",
default=None,
help="(Optional) Update existing completed deposit (status done) with new metadata",
)
@click.option(
"--replace/--no-replace",
default=False,
help="(Optional) Update by replacing existing metadata to a deposit",
)
@click.option("--verbose/--no-verbose", default=False, help="Verbose mode")
@click.option("--name", help="Software name")
@click.option(
"--author",
multiple=True,
help="Software author(s), this can be repeated as many times"
" as there are authors",
)
@output_format_decorator
@click.pass_context
def upload(
    ctx,
    username: str,
    password: str,
    archive: Optional[str],
    metadata: Optional[str],
    archive_deposit: bool,
    metadata_deposit: bool,
    collection: Optional[str],
    slug: Optional[str],
    create_origin: Optional[str],
    metadata_provenance_url: Optional[str],
    partial: bool,
    deposit_id: Optional[int],
    swhid: Optional[str],
    replace: bool,
    url: str,
    verbose: bool,
    name: Optional[str],
    author: List[str],
    output_format: Optional[str],
):
    """Software Heritage Public Deposit Client

    Create/Update deposit through the command line.

    More documentation can be found at
    https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html.

    """
    import tempfile

    from swh.deposit.client import PublicApiDepositClient

    if archive_deposit or metadata_deposit:
        # both flags are kept for backward compatibility only; they no longer
        # influence the request that is sent
        warnings.warn(
            '"archive_deposit" and "metadata_deposit" option arguments are '
            "deprecated and have no effect; simply do not provide the archive "
            "for a metadata-only deposit, and do not provide a metadata for a"
            "archive-only deposit.",
            DeprecationWarning,
        )

    if slug:
        if create_origin and slug != create_origin:
            # disagreeing --slug/--create-origin values would be ambiguous
            raise InputError(
                '"--slug" flag has been deprecated in favor of "--create-origin" flag. '
                "You mentioned both with different values, please only "
                'use "--create-origin".'
            )
        warnings.warn(
            '"--slug" flag has been deprecated in favor of "--create-origin" flag. '
            'Please, start using "--create-origin" instead of "--slug"',
            DeprecationWarning,
        )

    url = _url(url)

    client = PublicApiDepositClient(url=url, auth=(username, password))
    # temp_dir may receive a generated metadata.xml, referenced by `config`,
    # so all client calls happen within its lifetime
    with tempfile.TemporaryDirectory() as temp_dir:
        with trap_and_report_exceptions():
            logger.debug("Parsing cli options")
            config = client_command_parse_input(
                client,
                username,
                archive,
                metadata,
                collection,
                slug,
                create_origin,
                metadata_provenance_url,
                partial,
                deposit_id,
                swhid,
                replace,
                url,
                name,
                author,
                temp_dir,
            )

        if verbose:
            logger.info("Parsed configuration: %s", config)

        # only forward the keys the client call accepts; updates additionally
        # need the deposit identifier and replacement flags
        keys = [
            "archive",
            "collection",
            "in_progress",
            "metadata",
            "slug",
        ]
        if config["deposit_id"]:
            keys += ["deposit_id", "replace", "swhid"]
            data = client.deposit_update(**_subdict(config, keys))
        else:
            data = client.deposit_create(**_subdict(config, keys))
        print_result(data, output_format)
@deposit.command()
@credentials_decorator
@click.option("--deposit-id", default=None, required=True, help="Deposit identifier.")
@output_format_decorator
@click.pass_context
def status(ctx, url, username, password, deposit_id, output_format):
    """Deposit's status
    """
    from swh.deposit.client import PublicApiDepositClient

    # Normalize the api url exactly once; the previous code passed the
    # already-normalized url through _url() a second time (harmless since
    # _url is idempotent, but redundant).
    url = _url(url)
    logger.debug("Status deposit")
    with trap_and_report_exceptions():
        client = PublicApiDepositClient(url=url, auth=(username, password))
        collection = _collection(client)
        print_result(
            client.deposit_status(collection=collection, deposit_id=deposit_id),
            output_format,
        )
def print_result(data: Dict[str, Any], output_format: Optional[str]) -> None:
    """Display the result data into a dedicated output format.

    "json" and "yaml" are echoed on stdout; any other value (including the
    default "logging") goes through the module logger.
    """
    import json

    import yaml

    if output_format == "json":
        click.echo(json.dumps(data))
        return
    if output_format == "yaml":
        click.echo(yaml.dump(data))
        return
    logger.info(data)
@deposit.command("metadata-only")
@credentials_decorator
@click.option(
"--metadata",
"metadata_path",
type=click.Path(exists=True),
required=True,
help="Path to xml metadata file",
)
@output_format_decorator
@click.pass_context
def metadata_only(ctx, url, username, password, metadata_path, output_format):
    """Deposit metadata only upload
    """
    from xml.etree import ElementTree

    from swh.deposit.client import PublicApiDepositClient
    from swh.deposit.utils import parse_swh_reference

    # A metadata-only deposit must reference an existing object, so check for
    # a swhid in the metadata file before contacting the server.
    with open(metadata_path, "r") as f:
        metadata_raw = f.read()
    if not parse_swh_reference(ElementTree.fromstring(metadata_raw)):
        raise InputError("A SWHID must be provided for a metadata-only deposit")

    with trap_and_report_exceptions():
        client = PublicApiDepositClient(url=_url(url), auth=(username, password))
        collection = _collection(client)
        result = client.deposit_metadata_only(collection, metadata_path)
        print_result(result, output_format)
@deposit.command("list")
@credentials_decorator
@output_format_decorator
@click.option(
"--page", default=1, help="Page number when requesting more information",
)
@click.option(
"--page-size", default=100, help="Page number when requesting more information",
)
@click.pass_context
def deposit_list(ctx, url, username, password, output_format, page, page_size):
    """Client deposit listing
    """
    from swh.deposit.client import PublicApiDepositClient

    # Normalize the api url exactly once; the previous code passed the
    # already-normalized url through _url() a second time (harmless since
    # _url is idempotent, but redundant).
    url = _url(url)
    logger.debug("List deposits for user %s", username)
    with trap_and_report_exceptions():
        client = PublicApiDepositClient(url=url, auth=(username, password))
        collection = _collection(client)
        result = client.deposit_list(collection, page=page, page_size=page_size)
        print_result(result, output_format)
diff --git a/swh/deposit/utils.py b/swh/deposit/utils.py
index 21b71f44..7cab518d 100644
--- a/swh/deposit/utils.py
+++ b/swh/deposit/utils.py
@@ -1,235 +1,273 @@
# Copyright (C) 2018-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, Optional, Tuple, Union
from xml.etree import ElementTree
import iso8601
import xmltodict
from swh.model.exceptions import ValidationError
from swh.model.model import TimestampWithTimezone
from swh.model.swhids import ExtendedSWHID, ObjectType, QualifiedSWHID
logger = logging.getLogger(__name__)
NAMESPACES = {
"atom": "http://www.w3.org/2005/Atom",
"app": "http://www.w3.org/2007/app",
"dc": "http://purl.org/dc/terms/",
"codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0",
"sword": "http://purl.org/net/sword/terms/",
"swh": "https://www.softwareheritage.org/schema/2018/deposit",
"schema": "http://schema.org/",
}
def parse_xml(stream, encoding="utf-8"):
    """Parse an Atom document into a dict, mapping each namespace URI back to
    the short prefix declared in NAMESPACES.

    When the document has a toplevel ``atom:entry`` element, return its
    content rather than the wrapper.
    """
    prefix_by_uri = {uri: prefix for (prefix, uri) in NAMESPACES.items()}
    data = xmltodict.parse(
        stream,
        encoding=encoding,
        namespaces=prefix_by_uri,
        process_namespaces=True,
        dict_constructor=dict,
    )
    return data.get("atom:entry", data)
def normalize_date(date):
    """Normalize date fields as expected by swh workers.

    If date is a list, elect arbitrarily the first element of that list.
    If date is (then) a string, parse it with :func:`iso8601.parse_date` to
    extract a datetime. Then normalize it through
    :class:`swh.model.model.TimestampWithTimezone`.

    Returns
        The swh date object

    """
    value = date[0] if isinstance(date, list) else date
    if isinstance(value, str):
        value = iso8601.parse_date(value)

    tstz = TimestampWithTimezone.from_dict(value)
    return {
        "timestamp": tstz.timestamp.to_dict(),
        "offset": tstz.offset_minutes(),
    }
def compute_metadata_context(swhid_reference: QualifiedSWHID) -> Dict[str, Any]:
    """Given a SWHID object, determine the context as a dict.
    """
    if not swhid_reference.qualifiers():
        return {"origin": None}

    context: Dict[str, Any] = {
        "origin": swhid_reference.origin,
        "path": swhid_reference.path,
    }
    if swhid_reference.visit:
        context["snapshot"] = swhid_reference.visit
    if swhid_reference.anchor:
        # key the anchor under its own object type name (e.g. "revision")
        anchor = swhid_reference.anchor
        context[anchor.object_type.name.lower()] = anchor
    return context
# Object types accepted for the `anchor` qualifier of a swhid reference
# (checked by parse_swh_reference below).
ALLOWED_QUALIFIERS_NODE_TYPE = (
    ObjectType.SNAPSHOT,
    ObjectType.REVISION,
    ObjectType.RELEASE,
    ObjectType.DIRECTORY,
)
def parse_swh_metadata_provenance(
metadata: ElementTree.Element,
) -> Optional[Union[QualifiedSWHID, str]]:
"""Parse swh metadata-provenance within the metadata dict reference if found, None
otherwise.
.. code-block:: xml
- https://url.org/metadata/url
+ https://example.org/metadata/url
Args:
metadata: result of parsing an Atom document with :func:`parse_xml`
Raises:
ValidationError in case of invalid xml
Returns:
Either the metadata provenance url if any or None otherwise
"""
url_element = metadata.find(
"swh:deposit/swh:metadata-provenance/schema:url", namespaces=NAMESPACES
)
if url_element is not None:
return url_element.text
return None
+def parse_swh_deposit_origin(
+ metadata: ElementTree.Element,
+) -> Tuple[Optional[str], Optional[str]]:
+ """Parses <swh:create_origin> and <swh:add_to_origin> from metadata document,
+ if any.
+
+ .. code-block:: xml
+
+
+
+
+
+
+
+ .. code-block:: xml
+
+
+
+
+
+
+
+ Returns:
+ tuple of (origin_to_create, origin_to_add). If both are non-None, this
+ should typically be an error raised to the user.
+ """
+ create_origin = metadata.find(
+ "swh:deposit/swh:create_origin/swh:origin", namespaces=NAMESPACES
+ )
+ add_to_origin = metadata.find(
+ "swh:deposit/swh:add_to_origin/swh:origin", namespaces=NAMESPACES
+ )
+
+ return (
+ None if create_origin is None else create_origin.attrib["url"],
+ None if add_to_origin is None else add_to_origin.attrib["url"],
+ )
+
+
def parse_swh_reference(
metadata: ElementTree.Element,
) -> Optional[Union[QualifiedSWHID, str]]:
- """Parse swh reference within the metadata dict (or origin) reference if found,
- None otherwise.
+ """Parse <swh:reference> within the metadata document, if any.
.. code-block:: xml
or:
.. code-block:: xml
Args:
metadata: result of parsing an Atom document
Raises:
ValidationError in case the swhid referenced (if any) is invalid
Returns:
Either swhid or origin reference if any. None otherwise.
""" # noqa
ref_origin = metadata.find(
"swh:deposit/swh:reference/swh:origin[@url]", namespaces=NAMESPACES
)
if ref_origin is not None:
return ref_origin.attrib["url"]
ref_object = metadata.find(
"swh:deposit/swh:reference/swh:object[@swhid]", namespaces=NAMESPACES
)
if ref_object is None:
return None
swhid = ref_object.attrib["swhid"]
if not swhid:
return None
swhid_reference = QualifiedSWHID.from_string(swhid)
if swhid_reference.qualifiers():
anchor = swhid_reference.anchor
if anchor:
if anchor.object_type not in ALLOWED_QUALIFIERS_NODE_TYPE:
error_msg = (
"anchor qualifier should be a core SWHID with type one of "
f"{', '.join(t.name.lower() for t in ALLOWED_QUALIFIERS_NODE_TYPE)}"
)
raise ValidationError(error_msg)
visit = swhid_reference.visit
if visit:
if visit.object_type != ObjectType.SNAPSHOT:
raise ValidationError(
f"visit qualifier should be a core SWHID with type snp, "
f"not {visit.object_type.value}"
)
if (
visit
and anchor
and visit.object_type == ObjectType.SNAPSHOT
and anchor.object_type == ObjectType.SNAPSHOT
):
logger.warn(
"SWHID use of both anchor and visit targeting "
f"a snapshot: {swhid_reference}"
)
raise ValidationError(
"'anchor=swh:1:snp:' is not supported when 'visit' is also provided."
)
return swhid_reference
def extended_swhid_from_qualified(swhid: QualifiedSWHID) -> ExtendedSWHID:
    """Strip the qualifiers off a QualifiedSWHID, yielding the ExtendedSWHID
    of the target object. Used to get the target of a metadata object from a
    swhid reference, as the latter uses a QualifiedSWHID.
    """
    core, _, _qualifiers = str(swhid).partition(";")
    return ExtendedSWHID.from_string(core)
def to_header_link(link: str, link_name: str) -> str:
    """Build a single HTTP ``Link`` header value.

    >>> to_header_link("next-url", "next")
    '<next-url>; rel="next"'
    """
    return '<{}>; rel="{}"'.format(link, link_name)