Changeset View
Standalone View
swh/deposit/api/common.py
# Copyright (C) 2017-2020 The Software Heritage developers | # Copyright (C) 2017-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from abc import ABCMeta, abstractmethod | from abc import ABCMeta, abstractmethod | ||||
import datetime | import datetime | ||||
import hashlib | import hashlib | ||||
import json | import json | ||||
from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union | from typing import Any, Dict, Optional, Sequence, Tuple, Type, Union | ||||
import attr | |||||
from django.http import FileResponse, HttpResponse | from django.http import FileResponse, HttpResponse | ||||
from django.shortcuts import render | from django.shortcuts import render | ||||
from django.urls import reverse | from django.urls import reverse | ||||
from django.utils import timezone | from django.utils import timezone | ||||
from rest_framework import status | from rest_framework import status | ||||
from rest_framework.authentication import BaseAuthentication, BasicAuthentication | from rest_framework.authentication import BaseAuthentication, BasicAuthentication | ||||
from rest_framework.permissions import BasePermission, IsAuthenticated | from rest_framework.permissions import BasePermission, IsAuthenticated | ||||
from rest_framework.request import Request | from rest_framework.request import Request | ||||
from rest_framework.views import APIView | from rest_framework.views import APIView | ||||
from swh.deposit.api.checks import check_metadata | |||||
from swh.deposit.api.converters import convert_status_detail | |||||
from swh.deposit.models import Deposit | |||||
from swh.deposit.utils import compute_metadata_context | |||||
from swh.model import hashutil | from swh.model import hashutil | ||||
from swh.model.identifiers import SWHID, ValidationError | |||||
from swh.model.model import ( | |||||
MetadataAuthority, | |||||
MetadataAuthorityType, | |||||
MetadataFetcher, | |||||
RawExtrinsicMetadata, | |||||
) | |||||
from swh.scheduler.utils import create_oneshot_task_dict | from swh.scheduler.utils import create_oneshot_task_dict | ||||
from ..config import ( | from ..config import ( | ||||
ARCHIVE_KEY, | ARCHIVE_KEY, | ||||
ARCHIVE_TYPE, | ARCHIVE_TYPE, | ||||
CONT_FILE_IRI, | CONT_FILE_IRI, | ||||
DEPOSIT_STATUS_DEPOSITED, | DEPOSIT_STATUS_DEPOSITED, | ||||
DEPOSIT_STATUS_LOAD_SUCCESS, | DEPOSIT_STATUS_LOAD_SUCCESS, | ||||
Show All 11 Lines | from ..errors import ( | ||||
CHECKSUM_MISMATCH, | CHECKSUM_MISMATCH, | ||||
ERROR_CONTENT, | ERROR_CONTENT, | ||||
FORBIDDEN, | FORBIDDEN, | ||||
MAX_UPLOAD_SIZE_EXCEEDED, | MAX_UPLOAD_SIZE_EXCEEDED, | ||||
MEDIATION_NOT_ALLOWED, | MEDIATION_NOT_ALLOWED, | ||||
METHOD_NOT_ALLOWED, | METHOD_NOT_ALLOWED, | ||||
NOT_FOUND, | NOT_FOUND, | ||||
PARSING_ERROR, | PARSING_ERROR, | ||||
BadRequestError, | |||||
ParserError, | ParserError, | ||||
make_error_dict, | make_error_dict, | ||||
make_error_response, | make_error_response, | ||||
make_error_response_from_dict, | make_error_response_from_dict, | ||||
) | ) | ||||
from ..models import Deposit, DepositClient, DepositCollection, DepositRequest | from ..models import DepositClient, DepositCollection, DepositRequest | ||||
from ..parsers import parse_xml | from ..parsers import parse_swh_reference, parse_xml | ||||
ACCEPT_PACKAGINGS = ["http://purl.org/net/sword/package/SimpleZip"] | ACCEPT_PACKAGINGS = ["http://purl.org/net/sword/package/SimpleZip"] | ||||
ACCEPT_ARCHIVE_CONTENT_TYPES = ["application/zip", "application/x-tar"] | ACCEPT_ARCHIVE_CONTENT_TYPES = ["application/zip", "application/x-tar"] | ||||
class AuthenticatedAPIView(APIView): | class AuthenticatedAPIView(APIView): | ||||
"""Mixin intended as a based API view to enforce the basic | """Mixin intended as a based API view to enforce the basic | ||||
authentication check | authentication check | ||||
▲ Show 20 Lines • Show All 533 Lines • ▼ Show 20 Lines | ) -> Dict: | ||||
assert filehandler is not None | assert filehandler is not None | ||||
return { | return { | ||||
"deposit_id": deposit.id, | "deposit_id": deposit.id, | ||||
"deposit_date": deposit.reception_date, | "deposit_date": deposit.reception_date, | ||||
"archive": filehandler.name, | "archive": filehandler.name, | ||||
"status": deposit.status, | "status": deposit.status, | ||||
} | } | ||||
def _store_metadata_deposit( | |||||
self, | |||||
deposit: Deposit, | |||||
swhid_reference: Union[str, SWHID], | |||||
metadata: Dict, | |||||
raw_metadata: bytes, | |||||
deposit_origin: Optional[str] = None, | |||||
) -> Tuple[Union[SWHID, str], Union[SWHID, str], Deposit, DepositRequest]: | |||||
"""When all user inputs pass the checks, this associates the raw_metadata to the | |||||
swhid_reference in the raw extrinsic metadata storage. In case of any issues, | |||||
a bad request response is returned to the user with the details. | |||||
vlorentz: What does "In the nominal scenario" mean? | |||||
Done Inline Actionswell, no bump in the road, all user inputs are correct so we can do the actual job properly. ardumont: well, no bump in the road, all user inputs are correct so we can do the actual job properly. | |||||
Checks: | |||||
- metadata are technically parsable | |||||
- metadata pass the functional checks | |||||
Not Done Inline Actionsobject referenced (SWhiD or origin url) moranegg: object referenced (SWhiD or origin url) | |||||
- SWHID (if any) is technically valid | |||||
Args: | |||||
deposit: Deposit reference | |||||
swhid_reference: The swhid or the origin to attach metadata information to | |||||
metadata: Full dict of metadata to check for validity (parsed out of | |||||
raw_metadata) | |||||
raw_metadata: The actual raw metadata to send in the storage metadata | |||||
Not Done Inline Actionsthe word origin here doesn't mean a url origin, right? moranegg: the word origin here doesn't mean a url origin, right?
maybe use `with_original_deposit`? | |||||
Done Inline Actionsyes, it does mean an origin url as per the deposit update scenario. and lol, I initially made that a bool and called this with_origin_deposit... pretty much similar. ardumont: yes, it does mean an origin url as per the deposit update scenario.
----
and lol, I initially… | |||||
deposit_origin: Optional deposit origin url to use if any (e.g. deposit | |||||
update scenario provides one) | |||||
Raises: | |||||
BadRequestError in case of incorrect inputs from the deposit client | |||||
(e.g. functionally invalid metadata, ...) | |||||
Done Inline Actionsif metadata is not None vlorentz: `if metadata is not None` | |||||
Done Inline Actionsnah, i recall it's not the same here. And indeed, existing tests are complaining about it. ardumont: nah, i recall it's not the same here.
(plus that code is just moving from the deposit_update… | |||||
Returns: | |||||
Tuple of core swhid, swhid context, deposit and deposit request | |||||
""" | |||||
Done Inline Actionsheh, that's not a possible situation... ardumont: heh, that's not a possible situation...
(if we are here, that means we have extracted the swhid… | |||||
metadata_ok, error_details = check_metadata(metadata) | |||||
if not metadata_ok: | |||||
assert error_details, "Details should be set when a failure occurs" | |||||
raise BadRequestError( | |||||
"Functional metadata checks failure", | |||||
convert_status_detail(error_details), | |||||
) | |||||
metadata_authority = MetadataAuthority( | |||||
type=MetadataAuthorityType.DEPOSIT_CLIENT, | |||||
url=deposit.client.provider_url, | |||||
Not Done Inline Actionsthere is a notion of first_name and last_name for a client? moranegg: there is a notion of first_name and last_name for a client?
| |||||
metadata={"name": deposit.client.last_name}, | |||||
) | |||||
metadata_fetcher = MetadataFetcher( | |||||
name=self.tool["name"], | |||||
version=self.tool["version"], | |||||
metadata=self.tool["configuration"], | |||||
) | |||||
# replace metadata within the deposit backend | |||||
deposit_request_data = { | |||||
Done Inline Actionsmetadata={} vlorentz: `metadata={}` | |||||
Done Inline Actionswhy should this be empty? (also if that's ending up not being used, i can make that be dropped ardumont: why should this be empty?
(also if that's ending up not being used, i can make that be dropped… | |||||
Not Done Inline ActionsHmm actually, nevermind. I just re-read the metadata spec. It would be an issue currently because the config is not part of the unique key, but olasd is working on that. vlorentz: Hmm actually, nevermind. I just re-read the metadata spec.
It would be an issue currently… | |||||
METADATA_KEY: metadata, | |||||
RAW_METADATA_KEY: raw_metadata, | |||||
} | |||||
# actually add the metadata to the completed deposit | |||||
deposit_request = self._deposit_request_put(deposit, deposit_request_data) | |||||
object_type, metadata_context = compute_metadata_context(swhid_reference) | |||||
if deposit_origin: # metadata deposit update on completed deposit | |||||
metadata_context["origin"] = deposit_origin | |||||
swhid_core: Union[str, SWHID] | |||||
if isinstance(swhid_reference, str): | |||||
swhid_core = swhid_reference | |||||
else: | |||||
swhid_core = attr.evolve(swhid_reference, metadata={}) | |||||
# store that metadata to the metadata storage | |||||
Done Inline Actions@vlorentz i'm confused as how to actually map from a swhid like [1] to a raw extrinsic metadata. Do you have any clues? [1] swh:1:dir:c4993c872593e960dc84e4430dbbfbc34fd706d0;origin=https://inria.halpreprod.archives-ouvertes.fr/hal-01243573;visit=swh:1:snp:0175049fc45055a3824a1675ac06e3711619a55a;anchor=swh:1:rev:b5f505b005435fa5c4fa4c279792bd7b17167c04;path=/ ( ^ that's existing swhid from the deposit itself) ardumont: @vlorentz i'm confused as how to actually map from a swhid like [1] to a raw extrinsic metadata. | |||||
Done Inline Actionsgot my answers at the docs [1]
[1] https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html#qualifiers ardumont: got my answers at the docs [1]
- "visit" is a snapshot core swhid (if any)
- "anchor" is a… | |||||
metadata_object = RawExtrinsicMetadata( | |||||
type=object_type, | |||||
Done Inline ActionsSimpler IMO: swhid_core: Union[str, SWHID] if isinstance(swhid_reference, str): swhid_core = swhid_reference else: swhid_core = attr.evolve(swhid_reference, metadata={}) vlorentz: Simpler IMO:
```
swhid_core: Union[str, SWHID]
if isinstance(swhid_reference, str)… | |||||
Not Done Inline Actionsnaming here is difficult to follow. for the context without the swhid_core use swhid_only_context moranegg: naming here is difficult to follow.
A suggestion:
use `object_reference` which can be an… | |||||
target=swhid_core, # core swhid or origin | |||||
discovery_date=deposit_request.date, | |||||
authority=metadata_authority, | |||||
Done Inline Actionsobject_type = MetadataTargetType(swhid_reference.object_type) + assert it's not MetadataTargetType.ORIGIN vlorentz: `object_type = MetadataTargetType(swhid_reference.object_type)` + assert it's not… | |||||
Done Inline Actionsneat! ardumont: neat! | |||||
fetcher=metadata_fetcher, | |||||
format="sword-v2-atom-codemeta", | |||||
metadata=raw_metadata, | |||||
**metadata_context, | |||||
) | |||||
Done Inline Actionsencoding the path should be done in parse_swhid vlorentz: encoding the path should be done in `parse_swhid` | |||||
Done Inline Actionshow so? Do you mean the path stored in the SWHID class (resulting from parse_swhid call) should be bytes already? ardumont: how so?
Do you mean the path stored in the SWHID class (resulting from parse_swhid call)… | |||||
Done Inline Actionsok vlorentz: ok | |||||
# write to metadata storage | |||||
self.storage_metadata.metadata_authority_add([metadata_authority]) | |||||
self.storage_metadata.metadata_fetcher_add([metadata_fetcher]) | |||||
Done Inline ActionsThose should not raise providing D4490 ardumont: Those should not raise providing D4490 | |||||
self.storage_metadata.raw_extrinsic_metadata_add([metadata_object]) | |||||
return (swhid_core, swhid_reference, deposit, deposit_request) | |||||
Done Inline Actionssame remark as above. ardumont: same remark as above. | |||||
def _atom_entry( | def _atom_entry( | ||||
Done Inline ActionsWhat if there is a visit qualifier and the anchor is also a snapshot? vlorentz: What if there is a `visit` qualifier and the `anchor` is also a snapshot? | |||||
Done Inline ActionsWell, i thought of it as well but i don't know what to do for that case... Note that I don't really know what that means when such case arise... Do you have a better proposal than failing the parsing (and log it)? ardumont: Well, i thought of it as well but i don't know what to do for that case...
So right now, it's… | |||||
Done Inline Actions
return an HTTP error vlorentz: > but i don't know what to do for that case...
return an HTTP error | |||||
Done Inline ActionsLet's discuss that in D4493 which tries to do that ;) ardumont: Let's discuss that in D4493 which tries to do that ;) | |||||
self, | self, | ||||
request: Request, | request: Request, | ||||
headers: Dict[str, Any], | headers: Dict[str, Any], | ||||
collection_name: str, | collection_name: str, | ||||
deposit_id: Optional[int] = None, | deposit_id: Optional[int] = None, | ||||
replace_metadata: bool = False, | replace_metadata: bool = False, | ||||
replace_archives: bool = False, | replace_archives: bool = False, | ||||
Done Inline ActionsI don't understand how this conditional is related to checking if it's a metadata-only deposit vlorentz: I don't understand how this conditional is related to checking if it's a metadata-only deposit | |||||
Done Inline Actionswell, aside from specifically add a flag "with_metadata_only_deposit" (or something), how do you do detect it? For me we have currently 2 cases:
But i'm fine if you ask me to open a flag instead, it may be clearer that way. ardumont: well, aside from specifically add a flag "with_metadata_only_deposit" (or something), how do… | |||||
) -> Dict[str, Any]: | ) -> Dict[str, Any]: | ||||
Done Inline Actionsmove this (the computation of metadata_d) to its own function. (We will probably also move it to swh-model at some point) vlorentz: move this (the computation of `metadata_d`) to its own function. (We will probably also move… | |||||
Done Inline Actionsyep ardumont: yep | |||||
"""Atom entry deposit. | """Atom entry deposit. | ||||
Args: | Args: | ||||
request: the request holding information to parse | request: the request holding information to parse | ||||
and inject in db | and inject in db | ||||
headers: request headers formatted | headers: request headers formatted | ||||
Done Inline ActionsWhy does a function named _store_metadata_deposit update the deposit's status? Shouldn't this be done in the caller function? (and you might be able to drop the conditional that way) vlorentz: Why does a function named `_store_metadata_deposit` update the deposit's status? Shouldn't this… | |||||
Done Inline Actionsoh yeah, indeed, it crossed my mind and then i forget to act on it. ardumont: oh yeah, indeed, it crossed my mind and then i forget to act on it.
Thanks. | |||||
Done Inline Actions
to be fair, i mentioned it in the docstring...
yeah that'd be more clean.
indeed, that's the advantage. ardumont: > Why does a function named _store_metadata_deposit update the deposit's status?
to be fair… | |||||
collection_name: the associated client | collection_name: the associated client | ||||
deposit_id: deposit identifier if provided | deposit_id: deposit identifier if provided | ||||
replace_metadata: 'Update or add' request to existing | replace_metadata: 'Update or add' request to existing | ||||
deposit. If False (default), this adds new metadata request to | deposit. If False (default), this adds new metadata request to | ||||
existing ones. Otherwise, this will replace existing metadata. | existing ones. Otherwise, this will replace existing metadata. | ||||
replace_archives: 'Update or add' request to existing | replace_archives: 'Update or add' request to existing | ||||
deposit. If False (default), this adds new archive request to | deposit. If False (default), this adds new archive request to | ||||
existing ones. Otherwise, this will replace existing archives. | existing ones. Otherwise, this will replace existing archives. | ||||
Show All 28 Lines | ) -> Dict[str, Any]: | ||||
if not metadata: | if not metadata: | ||||
return make_error_dict( | return make_error_dict( | ||||
BAD_REQUEST, | BAD_REQUEST, | ||||
"Empty body request is not supported", | "Empty body request is not supported", | ||||
"Atom entry deposit is supposed to send for metadata. " | "Atom entry deposit is supposed to send for metadata. " | ||||
"If the body is empty, there is no metadata.", | "If the body is empty, there is no metadata.", | ||||
) | ) | ||||
external_id = metadata.get("external_identifier", headers["slug"]) | # Determine if we are in the metadata-only deposit case | ||||
try: | |||||
swhid = parse_swh_reference(metadata) | |||||
except ValidationError as e: | |||||
return make_error_dict(PARSING_ERROR, "Invalid SWHID reference", str(e),) | |||||
# TODO: Determine if we are in the metadata-only deposit case. If it is, then | external_id = metadata.get("external_identifier", headers["slug"]) | ||||
# save deposit and deposit request typed 'metadata' and send metadata to the | |||||
# metadata storage. Otherwise, do as existing deposit. | |||||
deposit = self._deposit_put( | deposit = self._deposit_put( | ||||
request, | request, | ||||
deposit_id=deposit_id, | deposit_id=deposit_id, | ||||
in_progress=headers["in-progress"], | in_progress=headers["in-progress"], | ||||
external_id=external_id, | external_id=external_id, | ||||
) | ) | ||||
if swhid is not None: | |||||
try: | |||||
swhid, swhid_ref, depo, depo_request = self._store_metadata_deposit( | |||||
deposit, swhid, metadata, raw_metadata | |||||
) | |||||
Done Inline Actionsand now i missed some cases, fixing as well. ardumont: and now i missed some cases, fixing as well. | |||||
except BadRequestError as bad_request_error: | |||||
Done Inline Actionstuple = success, dict = error? meh what about an exception instead? vlorentz: tuple = success, dict = error? meh
what about an exception instead? | |||||
Done Inline Actions
well, yeah...
I don't really want to plunge into refactoring the current actual exception handling in the deposit right now. ardumont: > tuple = success, dict = error? meh
well, yeah...
> what about an exception instead?
I… | |||||
Done Inline Actionswhat about returning (boolean, result), like the metadata validation function? vlorentz: what about returning `(boolean, result)`, like the metadata validation function? | |||||
Done Inline ActionsI was ok with the last suggestion and i implemented it accordingly. But in the end, that just added another indirection... (We will adapt the other part of the deposit similarly later if we agree upon that ;) ardumont: I was ok with the last suggestion and i implemented it accordingly.
But in the end, that just… | |||||
Done Inline ActionsT2792 to rework the exception handling. ardumont: T2792 to rework the exception handling.
| |||||
return bad_request_error.to_dict() | |||||
deposit.status = DEPOSIT_STATUS_LOAD_SUCCESS | |||||
if isinstance(swhid_ref, SWHID): | |||||
deposit.swhid = str(swhid) | |||||
deposit.swhid_context = str(swhid_ref) | |||||
deposit.complete_date = depo_request.date | |||||
Not Done Inline Actionslooks like the same date, is that normal behavior? it's when there is only one deposit request without in-progress right? moranegg: looks like the same date, is that normal behavior? it's when there is only one deposit request… | |||||
deposit.reception_date = depo_request.date | |||||
deposit.save() | |||||
return { | |||||
"deposit_id": deposit.id, | |||||
"deposit_date": depo_request.date, | |||||
"status": deposit.status, | |||||
"archive": None, | |||||
} | |||||
self._deposit_request_put( | self._deposit_request_put( | ||||
deposit, | deposit, | ||||
{METADATA_KEY: metadata, RAW_METADATA_KEY: raw_metadata}, | {METADATA_KEY: metadata, RAW_METADATA_KEY: raw_metadata}, | ||||
replace_metadata, | replace_metadata, | ||||
replace_archives, | replace_archives, | ||||
) | ) | ||||
return { | return { | ||||
▲ Show 20 Lines • Show All 357 Lines • Show Last 20 Lines |
What does "In the nominal scenario" mean?