Changeset View
Standalone View
swh/deposit/api/deposit_update.py
# Copyright (C) 2017-2020 The Software Heritage developers | # Copyright (C) 2017-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from datetime import datetime, timezone | |||||
import json | |||||
from typing import Any, Dict, Optional, Tuple | from typing import Any, Dict, Optional, Tuple | ||||
from rest_framework import status | from rest_framework import status | ||||
from rest_framework.request import Request | |||||
from ..config import CONT_FILE_IRI, EDIT_SE_IRI, EM_IRI | from swh.deposit.models import Deposit | ||||
from ..errors import BAD_REQUEST, make_error_dict | from swh.model.hashutil import hash_to_bytes | ||||
from swh.model.identifiers import parse_swhid | |||||
from swh.model.model import ( | |||||
MetadataAuthority, | |||||
MetadataAuthorityType, | |||||
MetadataFetcher, | |||||
MetadataTargetType, | |||||
RawExtrinsicMetadata, | |||||
) | |||||
from ..config import ( | |||||
CONT_FILE_IRI, | |||||
DEPOSIT_STATUS_LOAD_SUCCESS, | |||||
EDIT_SE_IRI, | |||||
EM_IRI, | |||||
METADATA_KEY, | |||||
SWH_METADATA_AUTHORITY, | |||||
) | |||||
from ..errors import BAD_REQUEST, ParserError, make_error_dict | |||||
from ..parsers import ( | from ..parsers import ( | ||||
SWHAtomEntryParser, | SWHAtomEntryParser, | ||||
SWHFileUploadTarParser, | SWHFileUploadTarParser, | ||||
SWHFileUploadZipParser, | SWHFileUploadZipParser, | ||||
SWHMultiPartParser, | SWHMultiPartParser, | ||||
) | ) | ||||
from .common import ACCEPT_ARCHIVE_CONTENT_TYPES, APIDelete, APIPost, APIPut | from .common import ACCEPT_ARCHIVE_CONTENT_TYPES, APIDelete, APIPost, APIPut | ||||
▲ Show 20 Lines • Show All 78 Lines • ▼ Show 20 Lines | """Deposit request class defining api endpoints for sword deposit. | ||||
What's known as 'Edit IRI' (and SE IRI) in the sword specification. | What's known as 'Edit IRI' (and SE IRI) in the sword specification. | ||||
HTTP verbs supported: POST (SE IRI), PUT (Edit IRI), DELETE | HTTP verbs supported: POST (SE IRI), PUT (Edit IRI), DELETE | ||||
""" | """ | ||||
parser_classes = (SWHMultiPartParser, SWHAtomEntryParser) | parser_classes = (SWHMultiPartParser, SWHAtomEntryParser) | ||||
def restrict_access(self, request: Request, deposit: Deposit) -> Dict[str, Any]: | |||||
"""For the metadata-only update of a deposit, we need to allow some checks on "done" | |||||
deposits. This overrides allow for such case to occur. | |||||
""" | |||||
headers = self._read_headers(request) | |||||
if ( | |||||
request.method == "PUT" | |||||
and headers["swhid"] is not None | |||||
and deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS | |||||
): | |||||
# Update only metadata is authorized for deposit ingested | |||||
return {} | |||||
# otherwise, let the standard access restriction occur | |||||
return super().restrict_access(request, deposit) | |||||
def process_put( | def process_put( | ||||
self, req, headers: Dict, collection_name: str, deposit_id: int | self, request, headers: Dict, collection_name: str, deposit_id: int | ||||
) -> Dict[str, Any]: | ) -> Dict[str, Any]: | ||||
"""Replace existing deposit's metadata/archive with new ones. | """Replace existing deposit's metadata/archive with new ones. | ||||
source: | source: | ||||
- http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_metadata # noqa | - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_metadata # noqa | ||||
- http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_multipart # noqa | - http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html#protocoloperations_editingcontent_multipart # noqa | ||||
Returns: | Returns: | ||||
204 No content | 204 No content | ||||
""" | """ | ||||
if req.content_type.startswith("multipart/"): | if request.content_type.startswith("multipart/"): | ||||
return self._multipart_upload( | return self._multipart_upload( | ||||
ardumont: missing test for another diff. | |||||
Done Inline Actionsardumont: D4048 | |||||
req, | request, | ||||
headers, | headers, | ||||
collection_name, | collection_name, | ||||
deposit_id=deposit_id, | deposit_id=deposit_id, | ||||
replace_archives=True, | replace_archives=True, | ||||
replace_metadata=True, | replace_metadata=True, | ||||
) | ) | ||||
swhid = headers.get("swhid") | |||||
if swhid is None: | |||||
return self._atom_entry( | return self._atom_entry( | ||||
req, headers, collection_name, deposit_id=deposit_id, replace_metadata=True | request, | ||||
headers, | |||||
collection_name, | |||||
deposit_id=deposit_id, | |||||
replace_metadata=True, | |||||
Done Inline ActionsPlease add a comment explaining what this does, and what we do if it's not None vlorentz: Please add a comment explaining what this does, and what we do if it's not `None` | |||||
Done Inline Actionsok. It's to currently keep the previous update behavior.
now:
And thanks to your comment, i realize i'm missing the status check in that conditional... nice! ardumont: ok.
It's to currently keep the previous update behavior.
The previous behavior was… | |||||
Done Inline ActionsReading this explanation, I think you should add the now section at line 155. moranegg: Reading this explanation, I think you should add the now section at line 155. | |||||
Done Inline Actionsyes, it will. ardumont: yes, it will. | |||||
) | ) | ||||
Done Inline ActionsHere I'm missing a check to see if the swhid that is not None is the correct swhid that we have with the deposit id (since it is an update of an existing deposit). moranegg: Here I'm missing a check to see if the swhid that is not None is the correct swhid that we have… | |||||
Done Inline Actionsoh, yeah, correct! ardumont: oh, yeah, correct! | |||||
# metadata-only deposit | |||||
try: | |||||
raw_metadata, metadata = self._read_metadata(request.data) | |||||
except ParserError: | |||||
return make_error_dict( | |||||
BAD_REQUEST, | |||||
"Malformed xml metadata", | |||||
"The xml received is malformed. " | |||||
"Please ensure your metadata file is correctly formatted.", | |||||
) | |||||
if not metadata: | |||||
return make_error_dict( | |||||
BAD_REQUEST, | |||||
"Empty body request is not supported", | |||||
"Atom entry deposit is supposed to send for metadata. " | |||||
"If the body is empty, there is no metadata.", | |||||
) | |||||
metadata_authority = MetadataAuthority( | |||||
type=MetadataAuthorityType.DEPOSIT_CLIENT, | |||||
url=self.provider["provider_url"], | |||||
metadata={ | |||||
"name": self.provider["provider_name"], | |||||
**(self.provider["metadata"] or {}), | |||||
}, | |||||
) | |||||
Done Inline Actions@vlorentz actually trying to complete the nominal check scenario on metadata update... I actually am unsure what authority to specify with the metadata.... ardumont: @vlorentz actually trying to complete the nominal check scenario on metadata update...
Do we… | |||||
metadata_fetcher = MetadataFetcher( | |||||
name=self.tool["name"], | |||||
Done Inline ActionsI don't actually know what to do about that discovery_date in regards to the deposit backend (it's currently not set anywhere in the deposit backend, only send to the metadata storage). Don't know if that's enough or not. ardumont: I don't actually know what to do about that discovery_date in regards to the deposit backend… | |||||
Done Inline ActionsWhat's does discovery_date semantically means? moranegg: What's does `discovery_date` semantically means? | |||||
Done Inline ActionsMy understanding is that it is the time at which swh received (and discovered) the metadata. ardumont: My understanding is that it is the time at which swh received (and discovered) the metadata. | |||||
Done Inline ActionsI've reworked to actually use the same date, pushing soon. ardumont: I've reworked to actually use the same date, pushing soon. | |||||
version=self.tool["version"], | |||||
metadata=self.tool["configuration"], | |||||
) | |||||
Done Inline ActionsIn a future diff, i'll check on how to refactor the following chunk 216-231 which is already defined elsewhere but not completely unified (deposit multipart for example for the atom part does not check the empty body but it most probably should...). It's potentially a large refactoring, so i'll do that in another diff to avoid conflating and enlarging things too much... I'll add the missing test scenarios here though. ardumont: In a future diff, i'll check on how to refactor the following chunk 216-231 which is already… | |||||
Not Done Inline ActionsI'm kinda surprised that HAL-preprod had the Malformed xml metadata error moranegg: I'm kinda surprised that HAL-preprod had the `Malformed xml metadata` error
I saw you opened a… | |||||
deposit_swhid = parse_swhid(swhid) | |||||
Done Inline Actionsi think this simply should be removed, any other typed (snapshot, ...) swhid won't just be found or be a mismatch with the deposit swhid already. ardumont: i think this simply should be removed, any other typed (snapshot, ...) swhid won't just be… | |||||
assert deposit_swhid.object_type == "directory" | |||||
directory_id = hash_to_bytes(deposit_swhid.object_id) | |||||
# FIXME: need a storage for that in tests, unplug for now to continue to check the rest | |||||
# check the swhid exists in the archive | |||||
# directory = self.storage.directory_missing([directory_id]) | |||||
Done Inline Actionsardumont: ^ shouldn't it be about the deposit client (hal, ipol, etc...) here?
@vlorentz @moranegg ^ | |||||
Not Done Inline Actionsthe provider is the deposit client vlorentz: the provider is the deposit client | |||||
Done Inline Actionsyes, thanks for confirming. So that part is wrong, thanks. ardumont: yes, thanks for confirming.
So that part is wrong, thanks.
There is no `self.provider` to use… | |||||
# if directory: | |||||
# return make_error_dict( | |||||
# BAD_REQUEST, | |||||
# "{swhid} must reference an existing object", | |||||
Not Done Inline ActionsI'm not sure we should do that. It means we can't allow deposit updates while a storage is being replayed. vlorentz: I'm not sure we should do that. It means we can't allow deposit updates while a storage is… | |||||
Done Inline Actionsi did not know the storage and storage metadata had the same requirements... ardumont: i did not know the storage and storage metadata had the same requirements...
I'm using storage… | |||||
# "The SWHID provided is not a know reference in SWH archive. " | |||||
Not Done Inline ActionsHere the assumption is that the SWHID is a dir swh:1:dir. On the other hand if it is an update of an already done content-deposit, it is implicit that the deposit is a directory. Can you check that the SWHID in itself exists without checking its type? second, why in 224 there is potentially a list of artifacts, in this case directories? moranegg: Here the assumption is that the SWHID is a `dir` swh:1:dir.
In the metadata-only deposit all… | |||||
Done Inline Actions
in the deposit, currently, all our deposit swhid are directory ones.
yes, i'll adapt the code then
Probably but i may have to pass with the archive instead of the storage (i don't recall the storage has an endpoint for that).
That's the api of the storage.directory_missing which takes a list of directory id, i'm only passing one. ardumont: > Here the assumption is that the SWHID is a dir swh:1:dir.
in the deposit, currently, all our… | |||||
Not Done Inline ActionsWould you say that updating metadata on deposits that are not directories will yield an error? If yes (and if no), let's do a test with a snapshot swhid to capture the behavior. moranegg: Would you say that updating metadata on deposits that are not directories will yield an error? | |||||
Done Inline Actionsyes, but it is already covered in the current failure scenario when passing an unknown reference as swhid... ah no, the assertion will make it fail badly. ardumont: yes, but it is already covered in the current failure scenario when passing an unknown… | |||||
Done Inline Actions
I mean, the code will be adapted (if it needs to) when we will add the code about the metadata-only deposit. ardumont: > yes, i'll adapt the code then
I mean, the code will be adapted (if it needs to) when we will… | |||||
# "Please provide an existing SWHID.", | |||||
# ) | |||||
discovery_date = datetime.now(tz=timezone.utc) | |||||
# QUESTION: Do we also put the raw xml? | |||||
Not Done Inline Actionsfor the extrinsic metadata storage we keep only raw entries, which means only the xml, we don't need to translate to json. moranegg: for the extrinsic metadata storage we keep only raw entries, which means only the xml, we don't… | |||||
Not Done Inline ActionsI agree with Morane. (and eventually we should do the same with metadata provided on the initial deposit, but I didn't do it yet because it would be a major refactoring) vlorentz: I agree with Morane.
(and eventually we should do the same with metadata provided on the… | |||||
Done Inline ActionsWe don't do that with the current deposit loader. We send the xml translated to So I'm just aligning the loader and the deposit server in that regard to avoid If that's not ok, we should reconsider and fix the loader as well then (new task! ;) ardumont: We don't do that with the current deposit loader. We send the xml translated to
json [1]
[1]… | |||||
Not Done Inline ActionsIf that's major refactoring, I propose to keep this diff as is (since for now we don't receive only-metadata deposit anyway) and do a full refactor for all metadata that is sent to the storage. In parallel, I need an endpoint to check the metadata in the REMD storage easily (to see if HAL or other clients sent the correct stuff). moranegg: If that's major refactoring, I propose to keep this diff as is (since for now we don't receive… | |||||
Done Inline Actionsyes, i don't want to introduce discrepancy, so let's just align with the loader for now. And then open a new task to both adapt the loader and this one to actually send the raw metadata then. ardumont: yes, i don't want to introduce discrepancy, so let's just align with the loader for now.
And… | |||||
# A priori, no since we don't do this originally in the loader. | |||||
metadata_object = RawExtrinsicMetadata( | |||||
Not Done Inline Actionsno, it should not replace, but add a new request. vlorentz: no, it should not replace, but add a new request. | |||||
Done Inline Actionswell, then it should be in another endpoint then, we are in the one that replaces stuff. ardumont: well, then it should be in another endpoint then, we are in the one that replaces stuff. | |||||
Done Inline Actionswe are in the put endpoint not post... ardumont: we are in the `put` endpoint not `post`...
which I realize now is not the right place, thanks… | |||||
type=MetadataTargetType.DIRECTORY, | |||||
id=deposit_swhid, | |||||
discovery_date=discovery_date, | |||||
authority=SWH_METADATA_AUTHORITY, | |||||
fetcher=metadata_fetcher, | |||||
format="sword-v2-atom-codemeta-v2-in-json", | |||||
metadata=json.dumps(metadata).encode(), | |||||
Done Inline Actionsardumont: @vlorentz @moranegg unclear part to me mentioned in the diff description here
All this chunk… | |||||
Not Done Inline Actionsso here the answer is keep the metadata as an xml blob. moranegg: so here the answer is keep the metadata as an xml blob.
This way we loose less information. | |||||
Done Inline Actionslose ;) ok then in my previous comment, i said that we need to open another task to fix the current deposit loader (which actually means fix the private deposit read api from the deposit server, implementation detail again ;). ardumont: `lose` ;)
ok then in my previous comment, i said that we need to open another task to fix the… | |||||
Done Inline ActionsAlso discrepancy anyway in the end... well, i guess, we could parse the "swh_id_context" column instead to retrieve the revision (in the "anchor" attribute) or even the snapshot (visit) if we need to... ardumont: Also discrepancy anyway in the end...
The deposit loader sends the metadata on the revision… | |||||
Done Inline Actions
but we need then the client to send that one though... ardumont: > well, i guess, we could parse the "swh_id_context" ...
but we need then the client to send… | |||||
Not Done Inline Actionsan update of the metadata is only in the REMD, not in the storage, so no metadata on revisions. moranegg: an update of the metadata is only in the REMD, not in the storage, so no metadata on revisions. | |||||
Done Inline Actionsok, so here is the kicker ;) Tthe loader deposit, when sending the metadata to the metadata storage, should attach those to the directory SWHID and not the revision like it is currently doing. @vlorentz heads up ^ Question: Is the current metadata revision migration script already doing that change? ardumont: ok, so here is the kicker ;)
Tthe loader deposit, when sending the metadata to the metadata… | |||||
Not Done Inline Actionsooooh... now I get it. This is another big issue !! I didn't see coming, and I might need to check @vlorentz's script: Maybe it should be on all nodes? moranegg: ooooh... now I get it.
With the content deposit, the metadata is sent to the loader to be added… | |||||
) | |||||
Done Inline Actionsardumont: (repeat) I actually am unsure what authority to specify with the metadata...
@vlorentz… | |||||
Not Done Inline ActionsThe authority is the entity that gave the information. Let's check with @vlorentz how to format correctly the authority, because it shouldn't be just a string, but maybe it is. moranegg: The authority is the entity that gave the information.
In this case it's the client.
Let's… | |||||
# write to metadata storage | |||||
self.storage.metadata_authority_add([metadata_authority]) | |||||
Not Done Inline Actionsshouldn't we check that the authority isn't already in the REMD storage? REMD = Raw Extrinsic Metadata Storage same question for 249 about the fetcher (which is the tool, right?)? moranegg: shouldn't we check that the authority isn't already in the REMD storage?
REMD = Raw Extrinsic… | |||||
Not Done Inline Actionsyes. This could be done with the filtering proxy, though. vlorentz: yes. This could be done with the filtering proxy, though. | |||||
Done Inline Actions
you tell me ;) Actually, it should already be there and the add call is idempotent so that should be fine.
same answer. I've kept the same information provided to the deposit loader.
yes, that's the tool defined. @vlorentz should have some valuable input here ^ ardumont: > shouldn't we check that the authority isn't already in the REMD storage?
you tell me ;)… | |||||
self.storage.metadata_fetcher_add([metadata_fetcher]) | |||||
self.storage.raw_extrinsic_metadata_add([metadata_object]) | |||||
# replace metadata within the deposit backend | |||||
deposit = Deposit.objects.get(pk=deposit_id) | |||||
self._deposit_request_put( | |||||
deposit, {METADATA_KEY: metadata}, replace_metadata=True, | |||||
) | |||||
return { | |||||
"deposit_id": deposit_id, | |||||
"deposit_date": discovery_date, | |||||
"status": deposit.status, | |||||
"archive": None, | |||||
} | |||||
def process_post( | def process_post( | ||||
self, | self, | ||||
request, | request, | ||||
headers: Dict, | headers: Dict, | ||||
collection_name: str, | collection_name: str, | ||||
deposit_id: Optional[int] = None, | deposit_id: Optional[int] = None, | ||||
) -> Tuple[int, str, Dict]: | ) -> Tuple[int, str, Dict]: | ||||
"""Add new metadata/archive to existing deposit. | """Add new metadata/archive to existing deposit. | ||||
▲ Show 20 Lines • Show All 46 Lines • Show Last 20 Lines |
missing test for another diff.