Changeset View
Standalone View
swh/deposit/client.py
# Copyright (C) 2017-2020 The Software Heritage developers | # Copyright (C) 2017-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
"""Module in charge of defining an swh-deposit client | """Module in charge of defining an swh-deposit client | ||||
""" | """ | ||||
from abc import ABCMeta, abstractmethod | from abc import ABCMeta, abstractmethod | ||||
import hashlib | import hashlib | ||||
import logging | import logging | ||||
import os | import os | ||||
from typing import Any, Dict | from typing import Any, Dict, Optional | ||||
from urllib.parse import urljoin | from urllib.parse import urljoin | ||||
import requests | import requests | ||||
import xmltodict | import xmltodict | ||||
from swh.core.config import load_from_envvar | from swh.core.config import load_from_envvar | ||||
logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
def compute_unified_information( | |||||
collection: str, | |||||
in_progress: bool, | |||||
slug: str, | |||||
*, | |||||
filepath: Optional[str] = None, | |||||
swhid: Optional[str] = None, | |||||
**kwargs, | |||||
) -> Dict[str, Any]: | |||||
vlorentz: Sorry, but that's not a great function name/docstring.
It computes what information? Based on what file? And used for what? | |||||
Done Inline Actions
I don't think the old name, self._compute_information, was great either... Note that this is existing code... I moved it around because I had typing issues that I tried to make sense of...
From the input params of the method (which are not exactly the same, unfortunately), it tries to compute a unified information dict which will serve as input for the methods called subsequently. That information is detailed in the Returns section (the "dict with keys" part of the docstring).
Either the archive_path or the metadata_path passed along by the client using this code.
Already answered in my previous point. ardumont: > sorry but that's not a great function name/docstring.
i don't think… | |||||
"""Given a filepath, compute necessary information on that file. | |||||
Args: | |||||
collection: Deposit collection | |||||
in_progress: do we finalize the deposit? | |||||
slug: external id to use | |||||
filepath: Path to the file to compute the necessary information out of | |||||
swhid: Deposit swhid if any | |||||
Returns: | |||||
dict with keys: | |||||
'slug': external id to use | |||||
'in_progress': do we finalize the deposit? | |||||
'content-type': content type associated | |||||
'md5sum': md5 sum | |||||
'filename': filename | |||||
'filepath': filepath | |||||
'swhid': deposit swhid | |||||
""" | |||||
result: Dict[str, Any] = { | |||||
"slug": slug, | |||||
"in_progress": in_progress, | |||||
"swhid": swhid, | |||||
} | |||||
content_type: Optional[str] = None | |||||
md5sum: Optional[str] = None | |||||
if filepath: | |||||
filename = os.path.basename(filepath) | |||||
md5sum = hashlib.md5(open(filepath, "rb").read()).hexdigest() | |||||
extension = filename.split(".")[-1] | |||||
if "zip" in extension: | |||||
content_type = "application/zip" | |||||
else: | |||||
content_type = "application/x-tar" | |||||
result.update( | |||||
{ | |||||
"content-type": content_type, | |||||
"md5sum": md5sum, | |||||
"filename": filename, | |||||
"filepath": filepath, | |||||
} | |||||
) | |||||
return result | |||||
class MaintenanceError(ValueError): | class MaintenanceError(ValueError): | ||||
"""Informational maintenance error exception | """Informational maintenance error exception | ||||
""" | """ | ||||
pass | pass | ||||
▲ Show 20 Lines • Show All 205 Lines • ▼ Show 20 Lines | class BaseDepositClient(BaseApiDepositClient, metaclass=ABCMeta): | ||||
@abstractmethod | @abstractmethod | ||||
def parse_result_ok(self, xml_content): | def parse_result_ok(self, xml_content): | ||||
"""Given an xml result from the api endpoint, parse it and returns a | """Given an xml result from the api endpoint, parse it and returns a | ||||
dict. | dict. | ||||
""" | """ | ||||
pass | pass | ||||
def compute_information(self, *args, **kwargs): | def compute_information(self, *args, **kwargs) -> Dict[str, Any]: | ||||
"""Compute some more information given the inputs (e.g http headers, | """Compute some more information given the inputs (e.g http headers, | ||||
...) | ...) | ||||
""" | """ | ||||
return {} | return {} | ||||
def parse_result_error(self, xml_content): | def parse_result_error(self, xml_content): | ||||
"""Given an error response in xml, parse it into a dict. | """Given an error response in xml, parse it into a dict. | ||||
▲ Show 20 Lines • Show All 155 Lines • ▼ Show 20 Lines | def parse_result_ok(self, xml_content): | ||||
keys=[ | keys=[ | ||||
"deposit_id", | "deposit_id", | ||||
"deposit_status", | "deposit_status", | ||||
"deposit_status_detail", | "deposit_status_detail", | ||||
"deposit_date", | "deposit_date", | ||||
], | ], | ||||
) | ) | ||||
def _compute_information( | def compute_headers(self, info: Dict[str, Any]) -> Dict[str, Any]: | ||||
self, collection, filepath, in_progress, slug, is_archive=True | |||||
): | |||||
"""Given a filepath, compute necessary information on that file. | |||||
Args: | |||||
filepath (str): Path to a file | |||||
is_archive (bool): is it an archive or not? | |||||
Returns: | |||||
dict with keys: | |||||
'content-type': content type associated | |||||
'md5sum': md5 sum | |||||
'filename': filename | |||||
""" | |||||
filename = os.path.basename(filepath) | |||||
if is_archive: | |||||
md5sum = hashlib.md5(open(filepath, "rb").read()).hexdigest() | |||||
extension = filename.split(".")[-1] | |||||
if "zip" in extension: | |||||
content_type = "application/zip" | |||||
else: | |||||
content_type = "application/x-tar" | |||||
else: | |||||
content_type = None | |||||
md5sum = None | |||||
return { | |||||
"slug": slug, | |||||
"in_progress": in_progress, | |||||
"content-type": content_type, | |||||
"md5sum": md5sum, | |||||
"filename": filename, | |||||
"filepath": filepath, | |||||
} | |||||
def compute_information( | |||||
self, collection, filepath, in_progress, slug, is_archive=True, **kwargs | |||||
): | |||||
info = self._compute_information( | |||||
collection, filepath, in_progress, slug, is_archive=is_archive | |||||
) | |||||
info["headers"] = self.compute_headers(info) | |||||
return info | return info | ||||
Done Inline Actions those compute_information methods come from this implementation, which got removed because:
So now it's removed, and the other subclasses that need it properly implement it. ardumont: those `compute_information` methods are coming from this implementation which got removed… | |||||
def do_execute(self, method, url, info): | def do_execute(self, method, url, info): | ||||
with open(info["filepath"], "rb") as f: | with open(info["filepath"], "rb") as f: | ||||
return self.do(method, url, data=f, headers=info["headers"]) | return self.do(method, url, data=f, headers=info["headers"]) | ||||
class CreateArchiveDepositClient(BaseCreateDepositClient): | class CreateArchiveDepositClient(BaseCreateDepositClient): | ||||
"""Post an archive (binary) deposit client.""" | """Post an archive (binary) deposit client.""" | ||||
def compute_headers(self, info): | def compute_headers(self, info): | ||||
return { | return { | ||||
"SLUG": info["slug"], | "SLUG": info["slug"], | ||||
"CONTENT_MD5": info["md5sum"], | "CONTENT_MD5": info["md5sum"], | ||||
"IN-PROGRESS": str(info["in_progress"]), | "IN-PROGRESS": str(info["in_progress"]), | ||||
"CONTENT-TYPE": info["content-type"], | "CONTENT-TYPE": info["content-type"], | ||||
"CONTENT-DISPOSITION": "attachment; filename=%s" % (info["filename"],), | "CONTENT-DISPOSITION": "attachment; filename=%s" % (info["filename"],), | ||||
} | } | ||||
def compute_information(self, *args, **kwargs) -> Dict[str, Any]: | |||||
info = compute_unified_information( | |||||
*args, filepath=kwargs["archive_path"], **kwargs | |||||
) | |||||
info["headers"] = self.compute_headers(info) | |||||
return info | |||||
class UpdateArchiveDepositClient(CreateArchiveDepositClient): | class UpdateArchiveDepositClient(CreateArchiveDepositClient): | ||||
"""Update (add/replace) an archive (binary) deposit client.""" | """Update (add/replace) an archive (binary) deposit client.""" | ||||
def compute_url(self, collection, *args, deposit_id=None, **kwargs): | def compute_url(self, collection, *args, deposit_id=None, **kwargs): | ||||
return "/%s/%s/media/" % (collection, deposit_id) | return "/%s/%s/media/" % (collection, deposit_id) | ||||
def compute_method(self, *args, replace=False, **kwargs): | def compute_method(self, *args, replace=False, **kwargs): | ||||
return "put" if replace else "post" | return "put" if replace else "post" | ||||
class CreateMetadataDepositClient(BaseCreateDepositClient): | class CreateMetadataDepositClient(BaseCreateDepositClient): | ||||
"""Post a metadata deposit client.""" | """Post a metadata deposit client.""" | ||||
def compute_headers(self, info): | def compute_headers(self, info): | ||||
return { | return { | ||||
"SLUG": info["slug"], | "SLUG": info["slug"], | ||||
"IN-PROGRESS": str(info["in_progress"]), | "IN-PROGRESS": str(info["in_progress"]), | ||||
"CONTENT-TYPE": "application/atom+xml;type=entry", | "CONTENT-TYPE": "application/atom+xml;type=entry", | ||||
} | } | ||||
def compute_information(self, *args, **kwargs) -> Dict[str, Any]: | |||||
info = compute_unified_information( | |||||
*args, filepath=kwargs["metadata_path"], **kwargs | |||||
) | |||||
info["headers"] = self.compute_headers(info) | |||||
return info | |||||
class UpdateMetadataDepositClient(CreateMetadataDepositClient): | |||||
"""Update (add/replace) a metadata deposit client.""" | class UpdateMetadataOnPartialDepositClient(CreateMetadataDepositClient): | ||||
"""Update (add/replace) metadata on partial deposit scenario.""" | |||||
def compute_url(self, collection, *args, deposit_id=None, **kwargs): | def compute_url(self, collection, *args, deposit_id=None, **kwargs): | ||||
return "/%s/%s/metadata/" % (collection, deposit_id) | return f"/{collection}/{deposit_id}/metadata/" | ||||
def compute_method(self, *args, replace=False, **kwargs): | def compute_method(self, *args, replace: bool = False, **kwargs) -> str: | ||||
return "put" if replace else "post" | return "put" if replace else "post" | ||||
class UpdateMetadataOnDoneDepositClient(UpdateMetadataOnPartialDepositClient): | |||||
"""Update metadata on "done" deposit. This requires the deposit swhid.""" | |||||
def compute_headers(self, info: Dict[str, Any]) -> Dict[str, Any]: | |||||
return { | |||||
"CONTENT-TYPE": "application/atom+xml;type=entry", | |||||
"X_CHECK_SWHID": info["swhid"], | |||||
} | |||||
def compute_method(self, *args, **kwargs) -> str: | |||||
return "put" | |||||
class CreateMultipartDepositClient(BaseCreateDepositClient): | class CreateMultipartDepositClient(BaseCreateDepositClient): | ||||
"""Create a multipart deposit client.""" | """Create a multipart deposit client.""" | ||||
def _multipart_info(self, info, info_meta): | def _multipart_info(self, info, info_meta): | ||||
files = [ | files = [ | ||||
( | ( | ||||
"file", | "file", | ||||
(info["filename"], open(info["filepath"], "rb"), info["content-type"]), | (info["filename"], open(info["filepath"], "rb"), info["content-type"]), | ||||
Show All 11 Lines | def _multipart_info(self, info, info_meta): | ||||
headers = { | headers = { | ||||
"SLUG": info["slug"], | "SLUG": info["slug"], | ||||
"CONTENT_MD5": info["md5sum"], | "CONTENT_MD5": info["md5sum"], | ||||
"IN-PROGRESS": str(info["in_progress"]), | "IN-PROGRESS": str(info["in_progress"]), | ||||
} | } | ||||
return files, headers | return files, headers | ||||
def compute_information( | def compute_information(self, *args, **kwargs) -> Dict[str, Any]: | ||||
self, collection, archive, metadata, in_progress, slug, **kwargs | info = compute_unified_information(*args, filepath=kwargs["archive_path"],) | ||||
): | info_meta = compute_unified_information( | ||||
info = self._compute_information(collection, archive, in_progress, slug) | *args, filepath=kwargs["metadata_path"], | ||||
info_meta = self._compute_information( | |||||
collection, metadata, in_progress, slug, is_archive=False | |||||
) | ) | ||||
files, headers = self._multipart_info(info, info_meta) | files, headers = self._multipart_info(info, info_meta) | ||||
return {"files": files, "headers": headers} | return {"files": files, "headers": headers} | ||||
def do_execute(self, method, url, info): | def do_execute(self, method, url, info): | ||||
return self.do(method, url, files=info["files"], headers=info["headers"]) | return self.do(method, url, files=info["files"], headers=info["headers"]) | ||||
Show All 9 Lines | |||||
class PublicApiDepositClient(BaseApiDepositClient): | class PublicApiDepositClient(BaseApiDepositClient): | ||||
"""Public api deposit client.""" | """Public api deposit client.""" | ||||
def service_document(self): | def service_document(self): | ||||
"""Retrieve service document endpoint's information.""" | """Retrieve service document endpoint's information.""" | ||||
return ServiceDocumentDepositClient(self.config).execute() | return ServiceDocumentDepositClient(self.config).execute() | ||||
def deposit_status(self, collection, deposit_id): | def deposit_status(self, collection: str, deposit_id: int): | ||||
"""Retrieve status information on a deposit.""" | """Retrieve status information on a deposit.""" | ||||
return StatusDepositClient(self.config).execute(collection, deposit_id) | return StatusDepositClient(self.config).execute(collection, deposit_id) | ||||
def deposit_create( | def deposit_create( | ||||
self, collection, slug, archive=None, metadata=None, in_progress=False | self, | ||||
collection: str, | |||||
slug: str, | |||||
archive: Optional[str] = None, | |||||
metadata: Optional[str] = None, | |||||
in_progress: bool = False, | |||||
): | ): | ||||
"""Create a new deposit (archive, metadata, both as multipart).""" | """Create a new deposit (archive, metadata, both as multipart).""" | ||||
if archive and not metadata: | if archive and not metadata: | ||||
return CreateArchiveDepositClient(self.config).execute( | return CreateArchiveDepositClient(self.config).execute( | ||||
collection, archive, in_progress, slug | collection, in_progress, slug, archive_path=archive | ||||
) | ) | ||||
elif not archive and metadata: | elif not archive and metadata: | ||||
return CreateMetadataDepositClient(self.config).execute( | return CreateMetadataDepositClient(self.config).execute( | ||||
collection, metadata, in_progress, slug, is_archive=False | collection, in_progress, slug, metadata_path=metadata | ||||
) | ) | ||||
else: | else: | ||||
return CreateMultipartDepositClient(self.config).execute( | return CreateMultipartDepositClient(self.config).execute( | ||||
collection, archive, metadata, in_progress, slug | collection, | ||||
in_progress, | |||||
slug, | |||||
archive_path=archive, | |||||
metadata_path=metadata, | |||||
) | ) | ||||
def deposit_update( | def deposit_update( | ||||
self, | self, | ||||
collection, | collection: str, | ||||
deposit_id, | deposit_id: int, | ||||
slug, | slug: str, | ||||
archive=None, | archive: Optional[str] = None, | ||||
metadata=None, | metadata: Optional[str] = None, | ||||
in_progress=False, | in_progress: bool = False, | ||||
replace=False, | replace: bool = False, | ||||
swhid: Optional[str] = None, | |||||
): | ): | ||||
"""Update (add/replace) existing deposit (archive, metadata, both).""" | """Update (add/replace) existing deposit (archive, metadata, both).""" | ||||
r = self.deposit_status(collection, deposit_id) | r = self.deposit_status(collection, deposit_id) | ||||
if "error" in r: | if "error" in r: | ||||
return r | return r | ||||
status = r["deposit_status"] | status = r["deposit_status"] | ||||
if status != "partial": | if swhid is None and status != "partial": | ||||
return { | return { | ||||
"error": "You can only act on deposit with status 'partial'", | "error": "You can only act on deposit with status 'partial'", | ||||
"detail": "The deposit %s has status '%s'" % (deposit_id, status), | "detail": f"The deposit {deposit_id} has status '{status}'", | ||||
"deposit_status": status, | |||||
"deposit_id": deposit_id, | |||||
} | |||||
if swhid is not None and status != "done": | |||||
return { | |||||
"error": "You can only update metadata on deposit with status 'done'", | |||||
"detail": f"The deposit {deposit_id} has status '{status}'", | |||||
"deposit_status": status, | "deposit_status": status, | ||||
"deposit_id": deposit_id, | "deposit_id": deposit_id, | ||||
} | } | ||||
if archive and not metadata: | if archive and not metadata: | ||||
r = UpdateArchiveDepositClient(self.config).execute( | r = UpdateArchiveDepositClient(self.config).execute( | ||||
collection, | collection, | ||||
archive, | |||||
in_progress, | in_progress, | ||||
slug, | slug, | ||||
deposit_id=deposit_id, | deposit_id=deposit_id, | ||||
archive_path=archive, | |||||
replace=replace, | replace=replace, | ||||
) | ) | ||||
elif not archive and metadata: | elif not archive and metadata and swhid is None: | ||||
r = UpdateMetadataDepositClient(self.config).execute( | r = UpdateMetadataOnPartialDepositClient(self.config).execute( | ||||
collection, | collection, | ||||
metadata, | |||||
in_progress, | in_progress, | ||||
slug, | slug, | ||||
deposit_id=deposit_id, | deposit_id=deposit_id, | ||||
metadata_path=metadata, | |||||
replace=replace, | replace=replace, | ||||
) | ) | ||||
elif not archive and metadata and swhid is not None: | |||||
r = UpdateMetadataOnDoneDepositClient(self.config).execute( | |||||
collection, | |||||
in_progress, | |||||
slug, | |||||
deposit_id=deposit_id, | |||||
metadata_path=metadata, | |||||
swhid=swhid, | |||||
) | |||||
else: | else: | ||||
r = UpdateMultipartDepositClient(self.config).execute( | r = UpdateMultipartDepositClient(self.config).execute( | ||||
collection, | collection, | ||||
archive, | |||||
metadata, | |||||
in_progress, | in_progress, | ||||
slug, | slug, | ||||
deposit_id=deposit_id, | deposit_id=deposit_id, | ||||
archive_path=archive, | |||||
metadata_path=metadata, | |||||
replace=replace, | replace=replace, | ||||
) | ) | ||||
if "error" in r: | if "error" in r: | ||||
return r | return r | ||||
return self.deposit_status(collection, deposit_id) | return self.deposit_status(collection, deposit_id) |
Sorry, but that's not a great function name/docstring.
It computes what information? Based on what file? And used for what?