diff --git a/swh/deposit/cli/client.py b/swh/deposit/cli/client.py index 6965dcc6..1f906be4 100644 --- a/swh/deposit/cli/client.py +++ b/swh/deposit/cli/client.py @@ -1,519 +1,530 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from __future__ import annotations import logging # WARNING: do not import unnecessary things here to keep cli startup time under # control import os import sys from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple import click from swh.deposit.cli import deposit logger = logging.getLogger(__name__) if TYPE_CHECKING: from swh.deposit.client import PublicApiDepositClient class InputError(ValueError): """Input script error """ pass def generate_slug() -> str: """Generate a slug (sample purposes). """ import uuid return str(uuid.uuid4()) def _url(url: str) -> str: """Force the /1 api version at the end of the url (avoiding confusing issues without it). Args: url (str): api url used by cli users Returns: Top level api url to actually request """ if not url.endswith("/1"): url = "%s/1" % url return url def generate_metadata_file( name: str, external_id: str, authors: List[str], temp_dir: str ) -> str: """Generate a temporary metadata file with the minimum required metadata This generates a xml file in a temporary location and returns the path to that file. This is up to the client of that function to clean up the temporary file. Args: name: Software's name external_id: External identifier (slug) or generated one authors: List of author names Returns: Filepath to the metadata generated file """ import xmltodict path = os.path.join(temp_dir, "metadata.xml") # generate a metadata file with the minimum required metadata codemetadata = { "entry": { "@xmlns": "http://www.w3.org/2005/Atom", "@xmlns:codemeta": "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0", "codemeta:name": name, "codemeta:identifier": external_id, "codemeta:author": [ {"codemeta:name": author_name} for author_name in authors ], }, } logging.debug("Temporary file: %s", path) logging.debug("Metadata dict to generate as xml: %s", codemetadata) s = xmltodict.unparse(codemetadata, pretty=True) logging.debug("Metadata dict as xml generated: %s", s) with open(path, "w") as fp: fp.write(s) return path def _client(url: str, username: str, password: str) -> PublicApiDepositClient: """Instantiate a client to access the deposit api server Args: url (str): Deposit api server username (str): User password (str): User's password """ from swh.deposit.client import PublicApiDepositClient return PublicApiDepositClient( {"url": url, "auth": {"username": username, "password": password},} ) def _collection(client: PublicApiDepositClient) -> str: """Retrieve the client's collection """ # retrieve user's collection sd_content = client.service_document() if "error" in sd_content: raise InputError("Service document retrieval: %s" % (sd_content["error"],)) collection = sd_content["service"]["workspace"]["collection"]["sword:name"] return collection def client_command_parse_input( username: str, password: str, archive: Optional[str], metadata: Optional[str], archive_deposit: bool, metadata_deposit: bool, collection: Optional[str], slug: Optional[str], partial: bool, deposit_id: Optional[int], + swhid: Optional[str], replace: bool, url: str, name: Optional[str], authors: List[str], temp_dir: str, ) -> Dict[str, Any]: """Parse the client subcommand options and make sure the combination is acceptable*. If not, an InputError exception is raised explaining the issue. By acceptable, we mean: - A multipart deposit (create or update) requires: - an existing software archive - an existing metadata file or author(s) and name provided in params - A binary deposit (create/update) requires an existing software archive - A metadata deposit (create/update) requires an existing metadata file or author(s) and name provided in params - A deposit update requires a deposit_id This will not prevent all failure cases though. The remaining errors are already dealt with by the underlying api client. Raises: InputError explaining the user input related issue MaintenanceError explaining the api status Returns: dict with the following keys: 'archive': the software archive to deposit 'username': username 'password': associated password 'metadata': the metadata file to deposit 'collection': the username's associated client 'slug': the slug or external id identifying the deposit to make 'partial': if the deposit is partial or not 'client': instantiated class 'url': deposit's server main entry point 'deposit_type': deposit's type (binary, multipart, metadata) 'deposit_id': optional deposit identifier + 'swhid': optional deposit swhid """ if archive_deposit and metadata_deposit: # too many flags use, remove redundant ones (-> multipart deposit) archive_deposit = False metadata_deposit = False if not slug: # generate one as this is mandatory slug = generate_slug() if not metadata: if metadata_deposit: raise InputError( "Metadata deposit must be provided for metadata " "deposit, either a filepath with --metadata or --name and --author" ) if name and authors: metadata = generate_metadata_file(name, slug, authors, temp_dir) elif not archive_deposit and not partial and not deposit_id: # If we meet all the following conditions: # * this is not an archive-only deposit request # * it is not part of a multipart deposit (either create/update # or finish) # * it misses either name or authors raise InputError( "For metadata deposit request, either a metadata file with " "--metadata or both --author and --name must be provided. " "If this is an archive deposit request, none is required." ) elif name or authors: # If we are generating metadata, then all mandatory metadata # must be present raise InputError( "For metadata deposit request, either a metadata file with " "--metadata or both --author and --name must be provided." ) else: # TODO: this is a multipart deposit, we might want to check that # metadata are deposited at some point pass elif name or authors: raise InputError( "Using --metadata flag is incompatible with both " "--author and --name (Those are used to generate one metadata file)." ) if metadata_deposit: archive = None if archive_deposit: metadata = None if not archive and not metadata and partial: raise InputError( "Please provide an actionable command. See --help for more information" ) if replace and not deposit_id: raise InputError("To update an existing deposit, you must provide its id") client = _client(url, username, password) if not collection: collection = _collection(client) return { "archive": archive, "username": username, "password": password, "metadata": metadata, "collection": collection, "slug": slug, "in_progress": partial, "client": client, "url": url, "deposit_id": deposit_id, + "swhid": swhid, "replace": replace, } def _subdict(d: Dict[str, Any], keys: Tuple[str, ...]) -> Dict[str, Any]: "return a dict from d with only given keys" return {k: v for k, v in d.items() if k in keys} def deposit_create(config: Dict[str, Any]) -> Dict[str, Any]: """Delegate the actual deposit to the deposit client. """ logger.debug("Create deposit") client = config["client"] keys = ("collection", "archive", "metadata", "slug", "in_progress") return client.deposit_create(**_subdict(config, keys)) def deposit_update(config: Dict[str, Any]) -> Dict[str, Any]: """Delegate the actual deposit to the deposit client. """ logger.debug("Update deposit") client = config["client"] keys = ( "collection", "deposit_id", "archive", "metadata", "slug", "in_progress", "replace", + "swhid", ) return client.deposit_update(**_subdict(config, keys)) @deposit.command() @click.option("--username", required=True, help="(Mandatory) User's name") @click.option( "--password", required=True, help="(Mandatory) User's associated password" ) @click.option( "--archive", type=click.Path(exists=True), help="(Optional) Software archive to deposit", ) @click.option( "--metadata", type=click.Path(exists=True), help=( "(Optional) Path to xml metadata file. If not provided, " "this will use a file named .metadata.xml" ), ) # noqa @click.option( "--archive-deposit/--no-archive-deposit", default=False, help="(Optional) Software archive only deposit", ) @click.option( "--metadata-deposit/--no-metadata-deposit", default=False, help="(Optional) Metadata only deposit", ) @click.option( "--collection", help="(Optional) User's collection. If not provided, this will be fetched.", ) # noqa @click.option( "--slug", help=( "(Optional) External system information identifier. " "If not provided, it will be generated" ), ) # noqa @click.option( "--partial/--no-partial", default=False, help=( "(Optional) The deposit will be partial, other deposits " "will have to take place to finalize it." ), ) # noqa @click.option( "--deposit-id", default=None, help="(Optional) Update an existing partial deposit with its identifier", ) # noqa +@click.option( + "--swhid", + default=None, + help="(Optional) Update existing completed deposit (status done) with new metadata", +) @click.option( "--replace/--no-replace", default=False, help="(Optional) Update by replacing existing metadata to a deposit", ) # noqa @click.option( "--url", default="https://deposit.softwareheritage.org", help=( "(Optional) Deposit server api endpoint. By default, " "https://deposit.softwareheritage.org/1" ), ) # noqa @click.option("--verbose/--no-verbose", default=False, help="Verbose mode") @click.option("--name", help="Software name") @click.option( "--author", multiple=True, help="Software author(s), this can be repeated as many times" " as there are authors", ) @click.option( "-f", "--format", "output_format", default="logging", type=click.Choice(["logging", "yaml", "json"]), help="Output format results.", ) @click.pass_context def upload( ctx, username: str, password: str, archive: Optional[str] = None, metadata: Optional[str] = None, archive_deposit: bool = False, metadata_deposit: bool = False, collection: Optional[str] = None, slug: Optional[str] = None, partial: bool = False, deposit_id: Optional[int] = None, + swhid: Optional[str] = None, replace: bool = False, url: str = "https://deposit.softwareheritage.org", verbose: bool = False, name: Optional[str] = None, author: List[str] = [], output_format: Optional[str] = None, ): """Software Heritage Public Deposit Client Create/Update deposit through the command line. More documentation can be found at https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html. """ import tempfile from swh.deposit.client import MaintenanceError url = _url(url) config = {} with tempfile.TemporaryDirectory() as temp_dir: try: logger.debug("Parsing cli options") config = client_command_parse_input( username, password, archive, metadata, archive_deposit, metadata_deposit, collection, slug, partial, deposit_id, + swhid, replace, url, name, author, temp_dir, ) except InputError as e: logger.error("Problem during parsing options: %s", e) sys.exit(1) except MaintenanceError as e: logger.error(e) sys.exit(1) if verbose: logger.info("Parsed configuration: %s", config) deposit_id = config["deposit_id"] if deposit_id: data = deposit_update(config) else: data = deposit_create(config) print_result(data, output_format) @deposit.command() @click.option( "--url", default="https://deposit.softwareheritage.org", help="(Optional) Deposit server api endpoint. By default, " "https://deposit.softwareheritage.org/1", ) @click.option("--username", required=True, help="(Mandatory) User's name") @click.option( "--password", required=True, help="(Mandatory) User's associated password" ) @click.option("--deposit-id", default=None, required=True, help="Deposit identifier.") @click.option( "-f", "--format", "output_format", default="logging", type=click.Choice(["logging", "yaml", "json"]), help="Output format results.", ) @click.pass_context def status(ctx, url, username, password, deposit_id, output_format): """Deposit's status """ from swh.deposit.client import MaintenanceError url = _url(url) logger.debug("Status deposit") try: client = _client(url, username, password) collection = _collection(client) except InputError as e: logger.error("Problem during parsing options: %s", e) sys.exit(1) except MaintenanceError as e: logger.error(e) sys.exit(1) print_result( client.deposit_status(collection=collection, deposit_id=deposit_id), output_format, ) def print_result(data: Dict[str, Any], output_format: Optional[str]) -> None: """Display the result data into a dedicated output format. """ import json import yaml if output_format == "json": click.echo(json.dumps(data)) elif output_format == "yaml": click.echo(yaml.dump(data)) else: logger.info(data) diff --git a/swh/deposit/client.py b/swh/deposit/client.py index edc5f111..d6d36c06 100644 --- a/swh/deposit/client.py +++ b/swh/deposit/client.py @@ -1,646 +1,712 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Module in charge of defining an swh-deposit client """ from abc import ABCMeta, abstractmethod import hashlib import logging import os -from typing import Any, Dict +from typing import Any, Dict, Optional from urllib.parse import urljoin import requests import xmltodict from swh.core.config import load_from_envvar logger = logging.getLogger(__name__) +def compute_unified_information( + collection: str, + in_progress: bool, + slug: str, + *, + filepath: Optional[str] = None, + swhid: Optional[str] = None, + **kwargs, +) -> Dict[str, Any]: + """Given a filepath, compute necessary information on that file. + + Args: + collection: Deposit collection + in_progress: do we finalize the deposit? + slug: external id to use + filepath: Path to the file to compute the necessary information out of + swhid: Deposit swhid if any + + Returns: + dict with keys: + + 'slug': external id to use + 'in_progress': do we finalize the deposit? + 'content-type': content type associated + 'md5sum': md5 sum + 'filename': filename + 'filepath': filepath + 'swhid': deposit swhid + + """ + result: Dict[str, Any] = { + "slug": slug, + "in_progress": in_progress, + "swhid": swhid, + } + content_type: Optional[str] = None + md5sum: Optional[str] = None + + if filepath: + filename = os.path.basename(filepath) + md5sum = hashlib.md5(open(filepath, "rb").read()).hexdigest() + extension = filename.split(".")[-1] + if "zip" in extension: + content_type = "application/zip" + else: + content_type = "application/x-tar" + result.update( + { + "content-type": content_type, + "md5sum": md5sum, + "filename": filename, + "filepath": filepath, + } + ) + + return result + + class MaintenanceError(ValueError): """Informational maintenance error exception """ pass def _parse(stream, encoding="utf-8"): """Given a xml stream, parse the result. Args: stream (bytes/text): The stream to parse encoding (str): The encoding to use if to decode the bytes stream Returns: A dict of values corresponding to the parsed xml """ if isinstance(stream, bytes): stream = stream.decode(encoding) data = xmltodict.parse(stream, encoding=encoding, process_namespaces=False) if "entry" in data: data = data["entry"] if "sword:error" in data: data = data["sword:error"] return dict(data) def _parse_with_filter(stream, encoding="utf-8", keys=[]): """Given a xml stream, parse the result and filter with keys. Args: stream (bytes/text): The stream to parse encoding (str): The encoding to use if to decode the bytes stream keys ([str]): Keys to filter the parsed result Returns: A dict of values corresponding to the parsed xml filtered by the keys provided. """ data = _parse(stream, encoding=encoding) m = {} for key in keys: m[key] = data.get(key) return m class BaseApiDepositClient: """Deposit client base class """ def __init__(self, config=None, _client=requests): self.config: Dict[str, Any] = config or load_from_envvar() self._client = _client self.base_url = self.config["url"].strip("/") + "/" auth = self.config["auth"] if auth == {}: self.auth = None else: self.auth = (auth["username"], auth["password"]) def do(self, method, url, *args, **kwargs): """Internal method to deal with requests, possibly with basic http authentication. Args: method (str): supported http methods as in self._methods' keys Returns: The request's execution """ if hasattr(self._client, method): method_fn = getattr(self._client, method) else: raise ValueError("Development error, unsupported method %s" % (method)) if self.auth: kwargs["auth"] = self.auth full_url = urljoin(self.base_url, url.lstrip("/")) return method_fn(full_url, *args, **kwargs) class PrivateApiDepositClient(BaseApiDepositClient): """Private API deposit client to: - read a given deposit's archive(s) - read a given deposit's metadata - update a given deposit's status """ def archive_get(self, archive_update_url, archive): """Retrieve the archive from the deposit to a local directory. Args: archive_update_url (str): The full deposit archive(s)'s raw content to retrieve locally archive (str): the local archive's path where to store the raw content Returns: The archive path to the local archive to load. Or None if any problem arose. """ r = self.do("get", archive_update_url, stream=True) if r.ok: with open(archive, "wb") as f: for chunk in r.iter_content(): f.write(chunk) return archive msg = "Problem when retrieving deposit archive at %s" % (archive_update_url,) logger.error(msg) raise ValueError(msg) def metadata_get(self, metadata_url): """Retrieve the metadata information on a given deposit. Args: metadata_url (str): The full deposit metadata url to retrieve locally Returns: The dictionary of metadata for that deposit or None if any problem arose. """ r = self.do("get", metadata_url) if r.ok: return r.json() msg = "Problem when retrieving metadata at %s" % metadata_url logger.error(msg) raise ValueError(msg) def status_update( self, update_status_url, status, revision_id=None, directory_id=None, origin_url=None, ): """Update the deposit's status. Args: update_status_url (str): the full deposit's archive status (str): The status to update the deposit with revision_id (str/None): the revision's identifier to update to directory_id (str/None): the directory's identifier to update to origin_url (str/None): deposit's associated origin url """ payload = {"status": status} if revision_id: payload["revision_id"] = revision_id if directory_id: payload["directory_id"] = directory_id if origin_url: payload["origin_url"] = origin_url self.do("put", update_status_url, json=payload) def check(self, check_url): """Check the deposit's associated data (metadata, archive(s)) Args: check_url (str): the full deposit's check url """ r = self.do("get", check_url) if r.ok: data = r.json() return data["status"] msg = "Problem when checking deposit %s" % check_url logger.error(msg) raise ValueError(msg) class BaseDepositClient(BaseApiDepositClient, metaclass=ABCMeta): """Base Deposit client to access the public api. """ def __init__(self, config, error_msg=None, empty_result={}): super().__init__(config) self.error_msg = error_msg self.empty_result = empty_result @abstractmethod def compute_url(self, *args, **kwargs): """Compute api url endpoint to query.""" pass @abstractmethod def compute_method(self, *args, **kwargs): """Http method to use on the url""" pass @abstractmethod def parse_result_ok(self, xml_content): """Given an xml result from the api endpoint, parse it and returns a dict. """ pass - def compute_information(self, *args, **kwargs): + def compute_information(self, *args, **kwargs) -> Dict[str, Any]: """Compute some more information given the inputs (e.g http headers, ...) """ return {} def parse_result_error(self, xml_content): """Given an error response in xml, parse it into a dict. Returns: dict with following keys: 'error': The error message 'detail': Some more detail about the error if any """ return _parse_with_filter( xml_content, keys=["summary", "detail", "sword:verboseDescription"] ) def do_execute(self, method, url, info): """Execute the http query to url using method and info information. By default, execute a simple query to url with the http method. Override this in daughter class to improve the default behavior if needed. """ return self.do(method, url) def execute(self, *args, **kwargs) -> Dict[str, Any]: """Main endpoint to prepare and execute the http query to the api. Raises: MaintenanceError if some api maintenance is happening. Returns: Dict of computed api data """ url = self.compute_url(*args, **kwargs) method = self.compute_method(*args, **kwargs) info = self.compute_information(*args, **kwargs) try: r = self.do_execute(method, url, info) except Exception as e: msg = self.error_msg % (url, e) r = self.empty_result r.update( {"error": msg,} ) return r else: if r.ok: if int(r.status_code) == 204: # 204 returns no body return {"status": r.status_code} else: return self.parse_result_ok(r.text) else: error = self.parse_result_error(r.text) empty = self.empty_result error.update(empty) if r.status_code == 503: summary = error.get("summary") detail = error.get("sword:verboseDescription") # Maintenance error if summary and detail: raise MaintenanceError(f"{summary}: {detail}") error.update( {"status": r.status_code,} ) return error class ServiceDocumentDepositClient(BaseDepositClient): """Service Document information retrieval. """ def __init__(self, config): super().__init__( config, error_msg="Service document failure at %s: %s", empty_result={"collection": None}, ) def compute_url(self, *args, **kwargs): return "/servicedocument/" def compute_method(self, *args, **kwargs): return "get" def parse_result_ok(self, xml_content): """Parse service document's success response. """ return _parse(xml_content) class StatusDepositClient(BaseDepositClient): """Status information on a deposit. """ def __init__(self, config): super().__init__( config, error_msg="Status check failure at %s: %s", empty_result={ "deposit_status": None, "deposit_status_detail": None, "deposit_swh_id": None, }, ) def compute_url(self, collection, deposit_id): return "/%s/%s/status/" % (collection, deposit_id) def compute_method(self, *args, **kwargs): return "get" def parse_result_ok(self, xml_content): """Given an xml content as string, returns a deposit dict. """ return _parse_with_filter( xml_content, keys=[ "deposit_id", "deposit_status", "deposit_status_detail", "deposit_swh_id", "deposit_swh_id_context", "deposit_external_id", ], ) class BaseCreateDepositClient(BaseDepositClient): """Deposit client base class to post new deposit. """ def __init__(self, config): super().__init__( config, error_msg="Post Deposit failure at %s: %s", empty_result={"deposit_id": None, "deposit_status": None,}, ) def compute_url(self, collection, *args, **kwargs): return "/%s/" % collection def compute_method(self, *args, **kwargs): return "post" def parse_result_ok(self, xml_content): """Given an xml content as string, returns a deposit dict. """ return _parse_with_filter( xml_content, keys=[ "deposit_id", "deposit_status", "deposit_status_detail", "deposit_date", ], ) - def _compute_information( - self, collection, filepath, in_progress, slug, is_archive=True - ): - """Given a filepath, compute necessary information on that file. - - Args: - filepath (str): Path to a file - is_archive (bool): is it an archive or not? - - Returns: - dict with keys: - 'content-type': content type associated - 'md5sum': md5 sum - 'filename': filename - """ - filename = os.path.basename(filepath) - - if is_archive: - md5sum = hashlib.md5(open(filepath, "rb").read()).hexdigest() - extension = filename.split(".")[-1] - if "zip" in extension: - content_type = "application/zip" - else: - content_type = "application/x-tar" - else: - content_type = None - md5sum = None - - return { - "slug": slug, - "in_progress": in_progress, - "content-type": content_type, - "md5sum": md5sum, - "filename": filename, - "filepath": filepath, - } - - def compute_information( - self, collection, filepath, in_progress, slug, is_archive=True, **kwargs - ): - info = self._compute_information( - collection, filepath, in_progress, slug, is_archive=is_archive - ) - info["headers"] = self.compute_headers(info) + def compute_headers(self, info: Dict[str, Any]) -> Dict[str, Any]: return info def do_execute(self, method, url, info): with open(info["filepath"], "rb") as f: return self.do(method, url, data=f, headers=info["headers"]) class CreateArchiveDepositClient(BaseCreateDepositClient): """Post an archive (binary) deposit client.""" def compute_headers(self, info): return { "SLUG": info["slug"], "CONTENT_MD5": info["md5sum"], "IN-PROGRESS": str(info["in_progress"]), "CONTENT-TYPE": info["content-type"], "CONTENT-DISPOSITION": "attachment; filename=%s" % (info["filename"],), } + def compute_information(self, *args, **kwargs) -> Dict[str, Any]: + info = compute_unified_information( + *args, filepath=kwargs["archive_path"], **kwargs + ) + info["headers"] = self.compute_headers(info) + return info + class UpdateArchiveDepositClient(CreateArchiveDepositClient): """Update (add/replace) an archive (binary) deposit client.""" def compute_url(self, collection, *args, deposit_id=None, **kwargs): return "/%s/%s/media/" % (collection, deposit_id) def compute_method(self, *args, replace=False, **kwargs): return "put" if replace else "post" class CreateMetadataDepositClient(BaseCreateDepositClient): """Post a metadata deposit client.""" def compute_headers(self, info): return { "SLUG": info["slug"], "IN-PROGRESS": str(info["in_progress"]), "CONTENT-TYPE": "application/atom+xml;type=entry", } + def compute_information(self, *args, **kwargs) -> Dict[str, Any]: + info = compute_unified_information( + *args, filepath=kwargs["metadata_path"], **kwargs + ) + info["headers"] = self.compute_headers(info) + return info + -class UpdateMetadataDepositClient(CreateMetadataDepositClient): - """Update (add/replace) a metadata deposit client.""" +class UpdateMetadataOnPartialDepositClient(CreateMetadataDepositClient): + """Update (add/replace) metadata on partial deposit scenario.""" def compute_url(self, collection, *args, deposit_id=None, **kwargs): - return "/%s/%s/metadata/" % (collection, deposit_id) + return f"/{collection}/{deposit_id}/metadata/" - def compute_method(self, *args, replace=False, **kwargs): + def compute_method(self, *args, replace: bool = False, **kwargs) -> str: return "put" if replace else "post" +class UpdateMetadataOnDoneDepositClient(UpdateMetadataOnPartialDepositClient): + """Update metadata on "done" deposit. This requires the deposit swhid.""" + + def compute_headers(self, info: Dict[str, Any]) -> Dict[str, Any]: + return { + "CONTENT-TYPE": "application/atom+xml;type=entry", + "X_CHECK_SWHID": info["swhid"], + } + + def compute_method(self, *args, **kwargs) -> str: + return "put" + + class CreateMultipartDepositClient(BaseCreateDepositClient): """Create a multipart deposit client.""" def _multipart_info(self, info, info_meta): files = [ ( "file", (info["filename"], open(info["filepath"], "rb"), info["content-type"]), ), ( "atom", ( info_meta["filename"], open(info_meta["filepath"], "rb"), "application/atom+xml", ), ), ] headers = { "SLUG": info["slug"], "CONTENT_MD5": info["md5sum"], "IN-PROGRESS": str(info["in_progress"]), } return files, headers - def compute_information( - self, collection, archive, metadata, in_progress, slug, **kwargs - ): - info = self._compute_information(collection, archive, in_progress, slug) - info_meta = self._compute_information( - collection, metadata, in_progress, slug, is_archive=False + def compute_information(self, *args, **kwargs) -> Dict[str, Any]: + info = compute_unified_information(*args, filepath=kwargs["archive_path"],) + info_meta = compute_unified_information( + *args, filepath=kwargs["metadata_path"], ) files, headers = self._multipart_info(info, info_meta) return {"files": files, "headers": headers} def do_execute(self, method, url, info): return self.do(method, url, files=info["files"], headers=info["headers"]) class UpdateMultipartDepositClient(CreateMultipartDepositClient): """Update a multipart deposit client.""" def compute_url(self, collection, *args, deposit_id=None, **kwargs): return "/%s/%s/metadata/" % (collection, deposit_id) def compute_method(self, *args, replace=False, **kwargs): return "put" if replace else "post" class PublicApiDepositClient(BaseApiDepositClient): """Public api deposit client.""" def service_document(self): """Retrieve service document endpoint's information.""" return ServiceDocumentDepositClient(self.config).execute() - def deposit_status(self, collection, deposit_id): + def deposit_status(self, collection: str, deposit_id: int): """Retrieve status information on a deposit.""" return StatusDepositClient(self.config).execute(collection, deposit_id) def deposit_create( - self, collection, slug, archive=None, metadata=None, in_progress=False + self, + collection: str, + slug: str, + archive: Optional[str] = None, + metadata: Optional[str] = None, + in_progress: bool = False, ): """Create a new deposit (archive, metadata, both as multipart).""" if archive and not metadata: return CreateArchiveDepositClient(self.config).execute( - collection, archive, in_progress, slug + collection, in_progress, slug, archive_path=archive ) elif not archive and metadata: return CreateMetadataDepositClient(self.config).execute( - collection, metadata, in_progress, slug, is_archive=False + collection, in_progress, slug, metadata_path=metadata ) else: return CreateMultipartDepositClient(self.config).execute( - collection, archive, metadata, in_progress, slug + collection, + in_progress, + slug, + archive_path=archive, + metadata_path=metadata, ) def deposit_update( self, - collection, - deposit_id, - slug, - archive=None, - metadata=None, - in_progress=False, - replace=False, + collection: str, + deposit_id: int, + slug: str, + archive: Optional[str] = None, + metadata: Optional[str] = None, + in_progress: bool = False, + replace: bool = False, + swhid: Optional[str] = None, ): """Update (add/replace) existing deposit (archive, metadata, both).""" r = self.deposit_status(collection, deposit_id) if "error" in r: return r status = r["deposit_status"] - if status != "partial": + if swhid is None and status != "partial": return { "error": "You can only act on deposit with status 'partial'", - "detail": "The deposit %s has status '%s'" % (deposit_id, status), + "detail": f"The deposit {deposit_id} has status '{status}'", + "deposit_status": status, + "deposit_id": deposit_id, + } + if swhid is not None and status != "done": + return { + "error": "You can only update metadata on deposit with status 'done'", + "detail": f"The deposit {deposit_id} has status '{status}'", "deposit_status": status, "deposit_id": deposit_id, } if archive and not metadata: r = UpdateArchiveDepositClient(self.config).execute( collection, - archive, in_progress, slug, deposit_id=deposit_id, + archive_path=archive, replace=replace, ) - elif not archive and metadata: - r = UpdateMetadataDepositClient(self.config).execute( + elif not archive and metadata and swhid is None: + r = UpdateMetadataOnPartialDepositClient(self.config).execute( collection, - metadata, in_progress, slug, deposit_id=deposit_id, + metadata_path=metadata, replace=replace, ) + elif not archive and metadata and swhid is not None: + r = UpdateMetadataOnDoneDepositClient(self.config).execute( + collection, + in_progress, + slug, + deposit_id=deposit_id, + metadata_path=metadata, + swhid=swhid, + ) else: r = UpdateMultipartDepositClient(self.config).execute( collection, - archive, - metadata, in_progress, slug, deposit_id=deposit_id, + archive_path=archive, + metadata_path=metadata, replace=replace, ) if "error" in r: return r return self.deposit_status(collection, deposit_id) diff --git a/swh/deposit/tests/cli/test_client.py b/swh/deposit/tests/cli/test_client.py index 7239da44..b5064e1d 100644 --- a/swh/deposit/tests/cli/test_client.py +++ b/swh/deposit/tests/cli/test_client.py @@ -1,630 +1,723 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import ast import contextlib import json import logging import os from unittest.mock import MagicMock from click.testing import CliRunner import pytest import yaml from swh.deposit.cli import deposit as cli from swh.deposit.cli.client import InputError, _client, _collection, _url, generate_slug from swh.deposit.client import MaintenanceError, PublicApiDepositClient from swh.deposit.parsers import parse_xml from ..conftest import TEST_USER @pytest.fixture def deposit_config(): return { "url": "https://deposit.swh.test/1", "auth": {"username": "test", "password": "test",}, } @pytest.fixture def datadir(request): """Override default datadir to target main test datadir""" return os.path.join(os.path.dirname(str(request.fspath)), "../data") @pytest.fixture def slug(): return generate_slug() @pytest.fixture def patched_tmp_path(tmp_path, mocker): mocker.patch( "tempfile.TemporaryDirectory", return_value=contextlib.nullcontext(str(tmp_path)), ) return tmp_path @pytest.fixture def cli_runner(): return CliRunner() @pytest.fixture def client_mock_api_down(mocker, slug): """A mock client whose connection with api fails due to maintenance issue """ mock_client = MagicMock() mocker.patch("swh.deposit.cli.client._client", return_value=mock_client) mock_client.service_document.side_effect = MaintenanceError( "Database backend maintenance: Temporarily unavailable, try again later." ) return mock_client def test_cli_url(): assert _url("http://deposit") == "http://deposit/1" assert _url("https://other/1") == "https://other/1" def test_cli_client(): client = _client("http://deposit", "user", "pass") assert isinstance(client, PublicApiDepositClient) def test_cli_collection_error(): mock_client = MagicMock() mock_client.service_document.return_value = {"error": "something went wrong"} with pytest.raises(InputError) as e: _collection(mock_client) assert "Service document retrieval: something went wrong" == str(e.value) def test_cli_collection_ok(deposit_config, requests_mock_datadir): client = PublicApiDepositClient(deposit_config) collection_name = _collection(client) assert collection_name == "test" def test_cli_collection_ko_because_downtime(): mock_client = MagicMock() mock_client.service_document.side_effect = MaintenanceError("downtime") with pytest.raises(MaintenanceError, match="downtime"): _collection(mock_client) def test_cli_deposit_with_server_down_for_maintenance( sample_archive, caplog, client_mock_api_down, slug, patched_tmp_path, cli_runner ): """ Deposit failure due to maintenance down time should be explicit """ result = cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--name", "test-project", "--archive", sample_archive["path"], "--author", "Jane Doe", ], ) assert result.exit_code == 1, result.output assert result.output == "" down_for_maintenance_log_record = ( "swh.deposit.cli.client", logging.ERROR, "Database backend maintenance: Temporarily unavailable, try again later.", ) assert down_for_maintenance_log_record in caplog.record_tuples client_mock_api_down.service_document.assert_called_once_with() def test_cli_single_minimal_deposit( sample_archive, slug, patched_tmp_path, requests_mock_datadir, cli_runner ): """ This ensure a single deposit upload through the cli is fine, cf. https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#single-deposit """ # noqa metadata_path = os.path.join(patched_tmp_path, "metadata.xml") result = cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--name", "test-project", "--archive", sample_archive["path"], "--author", "Jane Doe", "--slug", slug, "--format", "json", ], ) assert result.exit_code == 0, result.output assert json.loads(result.output) == { "deposit_id": "615", "deposit_status": "partial", "deposit_status_detail": None, "deposit_date": "Oct. 8, 2020, 4:57 p.m.", } with open(metadata_path) as fd: assert ( fd.read() == f"""\ \ttest-project \t{slug} \t \t\tJane Doe \t """ ) def test_cli_validation_metadata( sample_archive, caplog, patched_tmp_path, cli_runner, slug ): """Multiple metadata flags scenario (missing, conflicts) properly fails the calls """ metadata_path = os.path.join(patched_tmp_path, "metadata.xml") with open(metadata_path, "a"): pass # creates the file for flag_title_or_name, author_or_name in [ ("--author", "no one"), ("--name", "test-project"), ]: # Test missing author then missing name result = cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--archive", sample_archive["path"], "--slug", slug, flag_title_or_name, author_or_name, ], ) assert result.exit_code == 1, f"unexpected result: {result.output}" assert result.output == "" expected_error_log_record = ( "swh.deposit.cli.client", logging.ERROR, ( "Problem during parsing options: " "For metadata deposit request, either a metadata file with " "--metadata or both --author and --name must be provided. " "If this is an archive deposit request, none is required." ), ) assert expected_error_log_record in caplog.record_tuples # Clear mocking state caplog.clear() # incompatible flags: Test both --metadata and --author, then --metadata and # --name result = cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--name", "test-project", "--deposit-id", 666, "--archive", sample_archive["path"], "--slug", slug, ], ) assert result.exit_code == 1, f"unexpected result: {result.output}" assert result.output == "" expected_error_log_record = ( "swh.deposit.cli.client", logging.ERROR, ( "Problem during parsing options: " "For metadata deposit request, either a metadata file with " "--metadata or both --author and --name must be provided." ), ) assert expected_error_log_record in caplog.record_tuples # Clear mocking state caplog.clear() # incompatible flags check (Test both --metadata and --author, # then --metadata and --name) result = cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--archive", sample_archive["path"], "--metadata", metadata_path, "--author", "Jane Doe", "--slug", slug, ], ) assert result.exit_code == 1, result.output assert result.output == "" expected_error_log_record = ( "swh.deposit.cli.client", logging.ERROR, ( "Problem during parsing options: " "Using --metadata flag is incompatible with both " "--author and --name (Those are used to generate one metadata file)." ), ) assert expected_error_log_record in caplog.record_tuples caplog.clear() def test_cli_validation_no_actionable_command(caplog, cli_runner): """Multiple metadata flags scenario (missing, conflicts) properly fails the calls """ # no actionable command result = cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--partial", ], ) assert result.exit_code == 1, result.output assert result.output == "" expected_error_log_record = ( "swh.deposit.cli.client", logging.ERROR, ( "Problem during parsing options: " "Please provide an actionable command. See --help for more information" ), ) assert expected_error_log_record in caplog.record_tuples def test_cli_validation_missing_metadata_flag(caplog, cli_runner): """--metadata-deposit requires --metadata (or --name and --author) otherwise fails """ # --metadata-deposit without --metadata flag fails result = cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--metadata-deposit", # should fail because missing --metadata flag ], ) assert result.exit_code == 1, result.output assert result.output == "" expected_error_log_record = ( "swh.deposit.cli.client", logging.ERROR, ( "Problem during parsing options: " "Metadata deposit must be provided for metadata " "deposit, either a filepath with --metadata or --name and --author" ), ) assert expected_error_log_record in caplog.record_tuples def test_cli_validation_replace_with_no_deposit_id_fails( sample_archive, caplog, patched_tmp_path, requests_mock_datadir, datadir, cli_runner ): """--replace flags require --deposit-id otherwise fails """ metadata_path = os.path.join(datadir, "atom", "entry-data-deposit-binary.xml") for flags in [ ["--replace"], ["--replace", "--metadata-deposit", "--archive-deposit"], ]: result = cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--metadata", metadata_path, "--archive", sample_archive["path"], ] + flags, ) assert result.exit_code == 1, result.output assert result.output == "" expected_error_log_record = ( "swh.deposit.cli.client", logging.ERROR, ( "Problem during parsing options: " "To update an existing deposit, you must provide its id" ), ) assert expected_error_log_record in caplog.record_tuples def test_cli_single_deposit_slug_generation( sample_archive, patched_tmp_path, requests_mock_datadir, cli_runner ): """Single deposit scenario without providing the slug, the slug is generated nonetheless https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#single-deposit """ # noqa metadata_path = os.path.join(patched_tmp_path, "metadata.xml") result = cli_runner.invoke( cli, [ "upload", "--url", "https://deposit.swh.test/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--name", "test-project", "--archive", sample_archive["path"], "--author", "Jane Doe", "--format", "json", ], ) assert result.exit_code == 0, result.output assert json.loads(result.output) == { "deposit_id": "615", "deposit_status": "partial", "deposit_status_detail": None, "deposit_date": "Oct. 8, 2020, 4:57 p.m.", } with open(metadata_path) as fd: metadata_xml = fd.read() actual_metadata = parse_xml(metadata_xml) assert actual_metadata["codemeta:identifier"] is not None def test_cli_multisteps_deposit( sample_archive, datadir, slug, requests_mock_datadir, cli_runner ): """ First deposit a partial deposit (no metadata, only archive), then update the metadata part. https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#multisteps-deposit """ # noqa api_url = "https://deposit.test.metadata/1" deposit_id = 666 # Create a partial deposit with only 1 archive result = cli_runner.invoke( cli, [ "upload", "--url", api_url, "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--archive", sample_archive["path"], "--partial", "--slug", slug, "--format", "json", ], ) assert result.exit_code == 0, f"unexpected output: {result.output}" actual_deposit = json.loads(result.output) assert actual_deposit == { "deposit_id": str(deposit_id), "deposit_status": "partial", "deposit_status_detail": None, "deposit_date": "Oct. 8, 2020, 4:57 p.m.", } # Update the partial deposit with only 1 archive result = cli_runner.invoke( cli, [ "upload", "--url", api_url, "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--archive", sample_archive["path"], "--deposit-id", deposit_id, "--partial", # in-progress: True, because remains the metadata to upload "--slug", slug, "--format", "json", ], ) assert result.exit_code == 0, f"unexpected output: {result.output}" assert result.output is not None actual_deposit = json.loads(result.output) # deposit update scenario actually returns a deposit status dict assert actual_deposit["deposit_id"] == str(deposit_id) assert actual_deposit["deposit_status"] == "partial" # Update the partial deposit with only some metadata (and then finalize it) # https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html#add-content-or-metadata-to-the-deposit metadata_path = os.path.join(datadir, "atom", "entry-data-deposit-binary.xml") # Update deposit with metadata result = cli_runner.invoke( cli, [ "upload", "--url", api_url, "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--metadata", metadata_path, "--deposit-id", deposit_id, "--slug", slug, "--format", "json", ], # this time, ^ we no longer flag it to partial, so the status changes to # in-progress false ) assert result.exit_code == 0, f"unexpected output: {result.output}" assert result.output is not None actual_deposit = json.loads(result.output) # deposit update scenario actually returns a deposit status dict assert actual_deposit["deposit_id"] == str(deposit_id) # FIXME: should be "deposited" but current limitation in the # requests_mock_datadir_visits use, cannot find a way to make it work right now assert actual_deposit["deposit_status"] == "partial" @pytest.mark.parametrize( "output_format,callable_fn", [ ("json", json.loads), ("yaml", yaml.safe_load), ( "logging", ast.literal_eval, ), # not enough though, the caplog fixture is needed ], ) def test_cli_deposit_status_with_output_format( output_format, callable_fn, datadir, slug, requests_mock_datadir, caplog, cli_runner ): """Check deposit status cli with all possible output formats (json, yaml, logging). """ api_url_basename = "deposit.test.status" deposit_id = 1033 deposit_status_xml_path = os.path.join( datadir, f"https_{api_url_basename}", f"1_test_{deposit_id}_status" ) with open(deposit_status_xml_path, "r") as f: deposit_status_xml = f.read() - expected_deposit_dict = dict(parse_xml(deposit_status_xml)) + expected_deposit_status = dict(parse_xml(deposit_status_xml)) result = cli_runner.invoke( cli, [ "status", "--url", f"https://{api_url_basename}/1", "--username", TEST_USER["username"], "--password", TEST_USER["password"], "--deposit-id", deposit_id, "--format", output_format, ], ) assert result.exit_code == 0, f"unexpected output: {result.output}" if output_format == "logging": assert len(caplog.record_tuples) == 1 # format: (, , ) _, _, result_output = caplog.record_tuples[0] else: result_output = result.output actual_deposit = callable_fn(result_output) - assert actual_deposit == expected_deposit_dict + assert actual_deposit == expected_deposit_status + + +def test_cli_update_metadata_with_swhid_on_completed_deposit( + datadir, requests_mock_datadir, cli_runner +): + """Update new metadata on a completed deposit (status done) is ok + """ + api_url_basename = "deposit.test.updateswhid" + deposit_id = 123 + deposit_status_xml_path = os.path.join( + datadir, f"https_{api_url_basename}", f"1_test_{deposit_id}_status" + ) + with open(deposit_status_xml_path, "r") as f: + deposit_status_xml = f.read() + expected_deposit_status = dict(parse_xml(deposit_status_xml)) + + assert expected_deposit_status["deposit_status"] == "done" + assert expected_deposit_status["deposit_swh_id"] is not None + + result = cli_runner.invoke( + cli, + [ + "upload", + "--url", + f"https://{api_url_basename}/1", + "--username", + TEST_USER["username"], + "--password", + TEST_USER["password"], + "--name", + "test-project", + "--author", + "John Doe", + "--deposit-id", + deposit_id, + "--swhid", + expected_deposit_status["deposit_swh_id"], + "--format", + "json", + ], + ) + assert result.exit_code == 0, result.output + actual_deposit_status = json.loads(result.output) + assert "error" not in actual_deposit_status + assert actual_deposit_status == expected_deposit_status + + +def test_cli_update_metadata_with_swhid_on_other_status_deposit( + datadir, requests_mock_datadir, cli_runner +): + """Update new metadata with swhid on other deposit status is not possible + """ + api_url_basename = "deposit.test.updateswhid" + deposit_id = 321 + deposit_status_xml_path = os.path.join( + datadir, f"https_{api_url_basename}", f"1_test_{deposit_id}_status" + ) + with open(deposit_status_xml_path, "r") as f: + deposit_status_xml = f.read() + expected_deposit_status = dict(parse_xml(deposit_status_xml)) + assert expected_deposit_status["deposit_status"] != "done" + + result = cli_runner.invoke( + cli, + [ + "upload", + "--url", + f"https://{api_url_basename}/1", + "--username", + TEST_USER["username"], + "--password", + TEST_USER["password"], + "--name", + "test-project", + "--author", + "John Doe", + "--deposit-id", + deposit_id, + "--swhid", + "swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea", + "--format", + "json", + ], + ) + assert result.exit_code == 0, result.output + actual_result = json.loads(result.output) + assert "error" in actual_result + assert actual_result == { + "error": "You can only update metadata on deposit with status 'done'", + "detail": "The deposit 321 has status 'partial'", + "deposit_status": "partial", + "deposit_id": 321, + } diff --git a/swh/deposit/tests/data/https_deposit.test.updateswhid/1_servicedocument b/swh/deposit/tests/data/https_deposit.test.updateswhid/1_servicedocument new file mode 100644 index 00000000..81147fa1 --- /dev/null +++ b/swh/deposit/tests/data/https_deposit.test.updateswhid/1_servicedocument @@ -0,0 +1,26 @@ + + + + 2.0 + 209715200 + + + The Software Heritage (SWH) Archive + + test Software Collection + application/zip + application/x-tar + Collection Policy + Software Heritage Archive + Collect, Preserve, Share + false + false + http://purl.org/net/sword/package/SimpleZip + https://deposit.test.updateswhid/1/test/ + test + + + diff --git a/swh/deposit/tests/data/https_deposit.test.updateswhid/1_test_123_metadata b/swh/deposit/tests/data/https_deposit.test.updateswhid/1_test_123_metadata new file mode 100644 index 00000000..270d91f4 --- /dev/null +++ b/swh/deposit/tests/data/https_deposit.test.updateswhid/1_test_123_metadata @@ -0,0 +1,10 @@ + + 123 + done + The deposit has been successfully loaded into the Software Heritage archive + swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea + swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea;origin=https://www.softwareheritage.org/check-deposit-2020-10-08T13:52:34.509655;visit=swh:1:snp:c477c6ef51833127b13a86ece7d75e5b3cc4e93d;anchor=swh:1:rev:f26f3960c175f15f6e24200171d446b86f6f7230;path=/ + check-deposit-2020-10-08T13:52:34.509655 + diff --git a/swh/deposit/tests/data/https_deposit.test.updateswhid/1_test_123_status b/swh/deposit/tests/data/https_deposit.test.updateswhid/1_test_123_status new file mode 100644 index 00000000..270d91f4 --- /dev/null +++ b/swh/deposit/tests/data/https_deposit.test.updateswhid/1_test_123_status @@ -0,0 +1,10 @@ + + 123 + done + The deposit has been successfully loaded into the Software Heritage archive + swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea + swh:1:dir:ef04a768181417fbc5eef4243e2507915f24deea;origin=https://www.softwareheritage.org/check-deposit-2020-10-08T13:52:34.509655;visit=swh:1:snp:c477c6ef51833127b13a86ece7d75e5b3cc4e93d;anchor=swh:1:rev:f26f3960c175f15f6e24200171d446b86f6f7230;path=/ + check-deposit-2020-10-08T13:52:34.509655 + diff --git a/swh/deposit/tests/data/https_deposit.test.updateswhid/1_test_321_status b/swh/deposit/tests/data/https_deposit.test.updateswhid/1_test_321_status new file mode 100644 index 00000000..557e2167 --- /dev/null +++ b/swh/deposit/tests/data/https_deposit.test.updateswhid/1_test_321_status @@ -0,0 +1,8 @@ + + 321 + partial + The deposit is in partial state + check-deposit-2020-10-08T13:52:34.509655 +