class ApiDepositClient(SWHConfig):
    """Deposit client to:

    - read a given deposit's archive(s)
    - read a given deposit's metadata
    - update a given deposit's status
    - check a given deposit's associated data

    """
    CONFIG_BASE_FILENAME = 'deposit/client'
    DEFAULT_CONFIG = {
        'url': ('str', 'http://localhost:5006'),
        'auth': ('dict', {}),  # with optional 'username'/'password' keys
    }

    # Size in bytes of the chunks written when streaming an archive to
    # disk. Without an explicit size, iter_content() yields 1 byte at a
    # time, which makes large downloads needlessly slow.
    ARCHIVE_CHUNK_SIZE = 64 * 1024

    def __init__(self, config=None, _client=requests):
        """Set up the client's base url and optional credentials.

        Args:
            config (dict/None): configuration with a 'url' key and an
                optional 'auth' key ('username'/'password'). When None,
                the configuration is read through parse_config_file().
            _client: object exposing the http methods as attributes
                ('get', 'put', ...); defaults to the requests module

        """
        super().__init__()
        if config is None:
            self.config = super().parse_config_file()
        else:
            self.config = config

        self._client = _client

        self.base_url = self.config['url']
        # 'auth' is optional: a missing or empty entry means that the
        # requests are sent without authentication
        try:
            auth = self.config['auth']
        except KeyError:
            auth = None
        if auth:
            self.auth = (auth['username'], auth['password'])
        else:
            self.auth = None

    def do(self, method, url, *args, **kwargs):
        """Internal method to deal with requests, possibly with basic http
           authentication.

        Args:
            method (str): name of the http method to call; it must be an
                attribute of the underlying client ('get', 'put', ...)
            url (str): path appended to the configured base url
            *args, **kwargs: passed as is to the client's method

        Raises:
            ValueError if the client does not expose the method

        Returns:
            The request's execution

        """
        method_fn = getattr(self._client, method, None)
        if method_fn is None:
            raise ValueError('Development error, unsupported method %s' % (
                method))

        if self.auth:
            kwargs['auth'] = self.auth

        full_url = '%s%s' % (self.base_url.rstrip('/'), url)
        return method_fn(full_url, *args, **kwargs)

    def archive_get(self, archive_update_url, archive_path, log=None):
        """Retrieve the archive from the deposit to a local directory.

        Args:
            archive_update_url (str): The full deposit archive(s)'s raw
                content to retrieve locally
            archive_path (str): the local archive's path where to store
                the raw content
            log: optional logger used to report the failure

        Raises:
            ValueError if the archive could not be retrieved

        Returns:
            The archive path to the local archive to load.

        """
        r = self.do('get', archive_update_url, stream=True)
        if r.ok:
            with open(archive_path, 'wb') as f:
                chunks = r.iter_content(chunk_size=self.ARCHIVE_CHUNK_SIZE)
                for chunk in chunks:
                    f.write(chunk)

            return archive_path

        msg = 'Problem when retrieving deposit archive at %s' % (
            archive_update_url, )
        if log:
            log.error(msg)

        raise ValueError(msg)

    def metadata_get(self, metadata_url, log=None):
        """Retrieve the metadata information on a given deposit.

        Args:
            metadata_url (str): The full deposit metadata url to retrieve
                locally
            log: optional logger used to report the failure

        Raises:
            ValueError if the metadata could not be retrieved

        Returns:
            The dictionary of metadata for that deposit.

        """
        r = self.do('get', metadata_url)
        if r.ok:
            return r.json()

        msg = 'Problem when retrieving metadata at %s' % metadata_url
        if log:
            log.error(msg)

        raise ValueError(msg)

    def status_update(self, update_status_url, status, revision_id=None):
        """Update the deposit's status.

        Args:
            update_status_url (str): the deposit's status update url
            status (str): The status to update the deposit with
            revision_id (str/None): the revision's identifier to update to

        """
        payload = {'status': status}
        if revision_id:
            payload['revision_id'] = revision_id

        # NOTE(review): the response is not inspected, so a failed update
        # goes unnoticed by the caller -- confirm this is intended.
        self.do('put', update_status_url, json=payload)

    def check(self, check_url, log=None):
        """Check the deposit's associated data (metadata, archive(s))

        Args:
            check_url (str): the full deposit's check url
            log: optional logger used to report the failure

        Raises:
            ValueError if the check request failed

        Returns:
            The 'status' entry of the json response.

        """
        r = self.do('get', check_url)
        if r.ok:
            data = r.json()
            return data['status']

        msg = 'Problem when checking deposit %s' % check_url
        if log:
            log.error(msg)

        raise ValueError(msg)
""" def service_document(self, log=None): sd_url = '/servicedocument/' try: r = self.do('get', sd_url) except Exception as e: msg = 'Service document failure at %s: %s' % (sd_url, e) if log: log.error(msg) return { 'collection': None, 'error': msg, } else: if r.ok: tree = etree.fromstring(r.text) collections = tree.xpath( '/x:service/x:workspace/x:collection', namespaces={'x': 'http://www.w3.org/2007/app'}) items = dict(collections[0].items()) collection = items['href'].rsplit(self.base_url)[1] return { 'collection': collection } else: return { 'collection': None, 'error': r.status_code } def _compute_information(self, filepath, in_progress, slug, is_archive=True): """Given a filepath, compute necessary information on that file. Args: filepath (str): Path to a file is_archive (bool): is it an archive or not? Returns: dict with keys: 'content-type': content type associated 'md5sum': md5 sum 'filename': filename """ md5sum = hashlib.md5(open(filepath, 'rb').read()).hexdigest() filename = os.path.basename(filepath) if is_archive: extension = filename.split('.')[-1] if 'zip' in extension: content_type = 'application/zip' else: content_type = 'application/x-tar' else: content_type = None return { 'slug': slug, 'in_progress': in_progress, 'content-type': content_type, 'md5sum': md5sum, 'filename': filename, 'filepath': filepath, } def _parse_deposit_xml(self, xml_content): """Given an xml content as string, returns a deposit dict. """ tree = etree.fromstring(xml_content.encode('utf-8')) vals = tree.xpath( '/x:entry/x:deposit_id', namespaces={'x': 'http://www.w3.org/2005/Atom'}) deposit_id = vals[0].text vals = tree.xpath( '/x:entry/x:deposit_status', namespaces={'x': 'http://www.w3.org/2005/Atom'}) deposit_status = vals[0].text return {'deposit_id': deposit_id, 'deposit_status': deposit_status} def _parse_deposit_error(self, xml_content): """Parse xml error response to a dict. 
""" tree = etree.fromstring(xml_content.encode('utf-8')) vals = tree.xpath('/x:error/y:summary', namespaces={ 'x': 'http://purl.org/net/sword/', 'y': 'http://www.w3.org/2005/Atom' }) summary = vals[0].text if summary: summary = summary.strip() vals = tree.xpath( '/x:error/x:verboseDescription', namespaces={'x': 'http://purl.org/net/sword/'}) detail = vals[0].text if detail: detail = detail.strip() return {'summary': summary, 'detail': detail} def _compute_deposit_url(self, collection): return '/%s/' % collection def _compute_binary_url(self, collection, deposit_id): return '/%s/%s/media/' % (collection, deposit_id) def _compute_metadata_url(self, collection, deposit_id): return '/%s/%s/metadata/' % (collection, deposit_id) def _compute_multipart_url(self, collection, deposit_id): return self._compute_metadata_url(collection, deposit_id) - def deposit(self, collection, slug, archive_path=None, - metadata_path=None, in_progress=False, log=None): - """Post a new deposit + def deposit_create(self, collection, slug, archive_path=None, + metadata_path=None, in_progress=False, log=None): + """Create a new deposit. 
""" if archive_path and not metadata_path: return self.deposit_binary(collection, archive_path, slug, in_progress, log) elif not archive_path and metadata_path: return self.deposit_metadata(collection, metadata_path, slug, in_progress, log) else: return self.deposit_multipart(collection, archive_path, metadata_path, slug, in_progress, log) def _binary_headers(self, info): return { 'SLUG': info['slug'], 'CONTENT_MD5': info['md5sum'], 'IN-PROGRESS': str(info['in_progress']), 'CONTENT-TYPE': info['content-type'], 'CONTENT-DISPOSITION': 'attachment; filename=%s' % ( info['filename'], ), } def deposit_binary(self, collection, archive_path, slug, in_progress=False, log=None): deposit_url = self._compute_deposit_url(collection) info = self._compute_information(archive_path, in_progress, slug) headers = self._binary_headers(info) try: with open(archive_path, 'rb') as f: r = self.do('post', deposit_url, data=f, headers=headers) except Exception as e: msg = 'Binary posting deposit failure at %s: %s' % (deposit_url, e) if log: log.error(msg) return { 'deposit_id': None, 'error': msg, } else: if r.ok: return self._parse_deposit_xml(r.text) else: error = self._parse_deposit_error(r.text) error.update({ 'deposit_id': None, 'status': r.status_code, }) return error def _metadata_headers(self, info): return { 'SLUG': info['slug'], 'IN-PROGRESS': str(info['in_progress']), 'CONTENT-TYPE': 'application/atom+xml;type=entry', } def deposit_metadata(self, collection, metadata_path, slug, in_progress, log=None): deposit_url = self._compute_deposit_url(collection) headers = self._metadata_headers( {'slug': slug, 'in_progress': in_progress}) try: with open(metadata_path, 'rb') as f: r = self.do('post', deposit_url, data=f, headers=headers) except Exception as e: msg = 'Metadata posting deposit failure at %s: %s' % ( deposit_url, e) if log: log.error(msg) return { 'deposit_id': None, 'error': msg, } else: if r.ok: return self._parse_deposit_xml(r.text) else: error = 
def deposit_multipart(self, collection, archive_path, metadata_path,
                      slug, in_progress, log=None):
    """Create a new deposit holding both an archive and its metadata.

    Args:
        collection (str): collection to deposit into
        archive_path (str): path to the software archive
        metadata_path (str): path to the metadata file
        slug (str): external identifier of the deposit
        in_progress (bool): whether the deposit is partial
        log: optional logger used to report a request failure

    Returns:
        On success, the dict parsed from the server's response. If the
        request raised, a dict with keys 'deposit_id' (None) and
        'error'. If the server answered with an error, the parsed error
        completed with 'deposit_id' (None) and 'status' (http status
        code).

    """
    deposit_url = self._compute_deposit_url(collection)
    info = self._compute_information(archive_path, in_progress, slug)
    info_meta = self._compute_information(
        metadata_path, in_progress, slug, is_archive=False)
    files, headers = self._multipart_info(info, info_meta)

    try:
        r = self.do('post', deposit_url, files=files, headers=headers)
    except Exception as e:
        msg = 'Multipart posting deposit failure at %s: %s' % (
            deposit_url, e)
        if log:
            log.error(msg)

        return {
            'deposit_id': None,
            'error': msg,
        }
    finally:
        # _multipart_info opens the files to send; release them once the
        # request is over, whether it succeeded or not. Each entry is
        # (field name, (filename, file object, content type)).
        for _field, file_info in files:
            file_info[1].close()

    if r.ok:
        return self._parse_deposit_xml(r.text)

    error = self._parse_deposit_error(r.text)
    error.update({
        'deposit_id': None,
        'status': r.status_code,
    })
    return error
""" if archive_path and not metadata_path: return self.deposit_binary_update( collection, deposit_id, archive_path, slug, in_progress, replace, log) elif not archive_path and metadata_path: return self.deposit_metadata_update( collection, deposit_id, metadata_path, slug, in_progress, replace, log) else: return self.deposit_multipart_update( collection, deposit_id, archive_path, metadata_path, slug, in_progress, replace, log) def deposit_binary_update(self, collection, deposit_id, archive_path, slug, in_progress, replace, log=None): method = 'put' if replace else 'post' deposit_url = self._compute_binary_url(collection, deposit_id) info = self._compute_information(archive_path, in_progress, slug) headers = self._binary_headers(info) try: with open(archive_path, 'rb') as f: r = self.do(method, deposit_url, data=f, headers=headers) except Exception as e: msg = 'Binary deposit updating failure at %s: %s' % ( deposit_url, e) if log: log.error(msg) return { 'deposit_id': None, 'error': msg, } else: if r.ok: return self._parse_deposit_xml(r.text) else: error = self._parse_deposit_error(r.text) error.update({ 'deposit_id': None, 'status': r.status_code, }) return error def deposit_metadata_update(self, collection, deposit_id, metadata_path, slug, in_progress, replace, log=None): method = 'put' if replace else 'post' deposit_url = self._compute_metadata_url(collection, deposit_id) headers = self._metadata_headers( {'slug': slug, 'in_progress': in_progress}) try: with open(metadata_path, 'rb') as f: r = self.do(method, deposit_url, data=f, headers=headers) except Exception as e: msg = 'Metadata deposit updating deposit failure at %s: %s' % ( deposit_url, e) if log: log.error(msg) return { 'deposit_id': None, 'error': msg, } else: if r.ok: return self._parse_deposit_xml(r.text) else: error = self._parse_deposit_error(r.text) error.update({ 'deposit_id': None, 'status': r.status_code, }) return error def deposit_multipart_update(self, collection, deposit_id, archive_path, 
def parse_cli_options(archive, username, password, metadata,
                      binary_deposit, metadata_deposit,
                      collection, slug, partial, deposit_id, replace, url):
    """Parse the cli options and make sure the combination is acceptable*.
       If not, an InputError exception is raised explaining the issue.

       By acceptable, we mean:

           - A multipart deposit (create or update) needs both an
             existing software archive and an existing metadata file

           - A binary deposit (create/update) needs an existing
             software archive

           - A metadata deposit (create/update) needs an existing
             metadata file

           - A deposit update needs a deposit_id to be provided

        This won't prevent all failure cases though. The remaining
        errors are already dealt with the underlying api client.

    Raises:
        InputError explaining the issue

    Returns:
        dict with the following keys:

            'archive': the software archive to deposit
            'username': username
            'password': associated password
            'metadata': the metadata file to deposit
            'collection': the username's associated client
            'slug': the slug or external id identifying the deposit to make
            'partial': if the deposit is partial or not
            'client': instantiated class
            'url': deposit's server main entry point
            'deposit_id': optional deposit identifier
            'replace': if the update replaces the existing data or not

    """
    if binary_deposit and metadata_deposit:
        # too many flags use, remove redundant ones (-> multipart deposit)
        binary_deposit = False
        metadata_deposit = False

    if archive and not os.path.exists(archive):
        raise InputError('Software Archive %s must exist!' % archive)

    if archive and not metadata:
        # default metadata file, sitting next to the archive
        metadata = '%s.metadata.xml' % archive

    if metadata_deposit:
        archive = None

    if binary_deposit:
        metadata = None
        if not archive:
            raise InputError(
                'Software Archive filepath must be provided for a binary deposit')  # noqa
    elif not metadata:
        # Checked before os.path.exists, which raises a TypeError (not an
        # InputError) when given None.
        if metadata_deposit:
            raise InputError(
                "Metadata deposit filepath must be provided for a metadata deposit")  # noqa
        raise InputError(
            'A software archive and/or a metadata file must be provided')
    elif not os.path.exists(metadata):
        raise InputError('Software Archive metadata %s must exist!' % metadata)

    if replace and not deposit_id:
        raise InputError(
            'To update an existing deposit, you must provide its id')

    client = PublicApiDepositClient({
        'url': url,
        'auth': {
            'username': username,
            'password': password
        },
    })

    if not collection:
        # retrieve user's collection
        sd_content = client.service_document()
        if 'error' in sd_content:
            raise InputError(sd_content['error'])
        collection = sd_content['collection'].replace('/', '')

    if not slug:
        # generate slug
        slug = generate_slug()

    return {
        'archive': archive,
        'username': username,
        'password': password,
        'metadata': metadata,
        'collection': collection,
        'slug': slug,
        'partial': partial,
        'client': client,
        'url': url,
        'deposit_id': deposit_id,
        'replace': replace,
    }
""" log.debug('Update deposit') client = config['client'] collection = config['collection'] deposit_id = config['deposit_id'] archive_path = config['archive'] metadata_path = config['metadata'] slug = config['slug'] in_progress = config['partial'] replace = config['replace'] client = config['client'] if not dry_run: r = client.deposit_update(collection, deposit_id, slug, archive_path, metadata_path, in_progress, replace, log) return r return {} @click.command() -@click.option('--archive', '(Optional) Software archive to deposit') @click.option('--username', required=1, help="(Mandatory) User's name") @click.option('--password', required=1, help="(Mandatory) User's associated password") +@click.option('--archive', + help='(Optional) Software archive to deposit') @click.option('--metadata', help="(Optional) Path to xml metadata file. If not provided, this will use a file named .metadata.xml") # noqa @click.option('--binary-deposit/--no-binary-deposit', default=False, help='(Optional) Software archive only deposit') @click.option('--metadata-deposit/--no-metadata-deposit', default=False, help='(Optional) Metadata only deposit') @click.option('--collection', help="(Optional) User's collection. If not provided, this will be fetched.") # noqa @click.option('--slug', help="""(Optional) External system information identifier. If not provided, it will be generated""") # noqa @click.option('--partial/--no-partial', default=False, help='(Optional) The deposit will be partial, other deposits will have to take place to finalize it.') # noqa @click.option('--deposit-id', default=None, help='(Optional) Update an existing partial deposit with its identifier') # noqa @click.option('--replace/--no-replace', default=False, help='(Optional) Update by replacing existing metadata to a deposit') # noqa @click.option('--url', default='http://localhost:5006/1', help="(Optional) Deposit server api endpoint. 
By default, https://deposit.softwareheritage.org/1") # noqa @click.option('--dry-run/--no-dry-run', default=False, help='(Optional) No-op deposit') @click.option('--verbose/--no-verbose', default=False, help='Verbose mode') -def main(archive, username, password, - metadata=None, binary_deposit=False, metadata_deposit=False, - collection=None, slug=None, partial=False, - deposit_id=None, replace=False, - url='https://deposit.softwareheritage.org/1', +def main(username, password, archive=None, metadata=None, + binary_deposit=False, metadata_deposit=False, + collection=None, slug=None, partial=False, deposit_id=None, + replace=False, url='https://deposit.softwareheritage.org/1', dry_run=True, verbose=False): """Software Heritage Deposit client - Create (or update partial) deposit through the command line. More documentation can be found at https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html. """ log = logging.getLogger('swh-deposit') log.addHandler(logging.StreamHandler()) _loglevel = logging.DEBUG if verbose else logging.INFO log.setLevel(_loglevel) if dry_run: log.info("**DRY RUN**") config = {} try: log.debug('Parsing cli options') config = parse_cli_options( archive, username, password, metadata, binary_deposit, metadata_deposit, collection, slug, partial, deposit_id, replace, url) except InputError as e: log.error('Problem during parsing options: %s' % e) return 1 if verbose: log.info("Parsed configuration: %s" % ( config, )) deposit_id = config['deposit_id'] if not deposit_id: r = deposit_create(config, dry_run, log) else: r = deposit_update(config, dry_run, log) if r: log.info(r) if __name__ == '__main__': main()