diff --git a/bin/swh-deposit b/bin/swh-deposit index c7b562e9..aae5f134 100755 --- a/bin/swh-deposit +++ b/bin/swh-deposit @@ -1,172 +1,242 @@ #!/usr/bin/env python3 # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Script to demonstrate software deposit scenario to https://deposit.sofwareheritage.org. Use: ./swh-deposit --help """ import os import click import logging import uuid from swh.deposit.loader.client import DepositClient class InputError(ValueError): """Input script error """ pass def generate_slug(prefix='swh-sample'): """Generate a slug (sample purposes). """ return '%s-%s' % (prefix, uuid.uuid4()) -def binary_deposit(config): - deposit_url = config['collection'] - filepath = config['archive'] - slug = config['slug'] - client = config['client'] - in_progress = config['partial'] - - return client.deposit_binary_post( - deposit_url, filepath, slug, in_progress) - - -def parse_cli_options(archive, username, password, metadata, binary, +def parse_cli_options(archive, username, password, metadata, + binary_deposit, metadata_deposit, collection, slug, partial, deposit_id, url): - """Parse the cli options. - - if any issues is detected, an InputError exception is raised - explaining the issue. + """Parse the cli options and make sure the combination is acceptable. + If not, an InputError exception is raised explaining the issue. + + Raises: + InputError explaining the issue + + Returns: + dict with the following keys: + + 'archive': the software archive to deposit + 'username': username + 'password': associated password + 'metadata': the metadata file to deposit + 'collection': the username's associated client + 'slug': the slug or external id identifying the deposit to make + 'partial': if the deposit is partial or not + 'client': instantiated class + 'url': deposit's server main entry point + 'deposit_type': deposit's type (binary, multipart, metadata) """ + if binary_deposit and metadata_deposit: + # too many flags use, remove redundant ones (-> multipart deposit) + binary_deposit = False + metadata_deposit = False + if not os.path.exists(archive): raise InputError('Software Archive %s must exist!' % archive) if not metadata: metadata = '%s.metadata.xml' % archive - if not binary and metadata: + if not binary_deposit and not os.path.exists(metadata): raise InputError('Software Archive metadata %s must exist!' % metadata) client = DepositClient({ 'url': url, 'auth': { 'username': username, 'password': password }, }) + if collection: # transpose to the right collection path + collection = '/%s/' % collection + if not collection: - try: - # retrieve user's collection - sd_content = client.service_document_get() - collection = sd_content['collection'] - except Exception: - raise InputError('Connection problem with deposit server %s' % ( - url, )) + # retrieve user's collection + sd_content = client.service_document() + if 'error' in sd_content: + raise InputError(sd_content['error']) + collection = sd_content['collection'] if not slug: # generate slug slug = generate_slug() + if binary_deposit: + deposit_type = 'binary' + elif metadata_deposit: + deposit_type = 'metadata' + else: + deposit_type = 'multipart' + return { 'archive': archive, 'username': username, 'password': password, 'metadata': metadata, 'collection': collection, 'slug': slug, 'partial': partial, 'client': client, 'url': url, + 'deposit_type': deposit_type, } -def make_deposit(config): - """Evaluate the configuration. +def do_binary_deposit(config, dry_run, log): + """Execute the binary deposit. """ - print(config) + log.debug('Binary deposit') + + deposit_url = config['collection'] + filepath = config['archive'] + slug = config['slug'] + client = config['client'] + in_progress = config['partial'] + + if not dry_run: + return client.deposit_binary(deposit_url, filepath, slug, in_progress) + return {} + + +def do_metadata_deposit(config, dry_run, log): + log.debug('Metadata deposit') + deposit_url = config['collection'] + filepath = config['metadata'] + slug = config['slug'] + client = config['client'] + in_progress = config['partial'] + + if not dry_run: + r = client.deposit_metadata(deposit_url, filepath, slug, in_progress) + return r + return {} + + +def do_multipart_deposit(config, dry_run, log): + log.debug('Multipart deposit') + client = config['client'] + deposit_url = config['collection'] + archive_path = config['archive'] + metadata_path = config['metadata'] + slug = config['slug'] + client = config['client'] + in_progress = config['partial'] + if not dry_run: + r = client.deposit_multipart(deposit_url, archive_path, metadata_path, + slug, in_progress) + return r + return {} @click.command(help='Software Heritage Deposit client') @click.argument('archive', required=1) @click.option('--username', required=1, help="Mandatory user's name") @click.option('--password', required=1, help="Mandatory user's associated password") -@click.option('--metadata', default="je-suis-gpl.metadata.xml", +@click.option('--metadata', help="""Optional path to an xml metadata file. If not provided, this will use a file named .metadata.xml""") -@click.option('--binary/--no-binary', default=False, - help='Binary deposit only') +@click.option('--binary-deposit/--no-binary-deposit', default=False, + help='Software archive only deposit') +@click.option('--metadata-deposit/--no-metadata-deposit', default=False, + help='Metadata only deposit') @click.option('--collection', help="""Optional user's collection. If not provided, this will be retrieved.""") @click.option('--slug', help="""External system information identifier. If not provided, it will be generated""") @click.option('--partial', type=click.BOOL, help='The deposit will be partial (as in not finished)') @click.option('--deposit-id', type=click.INT, help='Update an existing partial deposit with its identifier') @click.option('--url', default='http://localhost:5006/1') @click.option('--dry-run/--no-dry-run', default=False) @click.option('--verbose/--no-verbose', default=False) def main(archive, username, password, - metadata=None, binary=False, collection=None, slug=None, + metadata=None, binary_deposit=False, metadata_deposit=False, + collection=None, slug=None, partial=False, deposit_id=None, url='http://localhost:5006/1', dry_run=True, verbose=False): log = logging.getLogger('swh-deposit') log.addHandler(logging.StreamHandler()) _loglevel = logging.DEBUG if verbose else logging.INFO log.setLevel(_loglevel) if dry_run: log.info("**DRY RUN**") config = {} try: log.debug('Parsing cli options') - config = parse_cli_options(archive, username, password, - metadata, binary, collection, slug, - partial, deposit_id, url) + config = parse_cli_options( + archive, username, password, metadata, binary_deposit, + metadata_deposit, collection, slug, partial, deposit_id, url) except InputError as e: log.error('Problem during parsing options: %s' % e) return 1 - if dry_run: + if verbose: log.info("Parsed configuration: %s" % ( config, )) - log.debug('Executing deposit') - if binary: - log.debug('Binary deposit') - if not dry_run: - binary_deposit(config) - else: - pass + deposit_fn = { + 'binary': + lambda config, dry_run=dry_run, log=log: do_binary_deposit( + config, dry_run, log), + 'metadata': + lambda config, dry_run=dry_run, log=log: do_metadata_deposit( + config, dry_run, log), + 'multipart': + lambda config, dry_run=dry_run, log=log: do_multipart_deposit( + config, dry_run, log), + } + + deposit_type = config['deposit_type'] + r = deposit_fn[deposit_type](config) + log.info(r) if __name__ == '__main__': main() diff --git a/swh/deposit/loader/client.py b/swh/deposit/loader/client.py index bed7acb5..3b7aab03 100644 --- a/swh/deposit/loader/client.py +++ b/swh/deposit/loader/client.py @@ -1,219 +1,279 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Module in charge of defining an swh-deposit client """ import hashlib import os import requests from swh.core.config import SWHConfig from lxml import etree class DepositClient(SWHConfig): """Deposit client to: - read a given deposit's archive(s) - read a given deposit's metadata - update a given deposit's status """ CONFIG_BASE_FILENAME = 'deposit/client' DEFAULT_CONFIG = { 'url': ('str', 'http://localhost:5006'), 'auth': ('dict', {}), # with optional 'username'/'password' keys } def __init__(self, config=None, _client=requests): super().__init__() if config is None: self.config = super().parse_config_file() else: self.config = config self._client = _client self.base_url = self.config['url'] auth = self.config['auth'] if auth == {}: self.auth = None else: self.auth = (auth['username'], auth['password']) def do(self, method, url, *args, **kwargs): """Internal method to deal with requests, possibly with basic http authentication. Args: method (str): supported http methods as in self._methods' keys Returns: The request's execution """ if hasattr(self._client, method): method_fn = getattr(self._client, method) else: raise ValueError('Development error, unsupported method %s' % ( method)) if self.auth: kwargs['auth'] = self.auth full_url = '%s%s' % (self.base_url.rstrip('/'), url) return method_fn(full_url, *args, **kwargs) def archive_get(self, archive_update_url, archive_path, log=None): """Retrieve the archive from the deposit to a local directory. Args: archive_update_url (str): The full deposit archive(s)'s raw content to retrieve locally archive_path (str): the local archive's path where to store the raw content Returns: The archive path to the local archive to load. Or None if any problem arose. """ r = self.do('get', archive_update_url, stream=True) if r.ok: with open(archive_path, 'wb') as f: for chunk in r.iter_content(): f.write(chunk) return archive_path msg = 'Problem when retrieving deposit archive at %s' % ( archive_update_url, ) if log: log.error(msg) raise ValueError(msg) def metadata_get(self, metadata_url, log=None): """Retrieve the metadata information on a given deposit. Args: metadata_url (str): The full deposit metadata url to retrieve locally Returns: The dictionary of metadata for that deposit or None if any problem arose. """ r = self.do('get', metadata_url) if r.ok: return r.json() msg = 'Problem when retrieving metadata at %s' % metadata_url if log: log.error(msg) raise ValueError(msg) def status_update(self, update_status_url, status, revision_id=None): """Update the deposit's status. Args: update_status_url (str): the full deposit's archive status (str): The status to update the deposit with revision_id (str/None): the revision's identifier to update to """ payload = {'status': status} if revision_id: payload['revision_id'] = revision_id self.do('put', update_status_url, json=payload) def check(self, check_url, log=None): """Check the deposit's associated data (metadata, archive(s)) Args: check_url (str): the full deposit's check url """ r = self.do('get', check_url) if r.ok: data = r.json() return data['status'] msg = 'Problem when checking deposit %s' % check_url if log: log.error(msg) raise ValueError(msg) - def service_document_get(self, log=None): + def service_document(self, log=None): sd_url = '/servicedocument/' - r = self.do('get', sd_url) - if r.ok: - tree = etree.fromstring(r.text) - collections = tree.xpath( - '/x:service/x:workspace/x:collection', - namespaces={'x': 'http://www.w3.org/2007/app'}) - items = dict(collections[0].items()) - collection = items['href'].rsplit(self.base_url)[1] + try: + r = self.do('get', sd_url) + except Exception as e: + msg = 'Service document failure at %s: %s' % (sd_url, e) + if log: + log.error(msg) return { - 'collection': collection + 'collection': None, + 'error': msg, } - - msg = 'Service document failure at %s' % sd_url - if log: - log.error(msg) - - raise ValueError(msg) - - def deposit_binary_post(self, deposit_url, filepath, slug, - in_progress=False, log=None): + else: + if r.ok: + tree = etree.fromstring(r.text) + collections = tree.xpath( + '/x:service/x:workspace/x:collection', + namespaces={'x': 'http://www.w3.org/2007/app'}) + items = dict(collections[0].items()) + collection = items['href'].rsplit(self.base_url)[1] + return { + 'collection': collection + } + else: + return { + 'collection': None, + 'error': r.status_code + } + + def deposit_binary(self, deposit_url, filepath, slug, in_progress=False, + log=None): md5sum = hashlib.md5(open(filepath, 'rb').read()).hexdigest() filename = os.path.basename(filepath) extension = filename.split('.')[-1] if 'zip' in extension: content_type = 'application/zip' else: content_type = 'application/x-tar' headers = { 'SLUG': slug, 'CONTENT_MD5': md5sum, 'IN-PROGRESS': str(in_progress), 'CONTENT-TYPE': content_type, 'CONTENT-DISPOSITION': 'attachment; filename=%s' % filename, } - with open(filepath, 'rb') as f: - r = self.do('post', deposit_url, data=f, headers=headers) + try: + with open(filepath, 'rb') as f: + r = self.do('post', deposit_url, data=f, headers=headers) - if r.ok: - tree = etree.fromstring(r.text) - vals = tree.xpath( - '/x:entry/x:deposit_id', - namespaces={'x': 'http://www.w3.org/2005/Atom'}) - deposit_id = vals[0].text + except Exception as e: + msg = 'Binary posting deposit failure at %s: %s' % (deposit_url, e) + if log: + log.error(msg) return { - 'deposit_id': deposit_id, + 'deposit_id': None, + 'error': msg, } + else: + if r.ok: + tree = etree.fromstring(r.text) + vals = tree.xpath( + '/x:entry/x:deposit_id', + namespaces={'x': 'http://www.w3.org/2005/Atom'}) + deposit_id = vals[0].text + + return { + 'deposit_id': deposit_id, + } + else: + return { + 'deposit_id': None, + 'error': r.status_code + } + + def deposit_metadata(self, deposit_url, filepath, slug, in_progress, + log=None): + headers = { + 'SLUG': slug, + 'IN-PROGRESS': str(in_progress), + 'CONTENT-TYPE': 'application/atom+xml;type=entry', + } - msg = 'Binary posting deposit failure at %s' % deposit_url - if log: - log.error(msg) - - raise ValueError(msg) + try: + with open(filepath, 'rb') as f: + r = self.do('post', deposit_url, data=f, headers=headers) - def deposit_atom_post(self, deposit_url, log=None): - pass + except Exception as e: + msg = 'Metadata posting deposit failure at %s: %s' % ( + deposit_url, e) + if log: + log.error(msg) - def deposit_multipart_post(self, deposit_url, log=None): - pass + return { + 'deposit_id': None, + 'error': msg, + } + else: + if r.ok: + tree = etree.fromstring(r.text) + vals = tree.xpath( + '/x:entry/x:deposit_id', + namespaces={'x': 'http://www.w3.org/2005/Atom'}) + deposit_id = vals[0].text + + return { + 'deposit_id': deposit_id, + } + else: + return { + 'deposit_id': None, + 'error': r.status_code + } + + def deposit_multipart(self, deposit_url, archive_path, metadata_path, + slug, in_progress, log=None): + + return { + 'deposit_id': None, + 'error': None, + }