#!/usr/bin/env python3

# Copyright (C) 2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information


"""Script to demonstrate software deposit scenario to
https://deposit.softwareheritage.org.

Use: ./swh-deposit --help

"""

import logging
import os
import sys
import uuid

import click

from swh.deposit.loader.client import DepositClient


class InputError(ValueError):
    """Raised when the command-line input is invalid or unusable."""


def generate_slug(prefix='swh-sample'):
    """Generate a slug (sample purposes).

    Args:
        prefix (str): text prepended to the generated identifier

    Returns:
        str: '<prefix>-<random uuid4>'

    """
    return '%s-%s' % (prefix, uuid.uuid4())


def binary_deposit(config):
    """Post the archive described by `config` as a binary deposit.

    Args:
        config (dict): configuration as returned by parse_cli_options;
            the keys 'collection', 'archive', 'slug', 'client' and
            'partial' are read.

    Returns:
        Whatever the client's deposit_binary_post returns.

    """
    client = config['client']
    return client.deposit_binary_post(
        config['collection'], config['archive'], config['slug'],
        config['partial'])


def parse_cli_options(archive, username, password, metadata, binary,
                      collection, slug, partial, deposit_id, url):
    """Validate the cli options and build the deposit configuration.

    Args:
        archive (str): path to the software archive to deposit
        username (str): user's name
        password (str): user's password
        metadata (str/None): path to the xml metadata file; defaults to
            '<archive>.metadata.xml' when empty
        binary (bool): binary-only deposit (no metadata file required)
        collection (str/None): user's collection; fetched from the
            server's service document when empty
        slug (str/None): external identifier; generated when empty
        partial (bool): whether the deposit is partial (not finished)
        deposit_id (int/None): identifier of an existing partial deposit
        url (str): deposit server's base url

    Returns:
        dict: the configuration (including the instantiated 'client')

    Raises:
        InputError: if the archive is missing, if the metadata file is
            missing for a non-binary deposit, or if the collection
            could not be retrieved from the server.

    """
    if not os.path.exists(archive):
        raise InputError('Software Archive %s must exist!' % archive)

    if not metadata:
        metadata = '%s.metadata.xml' % archive

    # Only a deposit that actually sends metadata needs the file; the
    # previous test (`not binary and metadata`) rejected every such
    # deposit, whether or not the file existed.
    if not binary and not os.path.exists(metadata):
        raise InputError('Software Archive metadata %s must exist!'
                         % metadata)

    client = DepositClient({
        'url': url,
        'auth': {
            'username': username,
            'password': password
        },
    })

    if not collection:
        try:
            # retrieve user's collection
            sd_content = client.service_document_get()
            collection = sd_content['collection']
        except Exception as e:
            # Keep the original failure attached for debugging.
            raise InputError('Connection problem with deposit server %s' % (
                url, )) from e

    if not slug:
        slug = generate_slug()

    return {
        'archive': archive,
        'username': username,
        'password': password,
        'metadata': metadata,
        'collection': collection,
        'slug': slug,
        'partial': partial,
        'deposit_id': deposit_id,
        'client': client,
        'url': url,
    }


def make_deposit(config):
    """Evaluate the configuration.

    Placeholder: currently only prints `config`.

    """
    print(config)


@click.command(help='Software Heritage Deposit client')
@click.argument('archive', required=True)
@click.option('--username', required=True,
              help="Mandatory user's name")
@click.option('--password', required=True,
              help="Mandatory user's associated password")
@click.option('--metadata', default=None,
              help=("Optional path to an xml metadata file. "
                    "If not provided, this will use a file named "
                    "<archive>.metadata.xml"))
@click.option('--binary/--no-binary', default=False,
              help='Binary deposit only')
@click.option('--collection',
              help=("Optional user's collection. "
                    "If not provided, this will be retrieved."))
@click.option('--slug',
              help=("External system information identifier. "
                    "If not provided, it will be generated"))
@click.option('--partial', type=click.BOOL, default=False,
              help='The deposit will be partial (as in not finished)')
@click.option('--deposit-id', type=click.INT,
              help='Update an existing partial deposit with its identifier')
@click.option('--url', default='http://localhost:5006/1')
@click.option('--dry-run/--no-dry-run', default=False)
@click.option('--verbose/--no-verbose', default=False)
def main(archive, username, password,
         metadata=None, binary=False, collection=None, slug=None,
         partial=False, deposit_id=None, url='http://localhost:5006/1',
         dry_run=False, verbose=False):
    """Command-line entry point: parse the options, then deposit.

    Exits with status 1 when the options are invalid or the deposit
    fails.

    """
    log = logging.getLogger('swh-deposit')
    log.addHandler(logging.StreamHandler())
    log.setLevel(logging.DEBUG if verbose else logging.INFO)

    if dry_run:
        log.info("**DRY RUN**")

    try:
        log.debug('Parsing cli options')
        config = parse_cli_options(archive, username, password,
                                   metadata, binary, collection, slug,
                                   partial, deposit_id, url)
    except InputError as e:
        log.error('Problem during parsing options: %s', e)
        # click ignores a command's return value, so exit explicitly to
        # report the failure through the process status.
        sys.exit(1)

    if dry_run:
        # Never write the password to the log.
        log.info("Parsed configuration: %s", dict(config, password='***'))

    log.debug('Executing deposit')
    if not binary:
        log.warning('Only binary deposits are implemented, nothing done')
        return

    log.debug('Binary deposit')
    if dry_run:
        return

    try:
        result = binary_deposit(config)
    except ValueError as e:
        log.error('Problem during deposit: %s', e)
        sys.exit(1)
    log.info('Deposit result: %s', result)


if __name__ == '__main__':
    main()
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

"""Module in charge of defining an swh-deposit client

"""

import hashlib
import os

import requests
from lxml import etree

from swh.core.config import SWHConfig


# Size of the chunks used when streaming archives from/to disk.
_CHUNK_SIZE = 64 * 1024


class DepositClient(SWHConfig):
    """Deposit client to:

    - read a given deposit's archive(s)
    - read a given deposit's metadata
    - update a given deposit's status
    - check a deposit
    - read the service document
    - post a binary deposit

    """
    CONFIG_BASE_FILENAME = 'deposit/client'
    DEFAULT_CONFIG = {
        'url': ('str', 'http://localhost:5006'),
        'auth': ('dict', {}),  # with optional 'username'/'password' keys
    }

    def __init__(self, config=None, _client=requests):
        """Initialize the client.

        Args:
            config (dict/None): configuration with a 'url' key and an
                optional 'auth' dict ('username'/'password' keys). When
                None, the configuration file is parsed instead.
            _client: object exposing the http verbs as callables
                (`requests` by default)

        """
        super().__init__()
        if config is None:
            self.config = super().parse_config_file()
        else:
            self.config = config

        self._client = _client

        self.base_url = self.config['url']
        auth = self.config.get('auth')
        # No credentials configured means unauthenticated requests.
        self.auth = (auth['username'], auth['password']) if auth else None

    @staticmethod
    def _fail(msg, log=None):
        """Log `msg` on `log` (when provided), then raise ValueError."""
        if log:
            log.error(msg)
        raise ValueError(msg)

    @staticmethod
    def _md5sum(filepath):
        """Compute the hex md5 of a file, reading it chunk by chunk."""
        md5 = hashlib.md5()
        with open(filepath, 'rb') as f:
            for chunk in iter(lambda: f.read(_CHUNK_SIZE), b''):
                md5.update(chunk)
        return md5.hexdigest()

    def do(self, method, url, *args, **kwargs):
        """Internal method to deal with requests, possibly with basic http
           authentication.

        Args:
            method (str): name of an http verb exposed by the
                underlying client ('get', 'post', 'put', ...)
            url (str): path appended to the configured base url

        Returns:
            The request's execution

        Raises:
            ValueError: if the underlying client does not expose
                `method`

        """
        method_fn = getattr(self._client, method, None)
        if method_fn is None:
            raise ValueError('Development error, unsupported method %s' % (
                method))

        if self.auth:
            kwargs['auth'] = self.auth

        full_url = '%s%s' % (self.base_url.rstrip('/'), url)
        return method_fn(full_url, *args, **kwargs)

    def archive_get(self, archive_update_url, archive_path, log=None):
        """Retrieve the archive from the deposit to a local directory.

        Args:
            archive_update_url (str): The full deposit archive(s)'s raw
                content to retrieve locally
            archive_path (str): the local archive's path where to store
                the raw content
            log: optional logger used to report the failure

        Returns:
            The archive path to the local archive to load.

        Raises:
            ValueError: if the archive could not be retrieved

        """
        r = self.do('get', archive_update_url, stream=True)
        if not r.ok:
            self._fail('Problem when retrieving deposit archive at %s' % (
                archive_update_url, ), log)

        with open(archive_path, 'wb') as f:
            # iter_content defaults to 1-byte chunks; ask for bigger ones.
            for chunk in r.iter_content(chunk_size=_CHUNK_SIZE):
                f.write(chunk)

        return archive_path

    def metadata_get(self, metadata_url, log=None):
        """Retrieve the metadata information on a given deposit.

        Args:
            metadata_url (str): The full deposit metadata url to retrieve
                locally
            log: optional logger used to report the failure

        Returns:
            The dictionary of metadata for that deposit.

        Raises:
            ValueError: if the metadata could not be retrieved

        """
        r = self.do('get', metadata_url)
        if not r.ok:
            self._fail(
                'Problem when retrieving metadata at %s' % metadata_url, log)

        return r.json()

    def status_update(self, update_status_url, status, revision_id=None):
        """Update the deposit's status.

        Args:
            update_status_url (str): the full deposit's archive
            status (str): The status to update the deposit with
            revision_id (str/None): the revision's identifier to update to

        """
        payload = {'status': status}
        if revision_id:
            payload['revision_id'] = revision_id

        self.do('put', update_status_url, json=payload)

    def check(self, check_url, log=None):
        """Check the deposit's associated data (metadata, archive(s))

        Args:
            check_url (str): the full deposit's check url
            log: optional logger used to report the failure

        Returns:
            The 'status' entry of the server's json answer

        Raises:
            ValueError: if the check request failed

        """
        r = self.do('get', check_url)
        if not r.ok:
            self._fail('Problem when checking deposit %s' % check_url, log)

        return r.json()['status']

    def service_document_get(self, log=None):
        """Retrieve the service document and extract the first collection.

        Args:
            log: optional logger used to report the failure

        Returns:
            dict: {'collection': <collection path relative to the
            client's base url>}

        Raises:
            ValueError: if the request failed, if the document lists no
                collection, or if the collection's href is not located
                under the client's base url

        """
        sd_url = '/servicedocument/'
        r = self.do('get', sd_url)
        if not r.ok:
            self._fail('Service document failure at %s' % sd_url, log)

        # Parse the raw bytes: lxml rejects unicode strings that carry
        # an xml encoding declaration.
        tree = etree.fromstring(r.content)
        collections = tree.xpath(
            '/x:service/x:workspace/x:collection',
            namespaces={'x': 'http://www.w3.org/2007/app'})
        if not collections:
            self._fail(
                'No collection found in service document at %s' % sd_url,
                log)

        href = collections[0].get('href') or ''
        # Strip the same normalized prefix `do` prepends, so the result
        # keeps its leading '/' even if base_url ends with one.
        base_url = self.base_url.rstrip('/')
        if not href.startswith(base_url):
            self._fail('Collection %s is not located under %s' % (
                href, base_url), log)

        return {
            'collection': href[len(base_url):]
        }

    def deposit_binary_post(self, deposit_url, filepath, slug,
                            in_progress=False, log=None):
        """Post an archive as a binary deposit.

        Args:
            deposit_url (str): collection path (relative to the base
                url) to post the deposit to
            filepath (str): path to the local archive to send
            slug (str): external identifier of the deposit
            in_progress (bool): whether the deposit is partial
            log: optional logger used to report the failure

        Returns:
            dict: {'deposit_id': <identifier read from the answer>}

        Raises:
            ValueError: if the request failed or if the answer holds no
                deposit identifier

        """
        filename = os.path.basename(filepath)

        # Anything that is not a '.zip' file is sent as a tarball.
        if filename.lower().endswith('.zip'):
            content_type = 'application/zip'
        else:
            content_type = 'application/x-tar'

        # NOTE(review): 'CONTENT_MD5' uses an underscore where the other
        # headers use hyphens; some servers drop such headers. Confirm
        # what the deposit server expects before changing it.
        headers = {
            'SLUG': slug,
            'CONTENT_MD5': self._md5sum(filepath),
            'IN-PROGRESS': str(in_progress),
            'CONTENT-TYPE': content_type,
            'CONTENT-DISPOSITION': 'attachment; filename=%s' % filename,
        }

        with open(filepath, 'rb') as f:
            r = self.do('post', deposit_url, data=f, headers=headers)

        if not r.ok:
            self._fail(
                'Binary posting deposit failure at %s' % deposit_url, log)

        tree = etree.fromstring(r.content)
        vals = tree.xpath(
            '/x:entry/x:deposit_id',
            namespaces={'x': 'http://www.w3.org/2005/Atom'})
        if not vals:
            self._fail(
                'No deposit id in the answer from %s' % deposit_url, log)

        return {
            'deposit_id': vals[0].text,
        }

    def deposit_atom_post(self, deposit_url, log=None):
        """Post an atom (metadata only) deposit. Not implemented yet."""

    def deposit_multipart_post(self, deposit_url, log=None):
        """Post a multipart deposit. Not implemented yet."""