diff --git a/swh/deposit/client/__init__.py b/swh/deposit/client/__init__.py old mode 100755 new mode 100644 index e69de29b..d4ee4441 --- a/swh/deposit/client/__init__.py +++ b/swh/deposit/client/__init__.py @@ -0,0 +1,337 @@ +# Copyright (C) 2017-2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +"""Module in charge of defining an swh-deposit client + +""" + +import hashlib +import os +import requests + +from swh.core.config import SWHConfig +from lxml import etree + + +class DepositClient(SWHConfig): + """Deposit client to: + + - read a given deposit's archive(s) + - read a given deposit's metadata + - update a given deposit's status + + """ + CONFIG_BASE_FILENAME = 'deposit/client' + DEFAULT_CONFIG = { + 'url': ('str', 'http://localhost:5006'), + 'auth': ('dict', {}), # with optional 'username'/'password' keys + } + + def __init__(self, config=None, _client=requests): + super().__init__() + if config is None: + self.config = super().parse_config_file() + else: + self.config = config + + self._client = _client + self.base_url = self.config['url'] + auth = self.config['auth'] + if auth == {}: + self.auth = None + else: + self.auth = (auth['username'], auth['password']) + + def do(self, method, url, *args, **kwargs): + """Internal method to deal with requests, possibly with basic http + authentication. + + Args: + method (str): supported http methods as in self._methods' keys + + Returns: + The request's execution + + """ + if hasattr(self._client, method): + method_fn = getattr(self._client, method) + else: + raise ValueError('Development error, unsupported method %s' % ( + method)) + + if self.auth: + kwargs['auth'] = self.auth + + full_url = '%s%s' % (self.base_url.rstrip('/'), url) + return method_fn(full_url, *args, **kwargs) + + def archive_get(self, archive_update_url, archive_path, log=None): + """Retrieve the archive from the deposit to a local directory. + + Args: + archive_update_url (str): The full deposit archive(s)'s raw content + to retrieve locally + + archive_path (str): the local archive's path where to store + the raw content + + Returns: + The archive path to the local archive to load. + Or None if any problem arose. + + """ + r = self.do('get', archive_update_url, stream=True) + if r.ok: + with open(archive_path, 'wb') as f: + for chunk in r.iter_content(): + f.write(chunk) + + return archive_path + + msg = 'Problem when retrieving deposit archive at %s' % ( + archive_update_url, ) + if log: + log.error(msg) + + raise ValueError(msg) + + def metadata_get(self, metadata_url, log=None): + """Retrieve the metadata information on a given deposit. + + Args: + metadata_url (str): The full deposit metadata url to retrieve + locally + + Returns: + The dictionary of metadata for that deposit or None if any + problem arose. + + """ + r = self.do('get', metadata_url) + if r.ok: + return r.json() + + msg = 'Problem when retrieving metadata at %s' % metadata_url + if log: + log.error(msg) + + raise ValueError(msg) + + def status_update(self, update_status_url, status, + revision_id=None): + """Update the deposit's status. + + Args: + update_status_url (str): the full deposit's archive + status (str): The status to update the deposit with + revision_id (str/None): the revision's identifier to update to + + """ + payload = {'status': status} + if revision_id: + payload['revision_id'] = revision_id + + self.do('put', update_status_url, json=payload) + + def check(self, check_url, log=None): + """Check the deposit's associated data (metadata, archive(s)) + + Args: + check_url (str): the full deposit's check url + + """ + r = self.do('get', check_url) + if r.ok: + data = r.json() + return data['status'] + + msg = 'Problem when checking deposit %s' % check_url + if log: + log.error(msg) + + raise ValueError(msg) + + def service_document(self, log=None): + sd_url = '/servicedocument/' + try: + r = self.do('get', sd_url) + except Exception as e: + msg = 'Service document failure at %s: %s' % (sd_url, e) + if log: + log.error(msg) + return { + 'collection': None, + 'error': msg, + } + else: + if r.ok: + tree = etree.fromstring(r.text) + collections = tree.xpath( + '/x:service/x:workspace/x:collection', + namespaces={'x': 'http://www.w3.org/2007/app'}) + items = dict(collections[0].items()) + collection = items['href'].rsplit(self.base_url)[1] + return { + 'collection': collection + } + else: + return { + 'collection': None, + 'error': r.status_code + } + + def _compute_information_on(self, filepath, is_archive=True): + """Given a filepath, compute necessary information on that file. + + Args: + filepath (str): Path to a file + is_archive (bool): is it an archive or not? + + Returns: + dict with keys: + 'content-type': content type associated + 'md5sum': md5 sum + 'filename': filename + """ + md5sum = hashlib.md5(open(filepath, 'rb').read()).hexdigest() + filename = os.path.basename(filepath) + + if is_archive: + extension = filename.split('.')[-1] + if 'zip' in extension: + content_type = 'application/zip' + else: + content_type = 'application/x-tar' + else: + content_type = None + + return { + 'content-type': content_type, + 'md5sum': md5sum, + 'filename': filename, + } + + def _parse_deposit_xml(self, xml_content): + """Given an xml content as string, returns a deposit dict. + + """ + tree = etree.fromstring(xml_content) + vals = tree.xpath( + '/x:entry/x:deposit_id', + namespaces={'x': 'http://www.w3.org/2005/Atom'}) + deposit_id = vals[0].text + + return {'deposit_id': deposit_id} + + def deposit_binary(self, deposit_url, filepath, slug, in_progress=False, + log=None): + + info = self._compute_information_on(filepath) + + headers = { + 'SLUG': slug, + 'CONTENT_MD5': info['md5sum'], + 'IN-PROGRESS': str(in_progress), + 'CONTENT-TYPE': info['content_type'], + 'CONTENT-DISPOSITION': 'attachment; filename=%s' % ( + info['filename'], ), + } + + try: + with open(filepath, 'rb') as f: + r = self.do('post', deposit_url, data=f, headers=headers) + + except Exception as e: + msg = 'Binary posting deposit failure at %s: %s' % (deposit_url, e) + if log: + log.error(msg) + + return { + 'deposit_id': None, + 'error': msg, + } + else: + if r.ok: + return self._parse_deposit_xml(r.text) + else: + return { + 'deposit_id': None, + 'error': r.status_code + } + + def deposit_metadata(self, deposit_url, filepath, slug, in_progress, + log=None): + headers = { + 'SLUG': slug, + 'IN-PROGRESS': str(in_progress), + 'CONTENT-TYPE': 'application/atom+xml;type=entry', + } + + try: + with open(filepath, 'rb') as f: + r = self.do('post', deposit_url, data=f, headers=headers) + + except Exception as e: + msg = 'Metadata posting deposit failure at %s: %s' % ( + deposit_url, e) + if log: + log.error(msg) + + return { + 'deposit_id': None, + 'error': msg, + } + else: + if r.ok: + return self._parse_deposit_xml(r.text) + else: + return { + 'deposit_id': None, + 'error': r.status_code + } + + def deposit_multipart(self, deposit_url, archive_path, metadata_path, + slug, in_progress, log=None): + info = self._compute_information_on(archive_path) + info_meta = self._compute_information_on(metadata_path, + is_archive=False) + + files = [ + ('file', + (info['filename'], + open(archive_path, 'rb'), + info['content-type'])), + ('atom', + (info_meta['filename'], + open(metadata_path, 'rb'), + 'application/atom+xml')), + ] + + headers = { + 'SLUG': slug, + 'CONTENT_MD5': info['md5sum'], + 'IN-PROGRESS': str(in_progress), + } + + try: + r = self.do('post', deposit_url, files=files, headers=headers) + except Exception as e: + msg = 'Multipart posting deposit failure at %s: %s' % ( + deposit_url, e) + if log: + log.error(msg) + + return { + 'deposit_id': None, + 'error': msg, + } + else: + if r.ok: + return self._parse_deposit_xml(r.text) + else: + return { + 'deposit_id': None, + 'error': r.status_code + } diff --git a/swh/deposit/client/cli.py b/swh/deposit/client/cli.py index 29419b47..23fa8b22 100755 --- a/swh/deposit/client/cli.py +++ b/swh/deposit/client/cli.py @@ -1,240 +1,240 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Script to demonstrate software deposit scenario to https://deposit.sofwareheritage.org. -Use: ./swh-deposit --help +Use: python3 -m swh.deposit.client.cli --help """ import os import click import logging import uuid -from swh.deposit.loader.client import DepositClient +from . import DepositClient class InputError(ValueError): """Input script error """ pass def generate_slug(prefix='swh-sample'): """Generate a slug (sample purposes). """ return '%s-%s' % (prefix, uuid.uuid4()) def parse_cli_options(archive, username, password, metadata, binary_deposit, metadata_deposit, collection, slug, partial, deposit_id, url): """Parse the cli options and make sure the combination is acceptable. If not, an InputError exception is raised explaining the issue. Raises: InputError explaining the issue Returns: dict with the following keys: 'archive': the software archive to deposit 'username': username 'password': associated password 'metadata': the metadata file to deposit 'collection': the username's associated client 'slug': the slug or external id identifying the deposit to make 'partial': if the deposit is partial or not 'client': instantiated class 'url': deposit's server main entry point 'deposit_type': deposit's type (binary, multipart, metadata) """ if binary_deposit and metadata_deposit: # too many flags use, remove redundant ones (-> multipart deposit) binary_deposit = False metadata_deposit = False if not os.path.exists(archive): raise InputError('Software Archive %s must exist!' % archive) if not metadata: metadata = '%s.metadata.xml' % archive if not binary_deposit and not os.path.exists(metadata): raise InputError('Software Archive metadata %s must exist!' % metadata) client = DepositClient({ 'url': url, 'auth': { 'username': username, 'password': password }, }) if collection: # transpose to the right collection path collection = '/%s/' % collection if not collection: # retrieve user's collection sd_content = client.service_document() if 'error' in sd_content: raise InputError(sd_content['error']) collection = sd_content['collection'] if not slug: # generate slug slug = generate_slug() if binary_deposit: deposit_type = 'binary' elif metadata_deposit: deposit_type = 'metadata' else: deposit_type = 'multipart' return { 'archive': archive, 'username': username, 'password': password, 'metadata': metadata, 'collection': collection, 'slug': slug, 'partial': partial, 'client': client, 'url': url, 'deposit_type': deposit_type, } def do_binary_deposit(config, dry_run, log): """Execute the binary deposit. """ log.debug('Binary deposit') deposit_url = config['collection'] filepath = config['archive'] slug = config['slug'] client = config['client'] in_progress = config['partial'] if not dry_run: return client.deposit_binary(deposit_url, filepath, slug, in_progress) return {} def do_metadata_deposit(config, dry_run, log): log.debug('Metadata deposit') deposit_url = config['collection'] filepath = config['metadata'] slug = config['slug'] client = config['client'] in_progress = config['partial'] if not dry_run: r = client.deposit_metadata(deposit_url, filepath, slug, in_progress) return r return {} def do_multipart_deposit(config, dry_run, log): log.debug('Multipart deposit') client = config['client'] deposit_url = config['collection'] archive_path = config['archive'] metadata_path = config['metadata'] slug = config['slug'] client = config['client'] in_progress = config['partial'] if not dry_run: r = client.deposit_multipart(deposit_url, archive_path, metadata_path, slug, in_progress) return r return {} @click.command(help='Software Heritage Deposit client') @click.argument('archive', required=1) @click.option('--username', required=1, help="Mandatory user's name") @click.option('--password', required=1, help="Mandatory user's associated password") @click.option('--metadata', help="""Optional path to an xml metadata file. If not provided, this will use a file named .metadata.xml""") @click.option('--binary-deposit/--no-binary-deposit', default=False, help='Software archive only deposit') @click.option('--metadata-deposit/--no-metadata-deposit', default=False, help='Metadata only deposit') @click.option('--collection', help="""Optional user's collection. If not provided, this will be retrieved.""") @click.option('--slug', help="""External system information identifier. If not provided, it will be generated""") @click.option('--partial/--no-partial', default=False, help='The deposit will be partial (as in not finished)') @click.option('--deposit-id', type=click.INT, help='Update an existing partial deposit with its identifier') @click.option('--url', default='http://localhost:5006/1') @click.option('--dry-run/--no-dry-run', default=False) @click.option('--verbose/--no-verbose', default=False) def main(archive, username, password, metadata=None, binary_deposit=False, metadata_deposit=False, collection=None, slug=None, partial=False, deposit_id=None, url='http://localhost:5006/1', dry_run=True, verbose=False): log = logging.getLogger('swh-deposit') log.addHandler(logging.StreamHandler()) _loglevel = logging.DEBUG if verbose else logging.INFO log.setLevel(_loglevel) if dry_run: log.info("**DRY RUN**") config = {} try: log.debug('Parsing cli options') config = parse_cli_options( archive, username, password, metadata, binary_deposit, metadata_deposit, collection, slug, partial, deposit_id, url) except InputError as e: log.error('Problem during parsing options: %s' % e) return 1 if verbose: log.info("Parsed configuration: %s" % ( config, )) deposit_fn = { 'binary': lambda config, dry_run=dry_run, log=log: do_binary_deposit( config, dry_run, log), 'metadata': lambda config, dry_run=dry_run, log=log: do_metadata_deposit( config, dry_run, log), 'multipart': lambda config, dry_run=dry_run, log=log: do_multipart_deposit( config, dry_run, log), } deposit_type = config['deposit_type'] r = deposit_fn[deposit_type](config) if r: log.info(r) if __name__ == '__main__': main() diff --git a/swh/deposit/loader/checker.py b/swh/deposit/loader/checker.py index 34ffd018..88215bdc 100644 --- a/swh/deposit/loader/checker.py +++ b/swh/deposit/loader/checker.py @@ -1,20 +1,20 @@ -# Copyright (C) 2017 The Software Heritage developers +# Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from .client import DepositClient +from ..client import DepositClient class DepositChecker(): """Deposit checker implementation. Trigger deposit's checks through the private api. """ def __init__(self, client=None): super().__init__() self.client = client if client else DepositClient() def check(self, deposit_check_url): return self.client.check(deposit_check_url) diff --git a/swh/deposit/loader/client.py b/swh/deposit/loader/client.py deleted file mode 100644 index d4ee4441..00000000 --- a/swh/deposit/loader/client.py +++ /dev/null @@ -1,337 +0,0 @@ -# Copyright (C) 2017-2018 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -"""Module in charge of defining an swh-deposit client - -""" - -import hashlib -import os -import requests - -from swh.core.config import SWHConfig -from lxml import etree - - -class DepositClient(SWHConfig): - """Deposit client to: - - - read a given deposit's archive(s) - - read a given deposit's metadata - - update a given deposit's status - - """ - CONFIG_BASE_FILENAME = 'deposit/client' - DEFAULT_CONFIG = { - 'url': ('str', 'http://localhost:5006'), - 'auth': ('dict', {}), # with optional 'username'/'password' keys - } - - def __init__(self, config=None, _client=requests): - super().__init__() - if config is None: - self.config = super().parse_config_file() - else: - self.config = config - - self._client = _client - self.base_url = self.config['url'] - auth = self.config['auth'] - if auth == {}: - self.auth = None - else: - self.auth = (auth['username'], auth['password']) - - def do(self, method, url, *args, **kwargs): - """Internal method to deal with requests, possibly with basic http - authentication. - - Args: - method (str): supported http methods as in self._methods' keys - - Returns: - The request's execution - - """ - if hasattr(self._client, method): - method_fn = getattr(self._client, method) - else: - raise ValueError('Development error, unsupported method %s' % ( - method)) - - if self.auth: - kwargs['auth'] = self.auth - - full_url = '%s%s' % (self.base_url.rstrip('/'), url) - return method_fn(full_url, *args, **kwargs) - - def archive_get(self, archive_update_url, archive_path, log=None): - """Retrieve the archive from the deposit to a local directory. - - Args: - archive_update_url (str): The full deposit archive(s)'s raw content - to retrieve locally - - archive_path (str): the local archive's path where to store - the raw content - - Returns: - The archive path to the local archive to load. - Or None if any problem arose. - - """ - r = self.do('get', archive_update_url, stream=True) - if r.ok: - with open(archive_path, 'wb') as f: - for chunk in r.iter_content(): - f.write(chunk) - - return archive_path - - msg = 'Problem when retrieving deposit archive at %s' % ( - archive_update_url, ) - if log: - log.error(msg) - - raise ValueError(msg) - - def metadata_get(self, metadata_url, log=None): - """Retrieve the metadata information on a given deposit. - - Args: - metadata_url (str): The full deposit metadata url to retrieve - locally - - Returns: - The dictionary of metadata for that deposit or None if any - problem arose. - - """ - r = self.do('get', metadata_url) - if r.ok: - return r.json() - - msg = 'Problem when retrieving metadata at %s' % metadata_url - if log: - log.error(msg) - - raise ValueError(msg) - - def status_update(self, update_status_url, status, - revision_id=None): - """Update the deposit's status. - - Args: - update_status_url (str): the full deposit's archive - status (str): The status to update the deposit with - revision_id (str/None): the revision's identifier to update to - - """ - payload = {'status': status} - if revision_id: - payload['revision_id'] = revision_id - - self.do('put', update_status_url, json=payload) - - def check(self, check_url, log=None): - """Check the deposit's associated data (metadata, archive(s)) - - Args: - check_url (str): the full deposit's check url - - """ - r = self.do('get', check_url) - if r.ok: - data = r.json() - return data['status'] - - msg = 'Problem when checking deposit %s' % check_url - if log: - log.error(msg) - - raise ValueError(msg) - - def service_document(self, log=None): - sd_url = '/servicedocument/' - try: - r = self.do('get', sd_url) - except Exception as e: - msg = 'Service document failure at %s: %s' % (sd_url, e) - if log: - log.error(msg) - return { - 'collection': None, - 'error': msg, - } - else: - if r.ok: - tree = etree.fromstring(r.text) - collections = tree.xpath( - '/x:service/x:workspace/x:collection', - namespaces={'x': 'http://www.w3.org/2007/app'}) - items = dict(collections[0].items()) - collection = items['href'].rsplit(self.base_url)[1] - return { - 'collection': collection - } - else: - return { - 'collection': None, - 'error': r.status_code - } - - def _compute_information_on(self, filepath, is_archive=True): - """Given a filepath, compute necessary information on that file. - - Args: - filepath (str): Path to a file - is_archive (bool): is it an archive or not? - - Returns: - dict with keys: - 'content-type': content type associated - 'md5sum': md5 sum - 'filename': filename - """ - md5sum = hashlib.md5(open(filepath, 'rb').read()).hexdigest() - filename = os.path.basename(filepath) - - if is_archive: - extension = filename.split('.')[-1] - if 'zip' in extension: - content_type = 'application/zip' - else: - content_type = 'application/x-tar' - else: - content_type = None - - return { - 'content-type': content_type, - 'md5sum': md5sum, - 'filename': filename, - } - - def _parse_deposit_xml(self, xml_content): - """Given an xml content as string, returns a deposit dict. - - """ - tree = etree.fromstring(xml_content) - vals = tree.xpath( - '/x:entry/x:deposit_id', - namespaces={'x': 'http://www.w3.org/2005/Atom'}) - deposit_id = vals[0].text - - return {'deposit_id': deposit_id} - - def deposit_binary(self, deposit_url, filepath, slug, in_progress=False, - log=None): - - info = self._compute_information_on(filepath) - - headers = { - 'SLUG': slug, - 'CONTENT_MD5': info['md5sum'], - 'IN-PROGRESS': str(in_progress), - 'CONTENT-TYPE': info['content_type'], - 'CONTENT-DISPOSITION': 'attachment; filename=%s' % ( - info['filename'], ), - } - - try: - with open(filepath, 'rb') as f: - r = self.do('post', deposit_url, data=f, headers=headers) - - except Exception as e: - msg = 'Binary posting deposit failure at %s: %s' % (deposit_url, e) - if log: - log.error(msg) - - return { - 'deposit_id': None, - 'error': msg, - } - else: - if r.ok: - return self._parse_deposit_xml(r.text) - else: - return { - 'deposit_id': None, - 'error': r.status_code - } - - def deposit_metadata(self, deposit_url, filepath, slug, in_progress, - log=None): - headers = { - 'SLUG': slug, - 'IN-PROGRESS': str(in_progress), - 'CONTENT-TYPE': 'application/atom+xml;type=entry', - } - - try: - with open(filepath, 'rb') as f: - r = self.do('post', deposit_url, data=f, headers=headers) - - except Exception as e: - msg = 'Metadata posting deposit failure at %s: %s' % ( - deposit_url, e) - if log: - log.error(msg) - - return { - 'deposit_id': None, - 'error': msg, - } - else: - if r.ok: - return self._parse_deposit_xml(r.text) - else: - return { - 'deposit_id': None, - 'error': r.status_code - } - - def deposit_multipart(self, deposit_url, archive_path, metadata_path, - slug, in_progress, log=None): - info = self._compute_information_on(archive_path) - info_meta = self._compute_information_on(metadata_path, - is_archive=False) - - files = [ - ('file', - (info['filename'], - open(archive_path, 'rb'), - info['content-type'])), - ('atom', - (info_meta['filename'], - open(metadata_path, 'rb'), - 'application/atom+xml')), - ] - - headers = { - 'SLUG': slug, - 'CONTENT_MD5': info['md5sum'], - 'IN-PROGRESS': str(in_progress), - } - - try: - r = self.do('post', deposit_url, files=files, headers=headers) - except Exception as e: - msg = 'Multipart posting deposit failure at %s: %s' % ( - deposit_url, e) - if log: - log.error(msg) - - return { - 'deposit_id': None, - 'error': msg, - } - else: - if r.ok: - return self._parse_deposit_xml(r.text) - else: - return { - 'deposit_id': None, - 'error': r.status_code - } diff --git a/swh/deposit/loader/loader.py b/swh/deposit/loader/loader.py index d51e7bd3..07774071 100644 --- a/swh/deposit/loader/loader.py +++ b/swh/deposit/loader/loader.py @@ -1,129 +1,129 @@ -# Copyright (C) 2017 The Software Heritage developers +# Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import os import tempfile from swh.model import hashutil from swh.loader.tar import loader from swh.loader.core.loader import SWHLoader -from .client import DepositClient +from ..client import DepositClient class DepositLoader(loader.TarLoader): """Deposit loader implementation. This is a subclass of the :class:TarLoader as the main goal of this class is to first retrieve the deposit's tarball contents as one and its associated metadata. Then provide said tarball to be loaded by the TarLoader. This will: - retrieves the deposit's archive locally - provide the archive to be loaded by the tar loader - clean up the temporary location used to retrieve the archive locally - update the deposit's status accordingly """ CONFIG_BASE_FILENAME = 'loader/deposit' ADDITIONAL_CONFIG = { 'extraction_dir': ('str', '/tmp/swh.deposit.loader/'), } def __init__(self, client=None): super().__init__( logging_class='swh.deposit.loader.loader.DepositLoader') self.client = client if client else DepositClient() def load(self, *, archive_url, deposit_meta_url, deposit_update_url): SWHLoader.load( self, archive_url=archive_url, deposit_meta_url=deposit_meta_url, deposit_update_url=deposit_update_url) def prepare(self, *, archive_url, deposit_meta_url, deposit_update_url): """Prepare the loading by first retrieving the deposit's raw archive content. """ self.deposit_update_url = deposit_update_url self.client.status_update(deposit_update_url, 'loading') temporary_directory = tempfile.TemporaryDirectory() self.temporary_directory = temporary_directory archive_path = os.path.join(temporary_directory.name, 'archive.zip') archive = self.client.archive_get( archive_url, archive_path, log=self.log) metadata = self.client.metadata_get( deposit_meta_url, log=self.log) origin = metadata['origin'] visit_date = datetime.datetime.now(tz=datetime.timezone.utc) revision = metadata['revision'] occurrence = metadata['occurrence'] self.origin_metadata = metadata['origin_metadata'] self.prepare_metadata() super().prepare(tar_path=archive, origin=origin, visit_date=visit_date, revision=revision, occurrences=[occurrence]) def store_metadata(self): """Storing the origin_metadata during the load processus. Provider_id and tool_id are resolved during the prepare() method. """ origin_id = self.origin_id visit_date = self.visit_date provider_id = self.origin_metadata['provider']['provider_id'] tool_id = self.origin_metadata['tool']['tool_id'] metadata = self.origin_metadata['metadata'] try: self.send_origin_metadata(origin_id, visit_date, provider_id, tool_id, metadata) except Exception: self.log.exception('Problem when storing origin_metadata') raise def post_load(self, success=True): """Updating the deposit's status according to its loading status. If not successful, we update its status to 'failed'. Otherwise, we update its status to 'done' and pass along its associated revision. """ try: if not success: self.client.status_update(self.deposit_update_url, status='failed') return # first retrieve the new revision [rev_id] = self.objects['revision'].keys() if rev_id: rev_id_hex = hashutil.hash_to_hex(rev_id) # then update the deposit's status to success with its # revision-id self.client.status_update(self.deposit_update_url, status='done', revision_id=rev_id_hex) except Exception: self.log.exception( 'Problem when trying to update the deposit\'s status') def cleanup(self): """Clean up temporary directory where we retrieved the tarball. """ super().cleanup() self.temporary_directory.cleanup() diff --git a/swh/deposit/tests/loader/common.py b/swh/deposit/tests/loader/common.py index a1103943..5d7e4d33 100644 --- a/swh/deposit/tests/loader/common.py +++ b/swh/deposit/tests/loader/common.py @@ -1,49 +1,49 @@ -# Copyright (C) 2017 The Software Heritage developers +# Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json -from swh.deposit.loader.client import DepositClient +from swh.deposit.client import DepositClient CLIENT_TEST_CONFIG = { 'url': 'http://nowhere:9000/', 'auth': {}, # no authentication in test scenario } class SWHDepositTestClient(DepositClient): """Deposit test client to permit overriding the default request client. """ def __init__(self, client, config): super().__init__(config=config) self.client = client def archive_get(self, archive_update_url, archive_path, log=None): r = self.client.get(archive_update_url) with open(archive_path, 'wb') as f: for chunk in r.streaming_content: f.write(chunk) return archive_path def metadata_get(self, metadata_url, log=None): r = self.client.get(metadata_url) return json.loads(r.content.decode('utf-8')) def status_update(self, update_status_url, status, revision_id=None): payload = {'status': status} if revision_id: payload['revision_id'] = revision_id self.client.put(update_status_url, content_type='application/json', data=json.dumps(payload)) def check(self, check_url): r = self.client.get(check_url) data = json.loads(r.content.decode('utf-8')) return data['status'] diff --git a/swh/deposit/tests/loader/test_client.py b/swh/deposit/tests/loader/test_client.py index c4ec4963..60dfc1aa 100644 --- a/swh/deposit/tests/loader/test_client.py +++ b/swh/deposit/tests/loader/test_client.py @@ -1,268 +1,268 @@ -# Copyright (C) 2017 The Software Heritage developers +# Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import shutil import tempfile import unittest from nose.plugins.attrib import attr from nose.tools import istest -from swh.deposit.loader.client import DepositClient +from swh.deposit.client import DepositClient from swh.deposit.config import DEPOSIT_STATUS_LOAD_SUCCESS from swh.deposit.config import DEPOSIT_STATUS_LOAD_FAILURE from .common import CLIENT_TEST_CONFIG class StreamedResponse: """Streamed response facsimile """ def __init__(self, ok, stream): self.ok = ok self.stream = stream def iter_content(self): yield from self.stream class FakeRequestClientGet: """Fake request client dedicated to get method calls. """ def __init__(self, response): self.response = response def get(self, *args, **kwargs): self.args = args self.kwargs = kwargs return self.response @attr('fs') class DepositClientReadArchiveTest(unittest.TestCase): def setUp(self): super().setUp() self.temporary_directory = tempfile.mkdtemp(dir='/tmp') def tearDown(self): super().setUp() shutil.rmtree(self.temporary_directory) @istest def archive_get(self): """Reading archive should write data in temporary directory """ stream_content = [b"some", b"streamed", b"response"] response = StreamedResponse( ok=True, stream=(s for s in stream_content)) _client = FakeRequestClientGet(response) deposit_client = DepositClient(config=CLIENT_TEST_CONFIG, _client=_client) archive_path = os.path.join(self.temporary_directory, 'test.archive') archive_path = deposit_client.archive_get('/some/url', archive_path) self.assertTrue(os.path.exists(archive_path)) with open(archive_path, 'rb') as f: actual_content = f.read() self.assertEquals(actual_content, b''.join(stream_content)) self.assertEquals(_client.args, ('http://nowhere:9000/some/url', )) self.assertEquals(_client.kwargs, { 'stream': True }) @istest def archive_get_with_authentication(self): """Reading archive should write data in temporary directory """ stream_content = [b"some", b"streamed", b"response", b"for", b"auth"] response = StreamedResponse( ok=True, stream=(s for s in stream_content)) _client = FakeRequestClientGet(response) _config = CLIENT_TEST_CONFIG.copy() _config['auth'] = { # add authentication setup 'username': 'user', 'password': 'pass' } deposit_client = DepositClient(_config, _client=_client) archive_path = os.path.join(self.temporary_directory, 'test.archive') archive_path = deposit_client.archive_get('/some/url', archive_path) self.assertTrue(os.path.exists(archive_path)) with open(archive_path, 'rb') as f: actual_content = f.read() self.assertEquals(actual_content, b''.join(stream_content)) self.assertEquals(_client.args, ('http://nowhere:9000/some/url', )) self.assertEquals(_client.kwargs, { 'stream': True, 'auth': ('user', 'pass') }) @istest def archive_get_can_fail(self): """Reading archive can fail for some reasons """ response = StreamedResponse(ok=False, stream=None) _client = FakeRequestClientGet(response) deposit_client = DepositClient(config=CLIENT_TEST_CONFIG, _client=_client) with self.assertRaisesRegex( ValueError, 'Problem when retrieving deposit archive'): deposit_client.archive_get('/some/url', 'some/path') class JsonResponse: """Json response facsimile """ def __init__(self, ok, response): self.ok = ok self.response = response def json(self): return self.response class DepositClientReadMetadataTest(unittest.TestCase): @istest def metadata_get(self): """Reading archive should write data in temporary directory """ expected_response = {"some": "dict"} response = JsonResponse( ok=True, response=expected_response) _client = FakeRequestClientGet(response) deposit_client = DepositClient(config=CLIENT_TEST_CONFIG, _client=_client) actual_metadata = deposit_client.metadata_get('/metadata') self.assertEquals(actual_metadata, expected_response) @istest def metadata_get_can_fail(self): """Reading metadata can fail for some reasons """ _client = FakeRequestClientGet(JsonResponse(ok=False, response=None)) deposit_client = DepositClient(config=CLIENT_TEST_CONFIG, _client=_client) with self.assertRaisesRegex( ValueError, 'Problem when retrieving metadata at'): deposit_client.metadata_get('/some/metadata/url') class FakeRequestClientPut: """Fake Request client dedicated to put request method calls. """ args = None kwargs = None def put(self, *args, **kwargs): self.args = args self.kwargs = kwargs class DepositClientStatusUpdateTest(unittest.TestCase): @istest def status_update(self): """Update status """ _client = FakeRequestClientPut() deposit_client = DepositClient(config=CLIENT_TEST_CONFIG, _client=_client) deposit_client.status_update('/update/status', DEPOSIT_STATUS_LOAD_SUCCESS, revision_id='some-revision-id') self.assertEquals(_client.args, ('http://nowhere:9000/update/status', )) self.assertEquals(_client.kwargs, { 'json': { 'status': DEPOSIT_STATUS_LOAD_SUCCESS, 'revision_id': 'some-revision-id', } }) @istest def status_update_with_no_revision_id(self): """Reading metadata can fail for some reasons """ _client = FakeRequestClientPut() deposit_client = DepositClient(config=CLIENT_TEST_CONFIG, _client=_client) deposit_client.status_update('/update/status/fail', DEPOSIT_STATUS_LOAD_FAILURE) self.assertEquals(_client.args, ('http://nowhere:9000/update/status/fail', )) self.assertEquals(_client.kwargs, { 'json': { 'status': DEPOSIT_STATUS_LOAD_FAILURE, } }) class DepositClientCheckTest(unittest.TestCase): @istest def check(self): """When check ok, this should return the deposit's status """ _client = FakeRequestClientGet( JsonResponse(ok=True, response={'status': 'something'})) deposit_client = DepositClient(config=CLIENT_TEST_CONFIG, _client=_client) r = deposit_client.check('/check') self.assertEquals(_client.args, ('http://nowhere:9000/check', )) self.assertEquals(_client.kwargs, {}) self.assertEquals(r, 'something') @istest def check_fails(self): """Checking deposit can fail for some reason """ _client = FakeRequestClientGet( JsonResponse(ok=False, response=None)) deposit_client = DepositClient(config=CLIENT_TEST_CONFIG, _client=_client) with self.assertRaisesRegex( ValueError, 'Problem when checking deposit'): deposit_client.check('/check/fails') self.assertEquals(_client.args, ('http://nowhere:9000/check/fails', )) self.assertEquals(_client.kwargs, {})