diff --git a/PKG-INFO b/PKG-INFO index 024b20b4..77466c87 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.deposit -Version: 0.0.45 +Version: 0.0.46 Summary: Software Heritage Deposit Server Home-page: https://forge.softwareheritage.org/source/swh-deposit/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh.deposit.egg-info/PKG-INFO b/swh.deposit.egg-info/PKG-INFO index 024b20b4..77466c87 100644 --- a/swh.deposit.egg-info/PKG-INFO +++ b/swh.deposit.egg-info/PKG-INFO @@ -1,10 +1,10 @@ Metadata-Version: 1.0 Name: swh.deposit -Version: 0.0.45 +Version: 0.0.46 Summary: Software Heritage Deposit Server Home-page: https://forge.softwareheritage.org/source/swh-deposit/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Description: UNKNOWN Platform: UNKNOWN diff --git a/swh/deposit/client/__init__.py b/swh/deposit/client/__init__.py index 6761036f..b7712efd 100644 --- a/swh/deposit/client/__init__.py +++ b/swh/deposit/client/__init__.py @@ -1,595 +1,595 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Module in charge of defining an swh-deposit client """ import hashlib import os import requests from abc import ABCMeta, abstractmethod from lxml import etree from swh.core.config import SWHConfig class BaseApiDepositClient(SWHConfig): """Deposit client base class """ CONFIG_BASE_FILENAME = 'deposit/client' DEFAULT_CONFIG = { 'url': ('str', 'http://localhost:5006'), 'auth': ('dict', {}), # with optional 'username'/'password' keys } def __init__(self, config=None, _client=requests): super().__init__() if config is None: self.config = super().parse_config_file() else: self.config = config self._client = _client self.base_url = self.config['url'] auth = self.config['auth'] if auth == {}: self.auth = None else: self.auth = (auth['username'], auth['password']) def do(self, method, url, *args, **kwargs): """Internal method to deal with requests, possibly with basic http authentication. Args: method (str): supported http methods as in self._methods' keys Returns: The request's execution """ if hasattr(self._client, method): method_fn = getattr(self._client, method) else: raise ValueError('Development error, unsupported method %s' % ( method)) if self.auth: kwargs['auth'] = self.auth full_url = '%s%s' % (self.base_url.rstrip('/'), url) return method_fn(full_url, *args, **kwargs) -class ApiDepositClient(BaseApiDepositClient): +class PrivateApiDepositClient(BaseApiDepositClient): """Private API deposit client to: - read a given deposit's archive(s) - read a given deposit's metadata - update a given deposit's status """ def archive_get(self, archive_update_url, archive_path, log=None): """Retrieve the archive from the deposit to a local directory. Args: archive_update_url (str): The full deposit archive(s)'s raw content to retrieve locally archive_path (str): the local archive's path where to store the raw content Returns: The archive path to the local archive to load. Or None if any problem arose. """ r = self.do('get', archive_update_url, stream=True) if r.ok: with open(archive_path, 'wb') as f: for chunk in r.iter_content(): f.write(chunk) return archive_path msg = 'Problem when retrieving deposit archive at %s' % ( archive_update_url, ) if log: log.error(msg) raise ValueError(msg) def metadata_get(self, metadata_url, log=None): """Retrieve the metadata information on a given deposit. Args: metadata_url (str): The full deposit metadata url to retrieve locally Returns: The dictionary of metadata for that deposit or None if any problem arose. """ r = self.do('get', metadata_url) if r.ok: return r.json() msg = 'Problem when retrieving metadata at %s' % metadata_url if log: log.error(msg) raise ValueError(msg) def status_update(self, update_status_url, status, revision_id=None): """Update the deposit's status. Args: update_status_url (str): the full deposit's archive status (str): The status to update the deposit with revision_id (str/None): the revision's identifier to update to """ payload = {'status': status} if revision_id: payload['revision_id'] = revision_id self.do('put', update_status_url, json=payload) def check(self, check_url, log=None): """Check the deposit's associated data (metadata, archive(s)) Args: check_url (str): the full deposit's check url """ r = self.do('get', check_url) if r.ok: data = r.json() return data['status'] msg = 'Problem when checking deposit %s' % check_url if log: log.error(msg) raise ValueError(msg) class BaseDepositClient(BaseApiDepositClient, metaclass=ABCMeta): """Base Deposit client to access the public api. """ def __init__(self, config, error_msg=None, empty_result={}): super().__init__(config) self.error_msg = error_msg self.empty_result = empty_result @abstractmethod def compute_url(self, *args, **kwargs): """Compute api url endpoint to query.""" pass @abstractmethod def compute_method(self, *args, **kwargs): """Http method to use on the url""" pass @abstractmethod def parse_result_ok(self, xml_content): """Given an xml result from the api endpoint, parse it and returns a dict. """ pass def compute_information(self, *args, **kwargs): """Compute some more information given the inputs (e.g http headers, ...) """ return {} def parse_result_error(self, xml_content): """Given an error response in xml, parse it into a dict. Returns: dict with following keys: 'error': The error message 'detail': Some more detail about the error if any """ tree = etree.fromstring(xml_content.encode('utf-8')) vals = tree.xpath('/x:error/y:summary', namespaces={ 'x': 'http://purl.org/net/sword/', 'y': 'http://www.w3.org/2005/Atom' }) summary = vals[0].text if summary: summary = summary.strip() vals = tree.xpath( '/x:error/x:verboseDescription', namespaces={'x': 'http://purl.org/net/sword/'}) if vals: detail = vals[0].text.strip() else: detail = None return {'error': summary, 'detail': detail} def do_execute(self, method, url, info): """Execute the http query to url using method and info information. By default, execute a simple query to url with the http method. Override this in daughter class to improve the default behavior if needed. """ return self.do(method, url) def execute(self, *args, **kwargs): """Main endpoint to prepare and execute the http query to the api. """ url = self.compute_url(*args, **kwargs) method = self.compute_method(*args, **kwargs) info = self.compute_information(*args, **kwargs) try: r = self.do_execute(method, url, info) except Exception as e: msg = self.error_msg % (url, e) r = self.empty_result r.update({ 'error': msg, }) return r else: if r.ok: if int(r.status_code) == 204: # 204 returns no body return {'status': r.status_code} else: return self.parse_result_ok(r.text) else: error = self.parse_result_error(r.text) empty = self.empty_result error.update(empty) error.update({ 'status': r.status_code, }) return error class ServiceDocumentDepositClient(BaseDepositClient): """Service Document information retrieval. """ def __init__(self, config): super().__init__(config, error_msg='Service document failure at %s: %s', empty_result={'collection': None}) def compute_url(self, *args, **kwargs): return '/servicedocument/' def compute_method(self, *args, **kwargs): return 'get' def parse_result_ok(self, xml_content): """Parse service document's success response. """ tree = etree.fromstring(xml_content.encode('utf-8')) collections = tree.xpath( '/x:service/x:workspace/x:collection/y:name', namespaces={'x': 'http://www.w3.org/2007/app', 'y': 'http://purl.org/net/sword/terms/'}) if collections: collection = collections[0].text else: collection = None return { 'collection': collection } class StatusDepositClient(BaseDepositClient): """Status information on a deposit. """ def __init__(self, config): super().__init__(config, error_msg='Status check failure at %s: %s', empty_result={ 'deposit_status': None, 'deposit_status_detail': None, 'deposit_swh_id': None, }) def compute_url(self, collection, deposit_id): return '/%s/%s/status/' % (collection, deposit_id) def compute_method(self, *args, **kwargs): return 'get' def parse_result_ok(self, xml_content): """Given an xml content as string, returns a deposit dict. """ tree = etree.fromstring(xml_content.encode('utf-8')) vals = tree.xpath( '/x:entry/x:deposit_id', namespaces={'x': 'http://www.w3.org/2005/Atom'}) deposit_id = vals[0].text vals = tree.xpath( '/x:entry/x:deposit_status', namespaces={'x': 'http://www.w3.org/2005/Atom'}) deposit_status = vals[0].text vals = tree.xpath( '/x:entry/x:deposit_status_detail', namespaces={'x': 'http://www.w3.org/2005/Atom'}) deposit_status_detail = vals[0].text vals = tree.xpath( '/x:entry/x:deposit_swh_id', namespaces={'x': 'http://www.w3.org/2005/Atom'}) if vals: deposit_swh_id = vals[0].text else: deposit_swh_id = None return { 'deposit_id': deposit_id, 'deposit_status': deposit_status, 'deposit_status_detail': deposit_status_detail, 'deposit_swh_id': deposit_swh_id, } class BaseCreateDepositClient(BaseDepositClient): """Deposit client base class to post new deposit. """ def __init__(self, config): super().__init__(config, error_msg='Post Deposit failure at %s: %s', empty_result={ 'deposit_id': None, 'deposit_status': None, }) def compute_url(self, collection, *args, **kwargs): return '/%s/' % collection def compute_method(self, *args, **kwargs): return 'post' def parse_result_ok(self, xml_content): """Given an xml content as string, returns a deposit dict. """ tree = etree.fromstring(xml_content.encode('utf-8')) vals = tree.xpath( '/x:entry/x:deposit_id', namespaces={'x': 'http://www.w3.org/2005/Atom'}) deposit_id = vals[0].text vals = tree.xpath( '/x:entry/x:deposit_status', namespaces={'x': 'http://www.w3.org/2005/Atom'}) deposit_status = vals[0].text vals = tree.xpath( '/x:entry/x:deposit_date', namespaces={'x': 'http://www.w3.org/2005/Atom'}) deposit_date = vals[0].text return { 'deposit_id': deposit_id, 'deposit_status': deposit_status, 'deposit_date': deposit_date, } def _compute_information(self, collection, filepath, in_progress, slug, is_archive=True): """Given a filepath, compute necessary information on that file. Args: filepath (str): Path to a file is_archive (bool): is it an archive or not? Returns: dict with keys: 'content-type': content type associated 'md5sum': md5 sum 'filename': filename """ filename = os.path.basename(filepath) if is_archive: md5sum = hashlib.md5(open(filepath, 'rb').read()).hexdigest() extension = filename.split('.')[-1] if 'zip' in extension: content_type = 'application/zip' else: content_type = 'application/x-tar' else: content_type = None md5sum = None return { 'slug': slug, 'in_progress': in_progress, 'content-type': content_type, 'md5sum': md5sum, 'filename': filename, 'filepath': filepath, } def compute_information(self, collection, filepath, in_progress, slug, is_archive=True, **kwargs): info = self._compute_information(collection, filepath, in_progress, slug, is_archive=is_archive) info['headers'] = self.compute_headers(info) return info def do_execute(self, method, url, info): with open(info['filepath'], 'rb') as f: return self.do(method, url, data=f, headers=info['headers']) class CreateArchiveDepositClient(BaseCreateDepositClient): """Post an archive (binary) deposit client.""" def compute_headers(self, info): return { 'SLUG': info['slug'], 'CONTENT_MD5': info['md5sum'], 'IN-PROGRESS': str(info['in_progress']), 'CONTENT-TYPE': info['content-type'], 'CONTENT-DISPOSITION': 'attachment; filename=%s' % ( info['filename'], ), } class UpdateArchiveDepositClient(CreateArchiveDepositClient): """Update (add/replace) an archive (binary) deposit client.""" def compute_url(self, collection, *args, deposit_id=None, **kwargs): return '/%s/%s/media/' % (collection, deposit_id) def compute_method(self, *args, replace=False, **kwargs): return 'put' if replace else 'post' class CreateMetadataDepositClient(BaseCreateDepositClient): """Post a metadata deposit client.""" def compute_headers(self, info): return { 'SLUG': info['slug'], 'IN-PROGRESS': str(info['in_progress']), 'CONTENT-TYPE': 'application/atom+xml;type=entry', } class UpdateMetadataDepositClient(CreateMetadataDepositClient): """Update (add/replace) a metadata deposit client.""" def compute_url(self, collection, *args, deposit_id=None, **kwargs): return '/%s/%s/metadata/' % (collection, deposit_id) def compute_method(self, *args, replace=False, **kwargs): return 'put' if replace else 'post' class CreateMultipartDepositClient(BaseCreateDepositClient): """Create a multipart deposit client.""" def _multipart_info(self, info, info_meta): files = [ ('file', (info['filename'], open(info['filepath'], 'rb'), info['content-type'])), ('atom', (info_meta['filename'], open(info_meta['filepath'], 'rb'), 'application/atom+xml')), ] headers = { 'SLUG': info['slug'], 'CONTENT_MD5': info['md5sum'], 'IN-PROGRESS': str(info['in_progress']), } return files, headers def compute_information(self, collection, archive_path, metadata_path, in_progress, slug, **kwargs): info = self._compute_information( collection, archive_path, in_progress, slug) info_meta = self._compute_information( collection, metadata_path, in_progress, slug, is_archive=False) files, headers = self._multipart_info(info, info_meta) return {'files': files, 'headers': headers} def do_execute(self, method, url, info): return self.do( method, url, files=info['files'], headers=info['headers']) class UpdateMultipartDepositClient(CreateMultipartDepositClient): """Update a multipart deposit client.""" def compute_url(self, collection, *args, deposit_id=None, **kwargs): return '/%s/%s/metadata/' % (collection, deposit_id) def compute_method(self, *args, replace=False, **kwargs): return 'put' if replace else 'post' class PublicApiDepositClient(BaseApiDepositClient): """Public api deposit client.""" def service_document(self, log=None): """Retrieve service document endpoint's information.""" return ServiceDocumentDepositClient(self.config).execute() def deposit_status(self, collection, deposit_id, log=None): """Retrieve status information on a deposit.""" return StatusDepositClient(self.config).execute( collection, deposit_id) def deposit_create(self, collection, slug, archive_path=None, metadata_path=None, in_progress=False, log=None): """Create a new deposit (archive, metadata, both as multipart).""" if archive_path and not metadata_path: return CreateArchiveDepositClient(self.config).execute( collection, archive_path, in_progress, slug) elif not archive_path and metadata_path: return CreateMetadataDepositClient(self.config).execute( collection, metadata_path, in_progress, slug, is_archive=False) else: return CreateMultipartDepositClient(self.config).execute( collection, archive_path, metadata_path, in_progress, slug) def deposit_update(self, collection, deposit_id, slug, archive_path=None, metadata_path=None, in_progress=False, replace=False, log=None): """Update (add/replace) existing deposit (archive, metadata, both).""" r = self.deposit_status(collection, deposit_id, log=log) if 'error' in r: return r status = r['deposit_status'] if status != 'partial': return { 'error': "You can only act on deposit with status 'partial'", 'detail': "The deposit %s has status '%s'" % ( deposit_id, status), 'deposit_status': status, 'deposit_id': deposit_id, } if archive_path and not metadata_path: r = UpdateArchiveDepositClient(self.config).execute( collection, archive_path, in_progress, slug, deposit_id=deposit_id, replace=replace, log=log) elif not archive_path and metadata_path: r = UpdateMetadataDepositClient(self.config).execute( collection, metadata_path, in_progress, slug, deposit_id=deposit_id, replace=replace, log=log) else: r = UpdateMultipartDepositClient(self.config).execute( collection, archive_path, metadata_path, in_progress, slug, deposit_id=deposit_id, replace=replace, log=log) if 'error' in r: return r return self.deposit_status(collection, deposit_id, log=log) diff --git a/swh/deposit/client/cli.py b/swh/deposit/client/cli.py index bdbc7768..ea7512b0 100755 --- a/swh/deposit/client/cli.py +++ b/swh/deposit/client/cli.py @@ -1,294 +1,296 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Script to demonstrate software deposit scenario to https://deposit.sofwareheritage.org. Use: python3 -m swh.deposit.client.cli --help Documentation: https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html # noqa """ import os import click import logging import uuid from . import PublicApiDepositClient class InputError(ValueError): """Input script error """ pass def generate_slug(prefix='swh-sample'): """Generate a slug (sample purposes). """ return '%s-%s' % (prefix, uuid.uuid4()) def parse_cli_options(username, password, archive, metadata, archive_deposit, metadata_deposit, collection, slug, partial, deposit_id, replace, url, status): """Parse the cli options and make sure the combination is acceptable*. If not, an InputError exception is raised explaining the issue. By acceptable, we mean: - A multipart deposit (create or update) needs both an existing software archive and an existing metadata file - A binary deposit (create/update) needs an existing software archive - A metadata deposit (create/update) needs an existing metadata file - A deposit update needs a deposit_id to be provided This won't prevent all failure cases though. The remaining errors are already dealt with the underlying api client. Raises: InputError explaining the issue Returns: dict with the following keys: 'archive': the software archive to deposit 'username': username 'password': associated password 'metadata': the metadata file to deposit 'collection': the username's associated client 'slug': the slug or external id identifying the deposit to make 'partial': if the deposit is partial or not 'client': instantiated class 'url': deposit's server main entry point 'deposit_type': deposit's type (binary, multipart, metadata) 'deposit_id': optional deposit identifier """ if status and not deposit_id: raise InputError("Deposit id must be provided for status check") if status and deposit_id: # status is higher priority over deposit archive_deposit = False metadata_deposit = False + archive = None + metadata = None if archive_deposit and metadata_deposit: # too many flags use, remove redundant ones (-> multipart deposit) archive_deposit = False metadata_deposit = False - if archive_deposit and archive and not os.path.exists(archive): + if archive and not os.path.exists(archive): raise InputError('Software Archive %s must exist!' % archive) if archive and not metadata: metadata = '%s.metadata.xml' % archive if metadata_deposit: archive = None if archive_deposit: metadata = None if metadata_deposit and not metadata: raise InputError( "Metadata deposit filepath must be provided for metadata deposit") - if not status and metadata_deposit and not os.path.exists(metadata): + if metadata and not os.path.exists(metadata): raise InputError('Software Archive metadata %s must exist!' % metadata) if not status and not archive and not metadata: raise InputError( 'Please provide an actionable command. See --help for more ' 'information.') if replace and not deposit_id: raise InputError( 'To update an existing deposit, you must provide its id') client = PublicApiDepositClient({ 'url': url, 'auth': { 'username': username, 'password': password }, }) if not collection: # retrieve user's collection sd_content = client.service_document() if 'error' in sd_content: raise InputError('Service document retrieval: %s' % ( sd_content['error'], )) collection = sd_content['collection'] if not slug: # generate slug slug = generate_slug() return { 'archive': archive, 'username': username, 'password': password, 'metadata': metadata, 'collection': collection, 'slug': slug, 'partial': partial, 'client': client, 'url': url, 'deposit_id': deposit_id, 'replace': replace, } def deposit_status(config, dry_run, log): log.debug('Status deposit') client = config['client'] collection = config['collection'] deposit_id = config['deposit_id'] if not dry_run: r = client.deposit_status(collection, deposit_id, log) return r return {} def deposit_create(config, dry_run, log): """Delegate the actual deposit to the deposit client. """ log.debug('Create deposit') client = config['client'] collection = config['collection'] archive_path = config['archive'] metadata_path = config['metadata'] slug = config['slug'] in_progress = config['partial'] if not dry_run: r = client.deposit_create(collection, slug, archive_path, metadata_path, in_progress, log) return r return {} def deposit_update(config, dry_run, log): """Delegate the actual deposit to the deposit client. """ log.debug('Update deposit') client = config['client'] collection = config['collection'] deposit_id = config['deposit_id'] archive_path = config['archive'] metadata_path = config['metadata'] slug = config['slug'] in_progress = config['partial'] replace = config['replace'] if not dry_run: r = client.deposit_update(collection, deposit_id, slug, archive_path, metadata_path, in_progress, replace, log) return r return {} @click.command() @click.option('--username', required=1, help="(Mandatory) User's name") @click.option('--password', required=1, help="(Mandatory) User's associated password") @click.option('--archive', help='(Optional) Software archive to deposit') @click.option('--metadata', help="(Optional) Path to xml metadata file. If not provided, this will use a file named .metadata.xml") # noqa @click.option('--archive-deposit/--no-archive-deposit', default=False, help='(Optional) Software archive only deposit') @click.option('--metadata-deposit/--no-metadata-deposit', default=False, help='(Optional) Metadata only deposit') @click.option('--collection', help="(Optional) User's collection. If not provided, this will be fetched.") # noqa @click.option('--slug', help="""(Optional) External system information identifier. If not provided, it will be generated""") # noqa @click.option('--partial/--no-partial', default=False, help='(Optional) The deposit will be partial, other deposits will have to take place to finalize it.') # noqa @click.option('--deposit-id', default=None, help='(Optional) Update an existing partial deposit with its identifier') # noqa @click.option('--replace/--no-replace', default=False, help='(Optional) Update by replacing existing metadata to a deposit') # noqa @click.option('--url', default='http://deposit.softwareheritage.org/1', help="(Optional) Deposit server api endpoint. By default, https://deposit.softwareheritage.org/1") # noqa @click.option('--status/--no-status', default=False, help="(Optional) Deposit's status") @click.option('--dry-run/--no-dry-run', default=False, help='(Optional) No-op deposit') @click.option('--verbose/--no-verbose', default=False, help='Verbose mode') def main(username, password, archive=None, metadata=None, archive_deposit=False, metadata_deposit=False, collection=None, slug=None, partial=False, deposit_id=None, replace=False, status=False, url='https://deposit.softwareheritage.org/1', dry_run=True, verbose=False): """Software Heritage Deposit client - Create (or update partial) deposit through the command line. More documentation can be found at https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html. """ log = logging.getLogger('swh-deposit') log.addHandler(logging.StreamHandler()) _loglevel = logging.DEBUG if verbose else logging.INFO log.setLevel(_loglevel) if dry_run: log.info("**DRY RUN**") config = {} try: log.debug('Parsing cli options') config = parse_cli_options( username, password, archive, metadata, archive_deposit, metadata_deposit, collection, slug, partial, deposit_id, replace, url, status) except InputError as e: msg = 'Problem during parsing options: %s' % e r = { 'error': msg, } log.info(r) return 1 if verbose: log.info("Parsed configuration: %s" % ( config, )) deposit_id = config['deposit_id'] if status and deposit_id: r = deposit_status(config, dry_run, log) elif not status and deposit_id: r = deposit_update(config, dry_run, log) elif not status and not deposit_id: r = deposit_create(config, dry_run, log) log.info(r) if __name__ == '__main__': main() diff --git a/swh/deposit/loader/checker.py b/swh/deposit/loader/checker.py index cb21ee81..b7677f52 100644 --- a/swh/deposit/loader/checker.py +++ b/swh/deposit/loader/checker.py @@ -1,20 +1,20 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from ..client import ApiDepositClient +from ..client import PrivateApiDepositClient class DepositChecker(): """Deposit checker implementation. Trigger deposit's checks through the private api. """ def __init__(self, client=None): super().__init__() - self.client = client if client else ApiDepositClient() + self.client = client if client else PrivateApiDepositClient() def check(self, deposit_check_url): return self.client.check(deposit_check_url) diff --git a/swh/deposit/loader/loader.py b/swh/deposit/loader/loader.py index 22315d15..855ee415 100644 --- a/swh/deposit/loader/loader.py +++ b/swh/deposit/loader/loader.py @@ -1,129 +1,129 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime import os import tempfile from swh.model import hashutil from swh.loader.tar import loader from swh.loader.core.loader import SWHLoader -from ..client import ApiDepositClient +from ..client import PrivateApiDepositClient class DepositLoader(loader.TarLoader): """Deposit loader implementation. This is a subclass of the :class:TarLoader as the main goal of this class is to first retrieve the deposit's tarball contents as one and its associated metadata. Then provide said tarball to be loaded by the TarLoader. This will: - retrieves the deposit's archive locally - provide the archive to be loaded by the tar loader - clean up the temporary location used to retrieve the archive locally - update the deposit's status accordingly """ CONFIG_BASE_FILENAME = 'loader/deposit' ADDITIONAL_CONFIG = { 'extraction_dir': ('str', '/tmp/swh.deposit.loader/'), } def __init__(self, client=None): super().__init__( logging_class='swh.deposit.loader.loader.DepositLoader') - self.client = client if client else ApiDepositClient() + self.client = client if client else PrivateApiDepositClient() def load(self, *, archive_url, deposit_meta_url, deposit_update_url): SWHLoader.load( self, archive_url=archive_url, deposit_meta_url=deposit_meta_url, deposit_update_url=deposit_update_url) def prepare(self, *, archive_url, deposit_meta_url, deposit_update_url): """Prepare the loading by first retrieving the deposit's raw archive content. """ self.deposit_update_url = deposit_update_url self.client.status_update(deposit_update_url, 'loading') temporary_directory = tempfile.TemporaryDirectory() self.temporary_directory = temporary_directory archive_path = os.path.join(temporary_directory.name, 'archive.zip') archive = self.client.archive_get( archive_url, archive_path, log=self.log) metadata = self.client.metadata_get( deposit_meta_url, log=self.log) origin = metadata['origin'] visit_date = datetime.datetime.now(tz=datetime.timezone.utc) revision = metadata['revision'] occurrence = metadata['occurrence'] self.origin_metadata = metadata['origin_metadata'] self.prepare_metadata() super().prepare(tar_path=archive, origin=origin, visit_date=visit_date, revision=revision, occurrences=[occurrence]) def store_metadata(self): """Storing the origin_metadata during the load processus. Provider_id and tool_id are resolved during the prepare() method. """ origin_id = self.origin_id visit_date = self.visit_date provider_id = self.origin_metadata['provider']['provider_id'] tool_id = self.origin_metadata['tool']['tool_id'] metadata = self.origin_metadata['metadata'] try: self.send_origin_metadata(origin_id, visit_date, provider_id, tool_id, metadata) except Exception: self.log.exception('Problem when storing origin_metadata') raise def post_load(self, success=True): """Updating the deposit's status according to its loading status. If not successful, we update its status to 'failed'. Otherwise, we update its status to 'done' and pass along its associated revision. """ try: if not success: self.client.status_update(self.deposit_update_url, status='failed') return # first retrieve the new revision [rev_id] = self.objects['revision'].keys() if rev_id: rev_id_hex = hashutil.hash_to_hex(rev_id) # then update the deposit's status to success with its # revision-id self.client.status_update(self.deposit_update_url, status='done', revision_id=rev_id_hex) except Exception: self.log.exception( 'Problem when trying to update the deposit\'s status') def cleanup(self): """Clean up temporary directory where we retrieved the tarball. """ super().cleanup() self.temporary_directory.cleanup() diff --git a/swh/deposit/tests/loader/common.py b/swh/deposit/tests/loader/common.py index 530a1fe3..389a1d47 100644 --- a/swh/deposit/tests/loader/common.py +++ b/swh/deposit/tests/loader/common.py @@ -1,49 +1,49 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json -from swh.deposit.client import ApiDepositClient +from swh.deposit.client import PrivateApiDepositClient CLIENT_TEST_CONFIG = { 'url': 'http://nowhere:9000/', 'auth': {}, # no authentication in test scenario } -class SWHDepositTestClient(ApiDepositClient): +class SWHDepositTestClient(PrivateApiDepositClient): """Deposit test client to permit overriding the default request client. """ def __init__(self, client, config): super().__init__(config=config) self.client = client def archive_get(self, archive_update_url, archive_path, log=None): r = self.client.get(archive_update_url) with open(archive_path, 'wb') as f: for chunk in r.streaming_content: f.write(chunk) return archive_path def metadata_get(self, metadata_url, log=None): r = self.client.get(metadata_url) return json.loads(r.content.decode('utf-8')) def status_update(self, update_status_url, status, revision_id=None): payload = {'status': status} if revision_id: payload['revision_id'] = revision_id self.client.put(update_status_url, content_type='application/json', data=json.dumps(payload)) def check(self, check_url): r = self.client.get(check_url) data = json.loads(r.content.decode('utf-8')) return data['status'] diff --git a/swh/deposit/tests/loader/test_client.py b/swh/deposit/tests/loader/test_client.py index c49b21d5..55b857d9 100644 --- a/swh/deposit/tests/loader/test_client.py +++ b/swh/deposit/tests/loader/test_client.py @@ -1,268 +1,268 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import shutil import tempfile import unittest from nose.plugins.attrib import attr from nose.tools import istest -from swh.deposit.client import ApiDepositClient +from swh.deposit.client import PrivateApiDepositClient from swh.deposit.config import DEPOSIT_STATUS_LOAD_SUCCESS from swh.deposit.config import DEPOSIT_STATUS_LOAD_FAILURE from .common import CLIENT_TEST_CONFIG class StreamedResponse: """Streamed response facsimile """ def __init__(self, ok, stream): self.ok = ok self.stream = stream def iter_content(self): yield from self.stream class FakeRequestClientGet: """Fake request client dedicated to get method calls. """ def __init__(self, response): self.response = response def get(self, *args, **kwargs): self.args = args self.kwargs = kwargs return self.response @attr('fs') -class ApiDepositClientReadArchiveTest(unittest.TestCase): +class PrivateApiDepositClientReadArchiveTest(unittest.TestCase): def setUp(self): super().setUp() self.temporary_directory = tempfile.mkdtemp(dir='/tmp') def tearDown(self): super().setUp() shutil.rmtree(self.temporary_directory) @istest def archive_get(self): """Reading archive should write data in temporary directory """ stream_content = [b"some", b"streamed", b"response"] response = StreamedResponse( ok=True, stream=(s for s in stream_content)) _client = FakeRequestClientGet(response) - deposit_client = ApiDepositClient(config=CLIENT_TEST_CONFIG, - _client=_client) + deposit_client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG, + _client=_client) archive_path = os.path.join(self.temporary_directory, 'test.archive') archive_path = deposit_client.archive_get('/some/url', archive_path) self.assertTrue(os.path.exists(archive_path)) with open(archive_path, 'rb') as f: actual_content = f.read() self.assertEquals(actual_content, b''.join(stream_content)) self.assertEquals(_client.args, ('http://nowhere:9000/some/url', )) self.assertEquals(_client.kwargs, { 'stream': True }) @istest def archive_get_with_authentication(self): """Reading archive should write data in temporary directory """ stream_content = [b"some", b"streamed", b"response", b"for", b"auth"] response = StreamedResponse( ok=True, stream=(s for s in stream_content)) _client = FakeRequestClientGet(response) _config = CLIENT_TEST_CONFIG.copy() _config['auth'] = { # add authentication setup 'username': 'user', 'password': 'pass' } - deposit_client = ApiDepositClient(_config, _client=_client) + deposit_client = PrivateApiDepositClient(_config, _client=_client) archive_path = os.path.join(self.temporary_directory, 'test.archive') archive_path = deposit_client.archive_get('/some/url', archive_path) self.assertTrue(os.path.exists(archive_path)) with open(archive_path, 'rb') as f: actual_content = f.read() self.assertEquals(actual_content, b''.join(stream_content)) self.assertEquals(_client.args, ('http://nowhere:9000/some/url', )) self.assertEquals(_client.kwargs, { 'stream': True, 'auth': ('user', 'pass') }) @istest def archive_get_can_fail(self): """Reading archive can fail for some reasons """ response = StreamedResponse(ok=False, stream=None) _client = FakeRequestClientGet(response) - deposit_client = ApiDepositClient(config=CLIENT_TEST_CONFIG, - _client=_client) + deposit_client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG, + _client=_client) with self.assertRaisesRegex( ValueError, 'Problem when retrieving deposit archive'): deposit_client.archive_get('/some/url', 'some/path') class JsonResponse: """Json response facsimile """ def __init__(self, ok, response): self.ok = ok self.response = response def json(self): return self.response -class ApiDepositClientReadMetadataTest(unittest.TestCase): +class PrivateApiDepositClientReadMetadataTest(unittest.TestCase): @istest def metadata_get(self): """Reading archive should write data in temporary directory """ expected_response = {"some": "dict"} response = JsonResponse( ok=True, response=expected_response) _client = FakeRequestClientGet(response) - deposit_client = ApiDepositClient(config=CLIENT_TEST_CONFIG, - _client=_client) + deposit_client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG, + _client=_client) actual_metadata = deposit_client.metadata_get('/metadata') self.assertEquals(actual_metadata, expected_response) @istest def metadata_get_can_fail(self): """Reading metadata can fail for some reasons """ _client = FakeRequestClientGet(JsonResponse(ok=False, response=None)) - deposit_client = ApiDepositClient(config=CLIENT_TEST_CONFIG, - _client=_client) + deposit_client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG, + _client=_client) with self.assertRaisesRegex( ValueError, 'Problem when retrieving metadata at'): deposit_client.metadata_get('/some/metadata/url') class FakeRequestClientPut: """Fake Request client dedicated to put request method calls. """ args = None kwargs = None def put(self, *args, **kwargs): self.args = args self.kwargs = kwargs -class ApiDepositClientStatusUpdateTest(unittest.TestCase): +class PrivateApiDepositClientStatusUpdateTest(unittest.TestCase): @istest def status_update(self): """Update status """ _client = FakeRequestClientPut() - deposit_client = ApiDepositClient(config=CLIENT_TEST_CONFIG, - _client=_client) + deposit_client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG, + _client=_client) deposit_client.status_update('/update/status', DEPOSIT_STATUS_LOAD_SUCCESS, revision_id='some-revision-id') self.assertEquals(_client.args, ('http://nowhere:9000/update/status', )) self.assertEquals(_client.kwargs, { 'json': { 'status': DEPOSIT_STATUS_LOAD_SUCCESS, 'revision_id': 'some-revision-id', } }) @istest def status_update_with_no_revision_id(self): """Reading metadata can fail for some reasons """ _client = FakeRequestClientPut() - deposit_client = ApiDepositClient(config=CLIENT_TEST_CONFIG, - _client=_client) + deposit_client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG, + _client=_client) deposit_client.status_update('/update/status/fail', DEPOSIT_STATUS_LOAD_FAILURE) self.assertEquals(_client.args, ('http://nowhere:9000/update/status/fail', )) self.assertEquals(_client.kwargs, { 'json': { 'status': DEPOSIT_STATUS_LOAD_FAILURE, } }) -class ApiDepositClientCheckTest(unittest.TestCase): +class PrivateApiDepositClientCheckTest(unittest.TestCase): @istest def check(self): """When check ok, this should return the deposit's status """ _client = FakeRequestClientGet( JsonResponse(ok=True, response={'status': 'something'})) - deposit_client = ApiDepositClient(config=CLIENT_TEST_CONFIG, - _client=_client) + deposit_client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG, + _client=_client) r = deposit_client.check('/check') self.assertEquals(_client.args, ('http://nowhere:9000/check', )) self.assertEquals(_client.kwargs, {}) self.assertEquals(r, 'something') @istest def check_fails(self): """Checking deposit can fail for some reason """ _client = FakeRequestClientGet( JsonResponse(ok=False, response=None)) - deposit_client = ApiDepositClient(config=CLIENT_TEST_CONFIG, - _client=_client) + deposit_client = PrivateApiDepositClient(config=CLIENT_TEST_CONFIG, + _client=_client) with self.assertRaisesRegex( ValueError, 'Problem when checking deposit'): deposit_client.check('/check/fails') self.assertEquals(_client.args, ('http://nowhere:9000/check/fails', )) self.assertEquals(_client.kwargs, {}) diff --git a/version.txt b/version.txt index 4aba396b..1e7be37c 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v0.0.45-0-g188c002 \ No newline at end of file +v0.0.46-0-g6620254 \ No newline at end of file