diff --git a/swh/deposit/api/deposit_status.py b/swh/deposit/api/deposit_status.py index 52930a6d..e212f0ed 100644 --- a/swh/deposit/api/deposit_status.py +++ b/swh/deposit/api/deposit_status.py @@ -1,111 +1,120 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.shortcuts import render from rest_framework import status from .common import SWHBaseDeposit from ..errors import NOT_FOUND, make_error_response from ..errors import make_error_response_from_dict from ..models import DEPOSIT_STATUS_DETAIL, Deposit def convert_status_detail(status_detail): """Given a status_detail dict, transforms it into a human readable string. Dict has the following form (all first level keys are optional): { 'url': { 'summary': , 'fields': }, 'metadata': [{ 'summary': , 'fields': , }], 'archive': [{ 'summary': , 'fields': , }] } Args: status_detail (dict): Returns: Status detail as inlined string. """ if not status_detail: return None msg = [] for key in ['metadata', 'archive']: _detail = status_detail.get(key) if _detail: for data in _detail: suffix_msg = '' fields = data.get('fields') if fields: suffix_msg = ' (%s)' % ', '.join(fields) msg.append('- %s%s\n' % (data['summary'], suffix_msg)) _detail = status_detail.get('url') if _detail: fields = _detail.get('fields') suffix_msg = '' if fields: suffix_msg = ' (%s)' % ', '.join(fields) msg.append('- %s%s\n' % (_detail['summary'], suffix_msg)) if not msg: return None return ''.join(msg) class SWHDepositStatus(SWHBaseDeposit): """Deposit status. What's known as 'State IRI' in the sword specification. HTTP verbs supported: GET """ def get(self, req, collection_name, deposit_id, format=None): checks = self.checks(req, collection_name, deposit_id) if 'error' in checks: return make_error_response_from_dict(req, checks['error']) try: deposit = Deposit.objects.get(pk=deposit_id) if deposit.collection.name != collection_name: raise Deposit.DoesNotExist except Deposit.DoesNotExist: return make_error_response( req, NOT_FOUND, 'deposit %s does not belong to collection %s' % ( deposit_id, collection_name)) status_detail = convert_status_detail(deposit.status_detail) if not status_detail: status_detail = DEPOSIT_STATUS_DETAIL[deposit.status] context = { 'deposit_id': deposit.id, 'status': deposit.status, 'status_detail': status_detail, 'swh_id': None, + 'swh_id_context': None, + 'swh_anchor_id': None, + 'swh_anchor_id_context': None, } if deposit.swh_id: context['swh_id'] = deposit.swh_id + if deposit.swh_id_context: + context['swh_id_context'] = deposit.swh_id_context + if deposit.swh_anchor_id: + context['swh_anchor_id'] = deposit.swh_anchor_id + if deposit.swh_anchor_id_context: + context['swh_anchor_id_context'] = deposit.swh_anchor_id_context return render(req, 'deposit/status.xml', context=context, content_type='application/xml', status=status.HTTP_200_OK) diff --git a/swh/deposit/api/private/deposit_update_status.py b/swh/deposit/api/private/deposit_update_status.py index cbb8ccce..8b907a4c 100644 --- a/swh/deposit/api/private/deposit_update_status.py +++ b/swh/deposit/api/private/deposit_update_status.py @@ -1,74 +1,89 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from rest_framework.parsers import JSONParser -from swh.model.identifiers import persistent_identifier, REVISION +from swh.model.identifiers import ( + persistent_identifier, REVISION, DIRECTORY +) from ..common import SWHPutDepositAPI, SWHPrivateAPIView from ...errors import make_error_dict, BAD_REQUEST from ...models import Deposit, DEPOSIT_STATUS_DETAIL from ...models import DEPOSIT_STATUS_LOAD_SUCCESS class SWHUpdateStatusDeposit(SWHPutDepositAPI, SWHPrivateAPIView): """Deposit request class to update the deposit's status. HTTP verbs supported: PUT """ parser_classes = (JSONParser, ) def additional_checks(self, req, headers, collection_name, deposit_id=None): """Enrich existing checks to the default ones. New checks: - Ensure the status is provided - Ensure it exists """ data = req.data status = data.get('status') if not status: msg = 'The status key is mandatory with possible values %s' % list( DEPOSIT_STATUS_DETAIL.keys()) return make_error_dict(BAD_REQUEST, msg) if status not in DEPOSIT_STATUS_DETAIL: msg = 'Possible status in %s' % list(DEPOSIT_STATUS_DETAIL.keys()) return make_error_dict(BAD_REQUEST, msg) if status == DEPOSIT_STATUS_LOAD_SUCCESS: swh_id = data.get('revision_id') if not swh_id: msg = 'Updating status to %s requires a revision_id key' % ( status, ) return make_error_dict(BAD_REQUEST, msg) return {} def restrict_access(self, req, deposit=None): """Remove restriction modification to 'partial' deposit. Update is possible regardless of the existing status. """ return None def process_put(self, req, headers, collection_name, deposit_id): """Update the deposit's status Returns: 204 No content """ deposit = Deposit.objects.get(pk=deposit_id) deposit.status = req.data['status'] # checks already done before - swh_id = req.data.get('revision_id') - if swh_id: - deposit.swh_id = persistent_identifier(REVISION, {'id': swh_id}) + + origin_url = req.data.get('origin_url') + + dir_id = req.data.get('directory_id') + if dir_id: + deposit.swh_id = persistent_identifier(DIRECTORY, dir_id) + deposit.swh_id_context = persistent_identifier( + DIRECTORY, dir_id, metadata={'origin': origin_url}) + + rev_id = req.data.get('revision_id') + if rev_id: + deposit.swh_anchor_id = persistent_identifier( + REVISION, rev_id) + deposit.swh_anchor_id_context = persistent_identifier( + REVISION, rev_id, metadata={'origin': origin_url}) + deposit.save() return {} diff --git a/swh/deposit/client/__init__.py b/swh/deposit/client/__init__.py index b7712efd..ebcec21c 100644 --- a/swh/deposit/client/__init__.py +++ b/swh/deposit/client/__init__.py @@ -1,595 +1,625 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """Module in charge of defining an swh-deposit client """ import hashlib import os import requests from abc import ABCMeta, abstractmethod from lxml import etree from swh.core.config import SWHConfig class BaseApiDepositClient(SWHConfig): """Deposit client base class """ CONFIG_BASE_FILENAME = 'deposit/client' DEFAULT_CONFIG = { 'url': ('str', 'http://localhost:5006'), 'auth': ('dict', {}), # with optional 'username'/'password' keys } def __init__(self, config=None, _client=requests): super().__init__() if config is None: self.config = super().parse_config_file() else: self.config = config self._client = _client self.base_url = self.config['url'] auth = self.config['auth'] if auth == {}: self.auth = None else: self.auth = (auth['username'], auth['password']) def do(self, method, url, *args, **kwargs): """Internal method to deal with requests, possibly with basic http authentication. Args: method (str): supported http methods as in self._methods' keys Returns: The request's execution """ if hasattr(self._client, method): method_fn = getattr(self._client, method) else: raise ValueError('Development error, unsupported method %s' % ( method)) if self.auth: kwargs['auth'] = self.auth full_url = '%s%s' % (self.base_url.rstrip('/'), url) return method_fn(full_url, *args, **kwargs) class PrivateApiDepositClient(BaseApiDepositClient): """Private API deposit client to: - read a given deposit's archive(s) - read a given deposit's metadata - update a given deposit's status """ def archive_get(self, archive_update_url, archive_path, log=None): """Retrieve the archive from the deposit to a local directory. Args: archive_update_url (str): The full deposit archive(s)'s raw content to retrieve locally archive_path (str): the local archive's path where to store the raw content Returns: The archive path to the local archive to load. Or None if any problem arose. """ r = self.do('get', archive_update_url, stream=True) if r.ok: with open(archive_path, 'wb') as f: for chunk in r.iter_content(): f.write(chunk) return archive_path msg = 'Problem when retrieving deposit archive at %s' % ( archive_update_url, ) if log: log.error(msg) raise ValueError(msg) def metadata_get(self, metadata_url, log=None): """Retrieve the metadata information on a given deposit. Args: metadata_url (str): The full deposit metadata url to retrieve locally Returns: The dictionary of metadata for that deposit or None if any problem arose. """ r = self.do('get', metadata_url) if r.ok: return r.json() msg = 'Problem when retrieving metadata at %s' % metadata_url if log: log.error(msg) raise ValueError(msg) def status_update(self, update_status_url, status, - revision_id=None): + revision_id=None, directory_id=None): """Update the deposit's status. Args: update_status_url (str): the full deposit's archive status (str): The status to update the deposit with revision_id (str/None): the revision's identifier to update to + directory_id (str/None): the directory's identifier to update to """ payload = {'status': status} if revision_id: payload['revision_id'] = revision_id + if directory_id: + payload['directory_id'] = directory_id self.do('put', update_status_url, json=payload) def check(self, check_url, log=None): """Check the deposit's associated data (metadata, archive(s)) Args: check_url (str): the full deposit's check url """ r = self.do('get', check_url) if r.ok: data = r.json() return data['status'] msg = 'Problem when checking deposit %s' % check_url if log: log.error(msg) raise ValueError(msg) class BaseDepositClient(BaseApiDepositClient, metaclass=ABCMeta): """Base Deposit client to access the public api. """ def __init__(self, config, error_msg=None, empty_result={}): super().__init__(config) self.error_msg = error_msg self.empty_result = empty_result @abstractmethod def compute_url(self, *args, **kwargs): """Compute api url endpoint to query.""" pass @abstractmethod def compute_method(self, *args, **kwargs): """Http method to use on the url""" pass @abstractmethod def parse_result_ok(self, xml_content): """Given an xml result from the api endpoint, parse it and returns a dict. """ pass def compute_information(self, *args, **kwargs): """Compute some more information given the inputs (e.g http headers, ...) """ return {} def parse_result_error(self, xml_content): """Given an error response in xml, parse it into a dict. Returns: dict with following keys: 'error': The error message 'detail': Some more detail about the error if any """ tree = etree.fromstring(xml_content.encode('utf-8')) vals = tree.xpath('/x:error/y:summary', namespaces={ 'x': 'http://purl.org/net/sword/', 'y': 'http://www.w3.org/2005/Atom' }) summary = vals[0].text if summary: summary = summary.strip() vals = tree.xpath( '/x:error/x:verboseDescription', namespaces={'x': 'http://purl.org/net/sword/'}) if vals: detail = vals[0].text.strip() else: detail = None return {'error': summary, 'detail': detail} def do_execute(self, method, url, info): """Execute the http query to url using method and info information. By default, execute a simple query to url with the http method. Override this in daughter class to improve the default behavior if needed. """ return self.do(method, url) def execute(self, *args, **kwargs): """Main endpoint to prepare and execute the http query to the api. """ url = self.compute_url(*args, **kwargs) method = self.compute_method(*args, **kwargs) info = self.compute_information(*args, **kwargs) try: r = self.do_execute(method, url, info) except Exception as e: msg = self.error_msg % (url, e) r = self.empty_result r.update({ 'error': msg, }) return r else: if r.ok: if int(r.status_code) == 204: # 204 returns no body return {'status': r.status_code} else: return self.parse_result_ok(r.text) else: error = self.parse_result_error(r.text) empty = self.empty_result error.update(empty) error.update({ 'status': r.status_code, }) return error class ServiceDocumentDepositClient(BaseDepositClient): """Service Document information retrieval. """ def __init__(self, config): super().__init__(config, error_msg='Service document failure at %s: %s', empty_result={'collection': None}) def compute_url(self, *args, **kwargs): return '/servicedocument/' def compute_method(self, *args, **kwargs): return 'get' def parse_result_ok(self, xml_content): """Parse service document's success response. """ tree = etree.fromstring(xml_content.encode('utf-8')) collections = tree.xpath( '/x:service/x:workspace/x:collection/y:name', namespaces={'x': 'http://www.w3.org/2007/app', 'y': 'http://purl.org/net/sword/terms/'}) if collections: collection = collections[0].text else: collection = None return { 'collection': collection } class StatusDepositClient(BaseDepositClient): """Status information on a deposit. """ def __init__(self, config): super().__init__(config, error_msg='Status check failure at %s: %s', empty_result={ 'deposit_status': None, 'deposit_status_detail': None, 'deposit_swh_id': None, }) def compute_url(self, collection, deposit_id): return '/%s/%s/status/' % (collection, deposit_id) def compute_method(self, *args, **kwargs): return 'get' def parse_result_ok(self, xml_content): """Given an xml content as string, returns a deposit dict. """ tree = etree.fromstring(xml_content.encode('utf-8')) vals = tree.xpath( '/x:entry/x:deposit_id', namespaces={'x': 'http://www.w3.org/2005/Atom'}) deposit_id = vals[0].text vals = tree.xpath( '/x:entry/x:deposit_status', namespaces={'x': 'http://www.w3.org/2005/Atom'}) deposit_status = vals[0].text vals = tree.xpath( '/x:entry/x:deposit_status_detail', namespaces={'x': 'http://www.w3.org/2005/Atom'}) deposit_status_detail = vals[0].text vals = tree.xpath( '/x:entry/x:deposit_swh_id', namespaces={'x': 'http://www.w3.org/2005/Atom'}) if vals: deposit_swh_id = vals[0].text else: deposit_swh_id = None + vals = tree.xpath( + '/x:entry/x:deposit_swh_id_context', + namespaces={'x': 'http://www.w3.org/2005/Atom'}) + if vals: + deposit_swh_id_context = vals[0].text + else: + deposit_swh_id_context = None + + vals = tree.xpath( + '/x:entry/x:deposit_swh_anchor_id', + namespaces={'x': 'http://www.w3.org/2005/Atom'}) + if vals: + deposit_swh_anchor_id = vals[0].text + else: + deposit_swh_anchor_id = None + + vals = tree.xpath( + '/x:entry/x:deposit_swh_anchor_id_context', + namespaces={'x': 'http://www.w3.org/2005/Atom'}) + if vals: + deposit_swh_anchor_id_context = vals[0].text + else: + deposit_swh_anchor_id_context = None + return { 'deposit_id': deposit_id, 'deposit_status': deposit_status, 'deposit_status_detail': deposit_status_detail, 'deposit_swh_id': deposit_swh_id, + 'deposit_swh_id_context': deposit_swh_id_context, + 'deposit_swh_anchor_id': deposit_swh_anchor_id, + 'deposit_swh_anchor_id_context': deposit_swh_anchor_id_context, } class BaseCreateDepositClient(BaseDepositClient): """Deposit client base class to post new deposit. """ def __init__(self, config): super().__init__(config, error_msg='Post Deposit failure at %s: %s', empty_result={ 'deposit_id': None, 'deposit_status': None, }) def compute_url(self, collection, *args, **kwargs): return '/%s/' % collection def compute_method(self, *args, **kwargs): return 'post' def parse_result_ok(self, xml_content): """Given an xml content as string, returns a deposit dict. """ tree = etree.fromstring(xml_content.encode('utf-8')) vals = tree.xpath( '/x:entry/x:deposit_id', namespaces={'x': 'http://www.w3.org/2005/Atom'}) deposit_id = vals[0].text vals = tree.xpath( '/x:entry/x:deposit_status', namespaces={'x': 'http://www.w3.org/2005/Atom'}) deposit_status = vals[0].text vals = tree.xpath( '/x:entry/x:deposit_date', namespaces={'x': 'http://www.w3.org/2005/Atom'}) deposit_date = vals[0].text return { 'deposit_id': deposit_id, 'deposit_status': deposit_status, 'deposit_date': deposit_date, } def _compute_information(self, collection, filepath, in_progress, slug, is_archive=True): """Given a filepath, compute necessary information on that file. Args: filepath (str): Path to a file is_archive (bool): is it an archive or not? Returns: dict with keys: 'content-type': content type associated 'md5sum': md5 sum 'filename': filename """ filename = os.path.basename(filepath) if is_archive: md5sum = hashlib.md5(open(filepath, 'rb').read()).hexdigest() extension = filename.split('.')[-1] if 'zip' in extension: content_type = 'application/zip' else: content_type = 'application/x-tar' else: content_type = None md5sum = None return { 'slug': slug, 'in_progress': in_progress, 'content-type': content_type, 'md5sum': md5sum, 'filename': filename, 'filepath': filepath, } def compute_information(self, collection, filepath, in_progress, slug, is_archive=True, **kwargs): info = self._compute_information(collection, filepath, in_progress, slug, is_archive=is_archive) info['headers'] = self.compute_headers(info) return info def do_execute(self, method, url, info): with open(info['filepath'], 'rb') as f: return self.do(method, url, data=f, headers=info['headers']) class CreateArchiveDepositClient(BaseCreateDepositClient): """Post an archive (binary) deposit client.""" def compute_headers(self, info): return { 'SLUG': info['slug'], 'CONTENT_MD5': info['md5sum'], 'IN-PROGRESS': str(info['in_progress']), 'CONTENT-TYPE': info['content-type'], 'CONTENT-DISPOSITION': 'attachment; filename=%s' % ( info['filename'], ), } class UpdateArchiveDepositClient(CreateArchiveDepositClient): """Update (add/replace) an archive (binary) deposit client.""" def compute_url(self, collection, *args, deposit_id=None, **kwargs): return '/%s/%s/media/' % (collection, deposit_id) def compute_method(self, *args, replace=False, **kwargs): return 'put' if replace else 'post' class CreateMetadataDepositClient(BaseCreateDepositClient): """Post a metadata deposit client.""" def compute_headers(self, info): return { 'SLUG': info['slug'], 'IN-PROGRESS': str(info['in_progress']), 'CONTENT-TYPE': 'application/atom+xml;type=entry', } class UpdateMetadataDepositClient(CreateMetadataDepositClient): """Update (add/replace) a metadata deposit client.""" def compute_url(self, collection, *args, deposit_id=None, **kwargs): return '/%s/%s/metadata/' % (collection, deposit_id) def compute_method(self, *args, replace=False, **kwargs): return 'put' if replace else 'post' class CreateMultipartDepositClient(BaseCreateDepositClient): """Create a multipart deposit client.""" def _multipart_info(self, info, info_meta): files = [ ('file', (info['filename'], open(info['filepath'], 'rb'), info['content-type'])), ('atom', (info_meta['filename'], open(info_meta['filepath'], 'rb'), 'application/atom+xml')), ] headers = { 'SLUG': info['slug'], 'CONTENT_MD5': info['md5sum'], 'IN-PROGRESS': str(info['in_progress']), } return files, headers def compute_information(self, collection, archive_path, metadata_path, in_progress, slug, **kwargs): info = self._compute_information( collection, archive_path, in_progress, slug) info_meta = self._compute_information( collection, metadata_path, in_progress, slug, is_archive=False) files, headers = self._multipart_info(info, info_meta) return {'files': files, 'headers': headers} def do_execute(self, method, url, info): return self.do( method, url, files=info['files'], headers=info['headers']) class UpdateMultipartDepositClient(CreateMultipartDepositClient): """Update a multipart deposit client.""" def compute_url(self, collection, *args, deposit_id=None, **kwargs): return '/%s/%s/metadata/' % (collection, deposit_id) def compute_method(self, *args, replace=False, **kwargs): return 'put' if replace else 'post' class PublicApiDepositClient(BaseApiDepositClient): """Public api deposit client.""" def service_document(self, log=None): """Retrieve service document endpoint's information.""" return ServiceDocumentDepositClient(self.config).execute() def deposit_status(self, collection, deposit_id, log=None): """Retrieve status information on a deposit.""" return StatusDepositClient(self.config).execute( collection, deposit_id) def deposit_create(self, collection, slug, archive_path=None, metadata_path=None, in_progress=False, log=None): """Create a new deposit (archive, metadata, both as multipart).""" if archive_path and not metadata_path: return CreateArchiveDepositClient(self.config).execute( collection, archive_path, in_progress, slug) elif not archive_path and metadata_path: return CreateMetadataDepositClient(self.config).execute( collection, metadata_path, in_progress, slug, is_archive=False) else: return CreateMultipartDepositClient(self.config).execute( collection, archive_path, metadata_path, in_progress, slug) def deposit_update(self, collection, deposit_id, slug, archive_path=None, metadata_path=None, in_progress=False, replace=False, log=None): """Update (add/replace) existing deposit (archive, metadata, both).""" r = self.deposit_status(collection, deposit_id, log=log) if 'error' in r: return r status = r['deposit_status'] if status != 'partial': return { 'error': "You can only act on deposit with status 'partial'", 'detail': "The deposit %s has status '%s'" % ( deposit_id, status), 'deposit_status': status, 'deposit_id': deposit_id, } if archive_path and not metadata_path: r = UpdateArchiveDepositClient(self.config).execute( collection, archive_path, in_progress, slug, deposit_id=deposit_id, replace=replace, log=log) elif not archive_path and metadata_path: r = UpdateMetadataDepositClient(self.config).execute( collection, metadata_path, in_progress, slug, deposit_id=deposit_id, replace=replace, log=log) else: r = UpdateMultipartDepositClient(self.config).execute( collection, archive_path, metadata_path, in_progress, slug, deposit_id=deposit_id, replace=replace, log=log) if 'error' in r: return r return self.deposit_status(collection, deposit_id, log=log) diff --git a/swh/deposit/loader/loader.py b/swh/deposit/loader/loader.py index 5e1601b2..00bb2bcf 100644 --- a/swh/deposit/loader/loader.py +++ b/swh/deposit/loader/loader.py @@ -1,130 +1,138 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import tempfile from swh.model import hashutil from swh.loader.tar import loader from swh.loader.core.loader import SWHLoader from ..client import PrivateApiDepositClient class DepositLoader(loader.TarLoader): """Deposit loader implementation. This is a subclass of the :class:TarLoader as the main goal of this class is to first retrieve the deposit's tarball contents as one and its associated metadata. Then provide said tarball to be loaded by the TarLoader. This will: - retrieves the deposit's archive locally - provide the archive to be loaded by the tar loader - clean up the temporary location used to retrieve the archive locally - update the deposit's status accordingly """ CONFIG_BASE_FILENAME = 'loader/deposit' ADDITIONAL_CONFIG = { 'extraction_dir': ('str', '/tmp/swh.deposit.loader/'), } def __init__(self, client=None): super().__init__( logging_class='swh.deposit.loader.loader.DepositLoader') self.client = client if client else PrivateApiDepositClient() def load(self, *, archive_url, deposit_meta_url, deposit_update_url): return SWHLoader.load( self, archive_url=archive_url, deposit_meta_url=deposit_meta_url, deposit_update_url=deposit_update_url) def prepare_origin_visit(self, *, deposit_meta_url, **kwargs): self.metadata = self.client.metadata_get( deposit_meta_url, log=self.log) self.origin = self.metadata['origin'] self.visit_date = None def prepare(self, *, archive_url, deposit_meta_url, deposit_update_url): """Prepare the loading by first retrieving the deposit's raw archive content. """ self.deposit_update_url = deposit_update_url self.client.status_update(deposit_update_url, 'loading') temporary_directory = tempfile.TemporaryDirectory() self.temporary_directory = temporary_directory archive_path = os.path.join(temporary_directory.name, 'archive.zip') archive = self.client.archive_get( archive_url, archive_path, log=self.log) metadata = self.metadata revision = metadata['revision'] branch_name = metadata['branch_name'] self.origin_metadata = metadata['origin_metadata'] self.prepare_metadata() super().prepare(tar_path=archive, origin=self.origin, revision=revision, branch_name=branch_name) def store_metadata(self): """Storing the origin_metadata during the load processus. Provider_id and tool_id are resolved during the prepare() method. """ origin_id = self.origin_id visit_date = self.visit_date provider_id = self.origin_metadata['provider']['provider_id'] tool_id = self.origin_metadata['tool']['tool_id'] metadata = self.origin_metadata['metadata'] try: self.send_origin_metadata(origin_id, visit_date, provider_id, tool_id, metadata) except Exception: self.log.exception('Problem when storing origin_metadata') raise def post_load(self, success=True): """Updating the deposit's status according to its loading status. If not successful, we update its status to 'failed'. Otherwise, we update its status to 'done' and pass along its associated revision. """ try: if not success: self.client.status_update(self.deposit_update_url, status='failed') return - # first retrieve the new revision - [rev_id] = self.objects['revision'].keys() + + revisions = self.objects['revision'] + # Retrieve the revision + [rev_id] = revisions.keys() + rev = revisions[rev_id] if rev_id: - rev_id_hex = hashutil.hash_to_hex(rev_id) - # then update the deposit's status to success with its - # revision-id - self.client.status_update(self.deposit_update_url, - status='done', - revision_id=rev_id_hex) + rev_id = hashutil.hash_to_hex(rev_id) + dir_id = rev['directory'] + if dir_id: + dir_id = hashutil.hash_to_hex(dir_id) + + # update the deposit's status to success with its + # revision-id and directory-id + self.client.status_update(self.deposit_update_url, + status='done', + revision_id=rev_id, + directory_id=dir_id) except Exception: self.log.exception( 'Problem when trying to update the deposit\'s status') def cleanup(self): """Clean up temporary directory where we retrieved the tarball. """ super().cleanup() self.temporary_directory.cleanup() diff --git a/swh/deposit/migrations/0014_auto_20180720_1221.py b/swh/deposit/migrations/0014_auto_20180720_1221.py new file mode 100644 index 00000000..50d4d9a8 --- /dev/null +++ b/swh/deposit/migrations/0014_auto_20180720_1221.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.14 on 2018-07-20 12:21 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('deposit', '0013_depositrequest_raw_metadata'), + ] + + operations = [ + migrations.AddField( + model_name='deposit', + name='swh_anchor_id', + field=models.TextField(blank=True, null=True), + ), + migrations.AddField( + model_name='deposit', + name='swh_anchor_id_context', + field=models.TextField(blank=True, null=True), + ), + migrations.AddField( + model_name='deposit', + name='swh_id_context', + field=models.TextField(blank=True, null=True), + ), + ] diff --git a/swh/deposit/models.py b/swh/deposit/models.py index a5d3e20a..ba6d9320 100644 --- a/swh/deposit/models.py +++ b/swh/deposit/models.py @@ -1,218 +1,221 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information # Generated from: # cd swh_deposit && \ # python3 -m manage inspectdb from django.contrib.postgres.fields import JSONField, ArrayField from django.contrib.auth.models import User, UserManager from django.db import models from django.utils.timezone import now from .config import ( DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_REJECTED ) class Dbversion(models.Model): """Db version """ version = models.IntegerField(primary_key=True) release = models.DateTimeField(default=now, null=True) description = models.TextField(blank=True, null=True) class Meta: db_table = 'dbversion' def __str__(self): return str({ 'version': self.version, 'release': self.release, 'description': self.description }) """Possible status""" DEPOSIT_STATUS = [ (DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_PARTIAL), ('expired', 'expired'), (DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_DEPOSITED), (DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_VERIFIED), (DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_REJECTED), ('loading', 'loading'), (DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_SUCCESS), (DEPOSIT_STATUS_LOAD_FAILURE, DEPOSIT_STATUS_LOAD_FAILURE), ] """Possible status and the detailed meaning.""" DEPOSIT_STATUS_DETAIL = { DEPOSIT_STATUS_PARTIAL: 'Deposit is partially received. To finalize it, ' 'In-Progress header should be false', 'expired': 'Deposit has been there too long and is now ' 'deemed ready to be garbage collected', DEPOSIT_STATUS_DEPOSITED: 'Deposit is ready for additional checks ' '(tarball ok, metadata, etc...)', DEPOSIT_STATUS_VERIFIED: 'Deposit is fully received, checked, and ' 'ready for loading', DEPOSIT_STATUS_REJECTED: 'Deposit failed the checks', 'loading': "Loading is ongoing on swh's side", DEPOSIT_STATUS_LOAD_SUCCESS: 'The deposit has been successfully ' 'loaded into the Software Heritage archive', DEPOSIT_STATUS_LOAD_FAILURE: 'The deposit loading into the ' 'Software Heritage archive failed', } class DepositClient(User): """Deposit client """ collections = ArrayField(models.IntegerField(), null=True) objects = UserManager() provider_url = models.TextField(null=False) domain = models.TextField(null=False) class Meta: db_table = 'deposit_client' def __str__(self): return str({ 'id': self.id, 'collections': self.collections, 'username': super().username, 'domain': self.domain, 'provider_url': self.provider_url, }) class Deposit(models.Model): """Deposit reception table """ id = models.BigAutoField(primary_key=True) # First deposit reception date reception_date = models.DateTimeField(auto_now_add=True) # Date when the deposit is deemed complete and ready for loading complete_date = models.DateTimeField(null=True) # collection concerned by the deposit collection = models.ForeignKey( 'DepositCollection', models.DO_NOTHING) # Deposit's external identifier external_id = models.TextField() # Deposit client client = models.ForeignKey('DepositClient', models.DO_NOTHING) # SWH's loading result identifier swh_id = models.TextField(blank=True, null=True) + swh_id_context = models.TextField(blank=True, null=True) + swh_anchor_id = models.TextField(blank=True, null=True) + swh_anchor_id_context = models.TextField(blank=True, null=True) # Deposit's status regarding loading status = models.TextField( choices=DEPOSIT_STATUS, default=DEPOSIT_STATUS_PARTIAL) status_detail = JSONField(null=True) # deposit can have one parent parent = models.ForeignKey('self', null=True) class Meta: db_table = 'deposit' def __str__(self): d = { 'id': self.id, 'reception_date': self.reception_date, 'collection': self.collection.name, 'external_id': self.external_id, 'client': self.client.username, 'status': self.status, } if self.status in (DEPOSIT_STATUS_REJECTED): d['status_detail'] = self.status_detail return str(d) class DepositRequestType(models.Model): """Deposit request type made by clients (either archive or metadata) """ id = models.BigAutoField(primary_key=True) name = models.TextField() class Meta: db_table = 'deposit_request_type' def __str__(self): return str({'id': self.id, 'name': self.name}) def client_directory_path(instance, filename): """Callable to upload archive in MEDIA_ROOT/user_/ Args: instance (DepositRequest): DepositRequest concerned by the upload filename (str): Filename of the uploaded file Returns: A path to be prefixed by the MEDIA_ROOT to access physically to the file uploaded. """ return 'client_{0}/{1}'.format(instance.deposit.client.id, filename) class DepositRequest(models.Model): """Deposit request associated to one deposit. """ id = models.BigAutoField(primary_key=True) # Deposit concerned by the request deposit = models.ForeignKey(Deposit, models.DO_NOTHING) date = models.DateTimeField(auto_now_add=True) # Deposit request information on the data to inject # this can be null when type is 'archive' metadata = JSONField(null=True) raw_metadata = models.TextField(null=True) # this can be null when type is 'metadata' archive = models.FileField(null=True, upload_to=client_directory_path) type = models.ForeignKey( 'DepositRequestType', models.DO_NOTHING) class Meta: db_table = 'deposit_request' def __str__(self): meta = None if self.metadata: from json import dumps meta = dumps(self.metadata) archive_name = None if self.archive: archive_name = self.archive.name return str({ 'id': self.id, 'deposit': self.deposit, 'metadata': meta, 'archive': archive_name }) class DepositCollection(models.Model): id = models.BigAutoField(primary_key=True) # Human readable name for the collection type e.g HAL, arXiv, etc... name = models.TextField() class Meta: db_table = 'deposit_collection' def __str__(self): return str({'id': self.id, 'name': self.name}) diff --git a/swh/deposit/templates/deposit/status.xml b/swh/deposit/templates/deposit/status.xml index 25b1a6f5..a8378ef7 100644 --- a/swh/deposit/templates/deposit/status.xml +++ b/swh/deposit/templates/deposit/status.xml @@ -1,8 +1,11 @@ {{ deposit_id }} {{ status }} {{ status_detail }} {% if swh_id is not None %}{{ swh_id }}{% endif %} + {% if swh_id_context is not None %}{{ swh_id_context }}{% endif %} + {% if swh_anchor_id is not None %}{{ swh_anchor_id }}{% endif %} + {% if swh_anchor_id_context is not None %}{{ swh_anchor_id_context }}{% endif %} diff --git a/swh/deposit/tests/api/test_deposit_status.py b/swh/deposit/tests/api/test_deposit_status.py index ced39896..abbe07ff 100644 --- a/swh/deposit/tests/api/test_deposit_status.py +++ b/swh/deposit/tests/api/test_deposit_status.py @@ -1,230 +1,242 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from django.core.urlresolvers import reverse from io import BytesIO from nose.tools import istest from rest_framework import status from rest_framework.test import APITestCase from swh.deposit.api.deposit_status import convert_status_detail from swh.deposit.config import (COL_IRI, STATE_IRI, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_REJECTED) from swh.deposit.models import Deposit, DEPOSIT_STATUS_DETAIL from swh.deposit.models import DEPOSIT_STATUS_LOAD_SUCCESS from swh.deposit.parsers import parse_xml from ..common import BasicTestCase, WithAuthTestCase, FileSystemCreationRoutine from ..common import CommonCreationRoutine class DepositStatusTestCase(APITestCase, WithAuthTestCase, BasicTestCase, FileSystemCreationRoutine, CommonCreationRoutine): """Status on deposit """ @istest def post_deposit_with_status_check(self): """Binary upload should be accepted """ # given url = reverse(COL_IRI, args=[self.collection.name]) external_id = 'some-external-id-1' # when response = self.client.post( url, content_type='application/zip', # as zip data=self.archive['data'], # + headers CONTENT_LENGTH=self.archive['length'], HTTP_SLUG=external_id, HTTP_CONTENT_MD5=self.archive['md5sum'], HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip', HTTP_IN_PROGRESS='false', HTTP_CONTENT_DISPOSITION='attachment; filename=filename0') # then self.assertEqual(response.status_code, status.HTTP_201_CREATED) deposit = Deposit.objects.get(external_id=external_id) status_url = reverse(STATE_IRI, args=[self.collection.name, deposit.id]) # check status status_response = self.client.get(status_url) self.assertEqual(status_response.status_code, status.HTTP_200_OK) r = parse_xml(BytesIO(status_response.content)) self.assertEqual(int(r['deposit_id']), deposit.id) self.assertEqual(r['deposit_status'], DEPOSIT_STATUS_DEPOSITED) self.assertEqual(r['deposit_status_detail'], DEPOSIT_STATUS_DETAIL[DEPOSIT_STATUS_DEPOSITED]) @istest - def status_with_swh_id(self): + def status_with_swh_information(self): _status = DEPOSIT_STATUS_LOAD_SUCCESS - _swh_id = '548b3c0a2bb43e1fca191e24b5803ff6b3bc7c10' + _context = 'https://hal.archives-ouvertes.fr/hal-01727745' + _swh_id = 'swh:1:dir:42a13fc721c8716ff695d0d62fc851d641f3a12b' + _swh_id_context = '%s;%s' % (_swh_id, _context) + _swh_anchor_id = 'swh:rev:1:548b3c0a2bb43e1fca191e24b5803ff6b3bc7c10' + _swh_anchor_id_context = '%s;%s' % (_swh_anchor_id, _context) # given deposit_id = self.create_deposit_with_status( status=_status, - swh_id=_swh_id) + swh_id=_swh_id, + swh_id_context=_swh_id_context, + swh_anchor_id=_swh_anchor_id, + swh_anchor_id_context=_swh_anchor_id_context + ) url = reverse(STATE_IRI, args=[self.collection.name, deposit_id]) # when status_response = self.client.get(url) # then self.assertEqual(status_response.status_code, status.HTTP_200_OK) r = parse_xml(BytesIO(status_response.content)) self.assertEqual(int(r['deposit_id']), deposit_id) self.assertEqual(r['deposit_status'], _status) self.assertEqual(r['deposit_status_detail'], DEPOSIT_STATUS_DETAIL[DEPOSIT_STATUS_LOAD_SUCCESS]) self.assertEqual(r['deposit_swh_id'], _swh_id) + self.assertEqual(r['deposit_swh_id_context'], _swh_id_context) + self.assertEqual(r['deposit_swh_anchor_id'], _swh_anchor_id) + self.assertEqual(r['deposit_swh_anchor_id_context'], + _swh_anchor_id_context) @istest def status_on_unknown_deposit(self): """Asking for the status of unknown deposit returns 404 response""" status_url = reverse(STATE_IRI, args=[self.collection.name, 999]) status_response = self.client.get(status_url) self.assertEqual(status_response.status_code, status.HTTP_404_NOT_FOUND) @istest def status_with_http_accept_header_should_not_break(self): """Asking deposit status with Accept header should return 200 """ deposit_id = self.create_deposit_partial() status_url = reverse(STATE_IRI, args=[ self.collection.name, deposit_id]) response = self.client.get( status_url, HTTP_ACCEPT='text/html,application/xml;q=9,*/*,q=8') self.assertEqual(response.status_code, status.HTTP_200_OK) @istest def convert_status_detail_empty(self): actual_status_detail = convert_status_detail({}) self.assertIsNone(actual_status_detail) actual_status_detail = convert_status_detail({'dummy-keys': []}) self.assertIsNone(actual_status_detail) actual_status_detail = convert_status_detail(None) self.assertIsNone(actual_status_detail) @istest def convert_status_detail(self): status_detail = { 'url': { 'summary': "At least one url field must be compatible with the client\'s domain name. The following url fields failed the check", # noqa 'fields': ['blahurl', 'testurl'], }, 'metadata': [ { 'summary': 'Mandatory fields missing', 'fields': ['url', 'title'], }, { 'summary': 'Alternate fields missing', 'fields': ['name or title', 'url or badurl'] } ], 'archive': [{ 'summary': 'Unreadable archive', 'fields': ['1'], }], } expected_status_detail = '''- Mandatory fields missing (url, title) - Alternate fields missing (name or title, url or badurl) - Unreadable archive (1) - At least one url field must be compatible with the client's domain name. The following url fields failed the check (blahurl, testurl) ''' # noqa actual_status_detail = convert_status_detail(status_detail) self.assertEqual(actual_status_detail, expected_status_detail) @istest def convert_status_detail_2(self): status_detail = { 'url': { 'summary': 'At least one compatible url field. Failed', 'fields': ['testurl'], }, 'metadata': [ { 'summary': 'Mandatory fields missing', 'fields': ['name'], }, ], 'archive': [ { 'summary': 'Invalid archive', 'fields': ['2'], }, { 'summary': 'Unsupported archive', 'fields': ['1'], } ], } expected_status_detail = '''- Mandatory fields missing (name) - Invalid archive (2) - Unsupported archive (1) - At least one compatible url field. Failed (testurl) ''' actual_status_detail = convert_status_detail(status_detail) self.assertEqual(actual_status_detail, expected_status_detail) @istest def convert_status_detail_3(self): status_detail = { 'url': { 'summary': 'At least one compatible url field', }, } expected_status_detail = '- At least one compatible url field\n' actual_status_detail = convert_status_detail(status_detail) self.assertEqual(actual_status_detail, expected_status_detail) @istest def status_on_deposit_rejected(self): _status = DEPOSIT_STATUS_REJECTED _swh_id = '548b3c0a2bb43e1fca191e24b5803ff6b3bc7c10' _status_detail = {'url': {'summary': 'Wrong url'}} # given deposit_id = self.create_deposit_with_status( status=_status, swh_id=_swh_id, status_detail=_status_detail) url = reverse(STATE_IRI, args=[self.collection.name, deposit_id]) # when status_response = self.client.get(url) # then self.assertEqual(status_response.status_code, status.HTTP_200_OK) r = parse_xml(BytesIO(status_response.content)) self.assertEqual(int(r['deposit_id']), deposit_id) self.assertEqual(r['deposit_status'], _status) self.assertEqual(r['deposit_status_detail'], '- Wrong url') self.assertEqual(r['deposit_swh_id'], _swh_id) diff --git a/swh/deposit/tests/api/test_deposit_update_status.py b/swh/deposit/tests/api/test_deposit_update_status.py index 3d6cc2da..23f35052 100644 --- a/swh/deposit/tests/api/test_deposit_update_status.py +++ b/swh/deposit/tests/api/test_deposit_update_status.py @@ -1,123 +1,136 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json from django.core.urlresolvers import reverse from nose.tools import istest from rest_framework import status from rest_framework.test import APITestCase from swh.deposit.models import Deposit, DEPOSIT_STATUS_DETAIL from swh.deposit.config import PRIVATE_PUT_DEPOSIT, DEPOSIT_STATUS_VERIFIED from swh.deposit.config import DEPOSIT_STATUS_LOAD_SUCCESS from ..common import BasicTestCase class UpdateDepositStatusTest(APITestCase, BasicTestCase): """Update the deposit's status scenario """ def setUp(self): super().setUp() deposit = Deposit(status=DEPOSIT_STATUS_VERIFIED, collection=self.collection, client=self.user) deposit.save() self.deposit = Deposit.objects.get(pk=deposit.id) assert self.deposit.status == DEPOSIT_STATUS_VERIFIED @istest def update_deposit_status(self): """Existing status for update should return a 204 response """ url = reverse(PRIVATE_PUT_DEPOSIT, args=[self.collection.name, self.deposit.id]) possible_status = set(DEPOSIT_STATUS_DETAIL.keys()) - set( [DEPOSIT_STATUS_LOAD_SUCCESS]) for _status in possible_status: response = self.client.put( url, content_type='application/json', data=json.dumps({'status': _status})) self.assertEqual(response.status_code, status.HTTP_204_NO_CONTENT) deposit = Deposit.objects.get(pk=self.deposit.id) self.assertEquals(deposit.status, _status) @istest - def update_deposit_with_success_loading_and_swh_id(self): - """Existing status for update should return a 204 response + def update_deposit_status_with_info(self): + """Existing status for update with info should return a 204 response """ url = reverse(PRIVATE_PUT_DEPOSIT, args=[self.collection.name, self.deposit.id]) expected_status = DEPOSIT_STATUS_LOAD_SUCCESS + origin_url = 'something' + directory_id = '42a13fc721c8716ff695d0d62fc851d641f3a12b' revision_id = '47dc6b4636c7f6cba0df83e3d5490bf4334d987e' - expected_id = 'swh:1:rev:%s' % revision_id + expected_swh_id = 'swh:1:dir:%s' % directory_id + expected_swh_id_context = 'swh:1:dir:%s;origin=%s' % ( + directory_id, origin_url) + expected_swh_anchor_id = 'swh:1:rev:%s' % revision_id + expected_swh_anchor_id_context = 'swh:1:rev:%s;origin=%s' % ( + revision_id, origin_url) response = self.client.put( url, content_type='application/json', data=json.dumps({ 'status': expected_status, 'revision_id': revision_id, + 'directory_id': directory_id, + 'origin_url': origin_url, })) self.assertEqual(response.status_code, status.HTTP_204_NO_CONTENT) deposit = Deposit.objects.get(pk=self.deposit.id) self.assertEquals(deposit.status, expected_status) - self.assertEquals(deposit.swh_id, expected_id) + self.assertEquals(deposit.swh_id, expected_swh_id) + self.assertEquals(deposit.swh_id_context, expected_swh_id_context) + self.assertEquals(deposit.swh_anchor_id, expected_swh_anchor_id) + self.assertEquals(deposit.swh_anchor_id_context, + expected_swh_anchor_id_context) @istest def update_deposit_status_will_fail_with_unknown_status(self): """Unknown status for update should return a 400 response """ url = reverse(PRIVATE_PUT_DEPOSIT, args=[self.collection.name, self.deposit.id]) response = self.client.put( url, content_type='application/json', data=json.dumps({'status': 'unknown'})) self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) @istest def update_deposit_status_will_fail_with_no_status_key(self): """No status provided for update should return a 400 response """ url = reverse(PRIVATE_PUT_DEPOSIT, args=[self.collection.name, self.deposit.id]) response = self.client.put( url, content_type='application/json', data=json.dumps({'something': 'something'})) self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) @istest def update_deposit_status_success_without_swh_id_fail(self): """Providing successful status without swh_id should return a 400 """ url = reverse(PRIVATE_PUT_DEPOSIT, args=[self.collection.name, self.deposit.id]) response = self.client.put( url, content_type='application/json', data=json.dumps({'status': DEPOSIT_STATUS_LOAD_SUCCESS})) self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) diff --git a/swh/deposit/tests/common.py b/swh/deposit/tests/common.py index 76a4f067..e595a783 100644 --- a/swh/deposit/tests/common.py +++ b/swh/deposit/tests/common.py @@ -1,562 +1,572 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import base64 import hashlib import os import shutil import tarfile import tempfile from django.core.urlresolvers import reverse from django.test import TestCase from io import BytesIO from nose.plugins.attrib import attr from rest_framework import status from swh.deposit.config import (COL_IRI, EM_IRI, EDIT_SE_IRI, DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_REJECTED, DEPOSIT_STATUS_DEPOSITED) from swh.deposit.models import DepositClient, DepositCollection, Deposit from swh.deposit.models import DepositRequest from swh.deposit.models import DepositRequestType from swh.deposit.parsers import parse_xml from swh.deposit.settings.testing import MEDIA_ROOT from swh.core import tarball def compute_info(archive_path): """Given a path, compute information on path. """ with open(archive_path, 'rb') as f: length = 0 sha1sum = hashlib.sha1() md5sum = hashlib.md5() data = b'' for chunk in f: sha1sum.update(chunk) md5sum.update(chunk) length += len(chunk) data += chunk return { 'dir': os.path.dirname(archive_path), 'name': os.path.basename(archive_path), 'path': archive_path, 'length': length, 'sha1sum': sha1sum.hexdigest(), 'md5sum': md5sum.hexdigest(), 'data': data } def _compress(path, extension, dir_path): """Compress path according to extension """ if extension == 'zip' or extension == 'tar': return tarball.compress(path, extension, dir_path) elif '.' in extension: split_ext = extension.split('.') if split_ext[0] != 'tar': raise ValueError( 'Development error, only zip or tar archive supported, ' '%s not supported' % extension) # deal with specific tar mode = split_ext[1] supported_mode = ['xz', 'gz', 'bz2'] if mode not in supported_mode: raise ValueError( 'Development error, only %s supported, %s not supported' % ( supported_mode, mode)) files = tarball._ls(dir_path) with tarfile.open(path, 'w:%s' % mode) as t: for fpath, fname in files: t.add(fpath, arcname=fname, recursive=False) return path def create_arborescence_archive(root_path, archive_name, filename, content, up_to_size=None, extension='zip'): """Build an archive named archive_name in the root_path. This archive contains one file named filename with the content content. Args: root_path (str): Location path of the archive to create archive_name (str): Archive's name (without extension) filename (str): Archive's content is only one filename content (bytes): Content of the filename up_to_size (int | None): Fill in the blanks size to oversize or complete an archive's size extension (str): Extension of the archive to write (default is zip) Returns: dict with the keys: - dir: the directory of that archive - path: full path to the archive - sha1sum: archive's sha1sum - length: archive's length """ os.makedirs(root_path, exist_ok=True) archive_path_dir = tempfile.mkdtemp(dir=root_path) dir_path = os.path.join(archive_path_dir, archive_name) os.mkdir(dir_path) filepath = os.path.join(dir_path, filename) _length = len(content) count = 0 batch_size = 128 with open(filepath, 'wb') as f: f.write(content) if up_to_size: # fill with blank content up to a given size count += _length while count < up_to_size: f.write(b'0'*batch_size) count += batch_size _path = '%s.%s' % (dir_path, extension) _path = _compress(_path, extension, dir_path) return compute_info(_path) def create_archive_with_archive(root_path, name, archive): """Create an archive holding another. """ invalid_archive_path = os.path.join(root_path, name) with tarfile.open(invalid_archive_path, 'w:gz') as _archive: _archive.add(archive['path'], arcname=archive['name']) return compute_info(invalid_archive_path) @attr('fs') class FileSystemCreationRoutine(TestCase): """Mixin intended for tests needed to tamper with archives. """ def setUp(self): """Define the test client and other test variables.""" super().setUp() self.root_path = '/tmp/swh-deposit/test/build-zip/' os.makedirs(self.root_path, exist_ok=True) self.archive = create_arborescence_archive( self.root_path, 'archive1', 'file1', b'some content in file') self.atom_entry = b""" Awesome Compiler urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 1785io25c695 2017-10-07T15:17:08Z some awesome author https://hal-test.archives-ouvertes.fr """ def tearDown(self): super().tearDown() shutil.rmtree(self.root_path) def create_simple_binary_deposit(self, status_partial=True): response = self.client.post( reverse(COL_IRI, args=[self.collection.name]), content_type='application/zip', data=self.archive['data'], CONTENT_LENGTH=self.archive['length'], HTTP_MD5SUM=self.archive['md5sum'], HTTP_SLUG='external-id', HTTP_IN_PROGRESS=status_partial, HTTP_CONTENT_DISPOSITION='attachment; filename=%s' % ( self.archive['name'], )) # then self.assertEqual(response.status_code, status.HTTP_201_CREATED) response_content = parse_xml(BytesIO(response.content)) _status = response_content['deposit_status'] if status_partial: expected_status = DEPOSIT_STATUS_PARTIAL else: expected_status = DEPOSIT_STATUS_VERIFIED self.assertEqual(_status, expected_status) deposit_id = int(response_content['deposit_id']) return deposit_id def create_complex_binary_deposit(self, status_partial=False): deposit_id = self.create_simple_binary_deposit( status_partial=True) # Add a second archive to the deposit # update its status to DEPOSIT_STATUS_VERIFIED response = self.client.post( reverse(EM_IRI, args=[self.collection.name, deposit_id]), content_type='application/zip', data=self.archive2['data'], CONTENT_LENGTH=self.archive2['length'], HTTP_MD5SUM=self.archive2['md5sum'], HTTP_SLUG='external-id', HTTP_IN_PROGRESS=status_partial, HTTP_CONTENT_DISPOSITION='attachment; filename=filename1.zip') # then assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = int(response_content['deposit_id']) return deposit_id def create_deposit_archive_with_archive(self, archive_extension): # we create the holding archive to a given extension archive = create_arborescence_archive( self.root_path, 'archive1', 'file1', b'some content in file', extension=archive_extension) # now we create an archive holding the first created archive invalid_archive = create_archive_with_archive( self.root_path, 'invalid.tar.gz', archive) # we deposit it response = self.client.post( reverse(COL_IRI, args=[self.collection.name]), content_type='application/x-tar', data=invalid_archive['data'], CONTENT_LENGTH=invalid_archive['length'], HTTP_MD5SUM=invalid_archive['md5sum'], HTTP_SLUG='external-id', HTTP_IN_PROGRESS=False, HTTP_CONTENT_DISPOSITION='attachment; filename=%s' % ( invalid_archive['name'], )) # then self.assertEqual(response.status_code, status.HTTP_201_CREATED) response_content = parse_xml(BytesIO(response.content)) _status = response_content['deposit_status'] self.assertEqual(_status, DEPOSIT_STATUS_DEPOSITED) deposit_id = int(response_content['deposit_id']) return deposit_id def update_binary_deposit(self, deposit_id, status_partial=False): # update existing deposit with atom entry metadata response = self.client.post( reverse(EDIT_SE_IRI, args=[self.collection.name, deposit_id]), content_type='application/atom+xml;type=entry', data=self.codemeta_entry_data1, HTTP_SLUG='external-id', HTTP_IN_PROGRESS=status_partial) # then self.assertEqual(response.status_code, status.HTTP_201_CREATED) response_content = parse_xml(BytesIO(response.content)) _status = response_content['deposit_status'] if status_partial: expected_status = DEPOSIT_STATUS_PARTIAL else: expected_status = DEPOSIT_STATUS_DEPOSITED self.assertEqual(_status, expected_status) deposit_id = int(response_content['deposit_id']) return deposit_id @attr('fs') class BasicTestCase(TestCase): """Mixin intended for data setup purposes (user, collection, etc...) """ def setUp(self): """Define the test client and other test variables.""" super().setUp() # expanding diffs in tests self.maxDiff = None # basic minimum test data deposit_request_types = {} # Add deposit request types for deposit_request_type in ['archive', 'metadata']: drt = DepositRequestType(name=deposit_request_type) drt.save() deposit_request_types[deposit_request_type] = drt _name = 'hal' _provider_url = 'https://hal-test.archives-ouvertes.fr/' _domain = 'archives-ouvertes.fr/' # set collection up _collection = DepositCollection(name=_name) _collection.save() # set user/client up _client = DepositClient.objects.create_user(username=_name, password=_name, provider_url=_provider_url, domain=_domain) _client.collections = [_collection.id] _client.last_name = _name _client.save() self.collection = _collection self.user = _client self.username = _name self.userpass = _name self.deposit_request_types = deposit_request_types def tearDown(self): super().tearDown() # Clean up uploaded files in temporary directory (tests have # their own media root folder) if os.path.exists(MEDIA_ROOT): for d in os.listdir(MEDIA_ROOT): shutil.rmtree(os.path.join(MEDIA_ROOT, d)) class WithAuthTestCase(TestCase): """Mixin intended for testing the api with basic authentication. """ def setUp(self): super().setUp() _token = '%s:%s' % (self.username, self.userpass) token = base64.b64encode(_token.encode('utf-8')) authorization = 'Basic %s' % token.decode('utf-8') self.client.credentials(HTTP_AUTHORIZATION=authorization) def tearDown(self): super().tearDown() self.client.credentials() class CommonCreationRoutine(TestCase): """Mixin class to share initialization routine. cf: `class`:test_deposit_update.DepositReplaceExistingDataTest `class`:test_deposit_update.DepositUpdateDepositWithNewDataTest `class`:test_deposit_update.DepositUpdateFailuresTest `class`:test_deposit_delete.DepositDeleteTest """ def setUp(self): super().setUp() self.atom_entry_data0 = b""" some-external-id https://hal-test.archives-ouvertes.fr/some-external-id """ self.atom_entry_data1 = b""" some awesome author """ self.atom_entry_data2 = b""" Awesome Compiler urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 1785io25c695 2017-10-07T15:17:08Z some awesome author https://hal-test.archives-ouvertes.fr/id """ self.codemeta_entry_data0 = b""" Awesome Compiler https://hal-test.archives-ouvertes.fr/1785io25c695 urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 1785io25c695 2017-10-07T15:17:08Z some awesome author description key-word 1 """ self.codemeta_entry_data1 = b""" Composing a Web of Audio Applications hal hal-01243065 hal-01243065 https://hal-test.archives-ouvertes.fr/hal-01243065 test DSP programming,Web 2017-05-03T16:08:47+02:00 this is the description 1 phpstorm stable php python C GNU General Public License v3.0 only CeCILL Free Software License Agreement v1.1 HAL hal@ccsd.cnrs.fr Morane Gruenpeter """ def create_deposit_with_invalid_archive(self, external_id='some-external-id-1'): url = reverse(COL_IRI, args=[self.collection.name]) data = b'some data which is clearly not a zip file' md5sum = hashlib.md5(data).hexdigest() # when response = self.client.post( url, content_type='application/zip', # as zip data=data, # + headers CONTENT_LENGTH=len(data), # other headers needs HTTP_ prefix to be taken into account HTTP_SLUG=external_id, HTTP_CONTENT_MD5=md5sum, HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip', HTTP_CONTENT_DISPOSITION='attachment; filename=filename0') response_content = parse_xml(BytesIO(response.content)) deposit_id = int(response_content['deposit_id']) return deposit_id def create_deposit_with_status( self, status, - external_id='some-external-id-1', swh_id=None, status_detail=None): + external_id='some-external-id-1', + swh_id=None, + swh_id_context=None, + swh_anchor_id=None, + swh_anchor_id_context=None, + status_detail=None): # create an invalid deposit which we will update further down the line deposit_id = self.create_deposit_with_invalid_archive(external_id) # We cannot create some form of deposit with a given status in # test context ('rejected' for example). Update in place the # deposit with such status to permit some further tests. deposit = Deposit.objects.get(pk=deposit_id) if status == DEPOSIT_STATUS_REJECTED: deposit.status_detail = status_detail deposit.status = status if swh_id: deposit.swh_id = swh_id + if swh_id_context: + deposit.swh_id_context = swh_id_context + if swh_anchor_id: + deposit.swh_anchor_id = swh_anchor_id + if swh_anchor_id_context: + deposit.swh_anchor_id_context = swh_anchor_id_context deposit.save() - return deposit_id def create_simple_deposit_partial(self, external_id='some-external-id'): """Create a simple deposit (1 request) in `partial` state and returns its new identifier. Returns: deposit id """ response = self.client.post( reverse(COL_IRI, args=[self.collection.name]), content_type='application/atom+xml;type=entry', data=self.atom_entry_data0, HTTP_SLUG=external_id, HTTP_IN_PROGRESS='true') assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = int(response_content['deposit_id']) return deposit_id def create_deposit_partial_with_data_in_args(self, data): """Create a simple deposit (1 request) in `partial` state with the data or metadata as an argument and returns its new identifier. Args: data: atom entry Returns: deposit id """ response = self.client.post( reverse(COL_IRI, args=[self.collection.name]), content_type='application/atom+xml;type=entry', data=data, HTTP_SLUG='external-id', HTTP_IN_PROGRESS='true') assert response.status_code == status.HTTP_201_CREATED response_content = parse_xml(BytesIO(response.content)) deposit_id = int(response_content['deposit_id']) return deposit_id def _update_deposit_with_status(self, deposit_id, status_partial=False): """Add to a given deposit another archive and update its current status to `deposited` (by default). Returns: deposit id """ # when response = self.client.post( reverse(EDIT_SE_IRI, args=[self.collection.name, deposit_id]), content_type='application/atom+xml;type=entry', data=self.atom_entry_data1, HTTP_SLUG='external-id', HTTP_IN_PROGRESS=status_partial) # then assert response.status_code == status.HTTP_201_CREATED return deposit_id def create_deposit_ready(self, external_id='some-external-id'): """Create a complex deposit (2 requests) in status `deposited`. """ deposit_id = self.create_simple_deposit_partial( external_id=external_id) deposit_id = self._update_deposit_with_status(deposit_id) return deposit_id def create_deposit_partial(self, external_id='some-external-id'): """Create a complex deposit (2 requests) in status `partial`. """ deposit_id = self.create_simple_deposit_partial( external_id=external_id) deposit_id = self._update_deposit_with_status( deposit_id, status_partial=True) return deposit_id def add_metadata_to_deposit(self, deposit_id, status_partial=False): """Add metadata to deposit. """ # when response = self.client.post( reverse(EDIT_SE_IRI, args=[self.collection.name, deposit_id]), content_type='application/atom+xml;type=entry', data=self.codemeta_entry_data1, HTTP_SLUG='external-id', HTTP_IN_PROGRESS=status_partial) assert response.status_code == status.HTTP_201_CREATED # then deposit = Deposit.objects.get(pk=deposit_id) assert deposit is not None deposit_requests = DepositRequest.objects.filter(deposit=deposit) assert deposit_requests is not [] for dr in deposit_requests: if dr.type.name == 'metadata': assert deposit_requests[0].metadata is not {} return deposit_id diff --git a/swh/deposit/tests/loader/common.py b/swh/deposit/tests/loader/common.py index 389a1d47..6adb0d85 100644 --- a/swh/deposit/tests/loader/common.py +++ b/swh/deposit/tests/loader/common.py @@ -1,49 +1,52 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json from swh.deposit.client import PrivateApiDepositClient CLIENT_TEST_CONFIG = { 'url': 'http://nowhere:9000/', 'auth': {}, # no authentication in test scenario } class SWHDepositTestClient(PrivateApiDepositClient): """Deposit test client to permit overriding the default request client. """ def __init__(self, client, config): super().__init__(config=config) self.client = client def archive_get(self, archive_update_url, archive_path, log=None): r = self.client.get(archive_update_url) with open(archive_path, 'wb') as f: for chunk in r.streaming_content: f.write(chunk) return archive_path def metadata_get(self, metadata_url, log=None): r = self.client.get(metadata_url) return json.loads(r.content.decode('utf-8')) - def status_update(self, update_status_url, status, revision_id=None): + def status_update(self, update_status_url, status, + revision_id=None, directory_id=None): payload = {'status': status} if revision_id: payload['revision_id'] = revision_id + if directory_id: + payload['directory_id'] = directory_id self.client.put(update_status_url, content_type='application/json', data=json.dumps(payload)) def check(self, check_url): r = self.client.get(check_url) data = json.loads(r.content.decode('utf-8')) return data['status']