Page MenuHomeSoftware Heritage

No OneTemporary

diff --git a/PKG-INFO b/PKG-INFO
index 5e3d8fa8..2b730ef3 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,35 +1,35 @@
Metadata-Version: 2.1
Name: swh.deposit
-Version: 0.0.79
+Version: 0.0.80
Summary: Software Heritage Deposit Server
Home-page: https://forge.softwareheritage.org/source/swh-deposit/
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
License: UNKNOWN
Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
Project-URL: Funding, https://www.softwareheritage.org/donate
Project-URL: Source, https://forge.softwareheritage.org/source/swh-deposit
Description: # swh-deposit
This is [Software Heritage](https://www.softwareheritage.org)'s
[SWORD 2.0](http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html) Server
implementation, as well as a simple client to upload deposits on the server.
**S.W.O.R.D** (**S**imple **W**eb-Service **O**ffering **R**epository
**D**eposit) is an interoperability standard for digital file deposit.
This implementation will permit interaction between a client (a
repository) and a server (SWH repository) to permit deposits of
software source code archives and associated metadata.
The documentation is at ./docs/README-specification.md
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 5 - Production/Stable
Description-Content-Type: text/markdown
Provides-Extra: testing
Provides-Extra: server
diff --git a/requirements-swh-server.txt b/requirements-swh-server.txt
index c18e77d1..b12106e5 100644
--- a/requirements-swh-server.txt
+++ b/requirements-swh-server.txt
@@ -1,5 +1,4 @@
swh.core[http]
-swh.loader.tar >= 0.0.39
-swh.loader.core >= 0.0.43
+swh.loader.core >= 0.0.71
swh.scheduler >= 0.0.39
swh.model >= 0.0.26
diff --git a/requirements-test.txt b/requirements-test.txt
index 2e64c384..cda155f8 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,7 +1,8 @@
pytest
pytest-django
pytest-mock
swh.scheduler[testing]
+swh.loader.core[testing]
pytest-postgresql >= 2.1.0
requests_mock
-django-stubs < 1.3.0
+django-stubs
diff --git a/swh.deposit.egg-info/PKG-INFO b/swh.deposit.egg-info/PKG-INFO
index 5e3d8fa8..2b730ef3 100644
--- a/swh.deposit.egg-info/PKG-INFO
+++ b/swh.deposit.egg-info/PKG-INFO
@@ -1,35 +1,35 @@
Metadata-Version: 2.1
Name: swh.deposit
-Version: 0.0.79
+Version: 0.0.80
Summary: Software Heritage Deposit Server
Home-page: https://forge.softwareheritage.org/source/swh-deposit/
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
License: UNKNOWN
Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
Project-URL: Funding, https://www.softwareheritage.org/donate
Project-URL: Source, https://forge.softwareheritage.org/source/swh-deposit
Description: # swh-deposit
This is [Software Heritage](https://www.softwareheritage.org)'s
[SWORD 2.0](http://swordapp.github.io/SWORDv2-Profile/SWORDProfile.html) Server
implementation, as well as a simple client to upload deposits on the server.
**S.W.O.R.D** (**S**imple **W**eb-Service **O**ffering **R**epository
**D**eposit) is an interoperability standard for digital file deposit.
This implementation will permit interaction between a client (a
repository) and a server (SWH repository) to permit deposits of
software source code archives and associated metadata.
The documentation is at ./docs/README-specification.md
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 5 - Production/Stable
Description-Content-Type: text/markdown
Provides-Extra: testing
Provides-Extra: server
diff --git a/swh.deposit.egg-info/SOURCES.txt b/swh.deposit.egg-info/SOURCES.txt
index 27c9c6c4..3bb1677d 100644
--- a/swh.deposit.egg-info/SOURCES.txt
+++ b/swh.deposit.egg-info/SOURCES.txt
@@ -1,169 +1,168 @@
MANIFEST.in
Makefile
README.md
requirements-server.txt
requirements-swh-server.txt
requirements-swh.txt
requirements-test.txt
requirements.txt
setup.py
version.txt
swh/__init__.py
swh.deposit.egg-info/PKG-INFO
swh.deposit.egg-info/SOURCES.txt
swh.deposit.egg-info/dependency_links.txt
swh.deposit.egg-info/entry_points.txt
swh.deposit.egg-info/requires.txt
swh.deposit.egg-info/top_level.txt
swh/deposit/__init__.py
swh/deposit/apps.py
swh/deposit/auth.py
+swh/deposit/client.py
swh/deposit/config.py
swh/deposit/errors.py
swh/deposit/gunicorn_config.py
swh/deposit/manage.py
swh/deposit/models.py
swh/deposit/parsers.py
swh/deposit/py.typed
-swh/deposit/signals.py
swh/deposit/urls.py
swh/deposit/utils.py
swh/deposit/api/__init__.py
swh/deposit/api/common.py
swh/deposit/api/converters.py
swh/deposit/api/deposit.py
swh/deposit/api/deposit_content.py
swh/deposit/api/deposit_status.py
swh/deposit/api/deposit_update.py
swh/deposit/api/service_document.py
swh/deposit/api/urls.py
swh/deposit/api/private/__init__.py
swh/deposit/api/private/deposit_check.py
swh/deposit/api/private/deposit_list.py
swh/deposit/api/private/deposit_read.py
swh/deposit/api/private/deposit_update_status.py
swh/deposit/api/private/urls.py
swh/deposit/cli/__init__.py
swh/deposit/cli/admin.py
swh/deposit/cli/client.py
-swh/deposit/client/__init__.py
swh/deposit/fixtures/__init__.py
swh/deposit/fixtures/deposit_data.yaml
swh/deposit/loader/__init__.py
swh/deposit/loader/checker.py
swh/deposit/loader/tasks.py
swh/deposit/migrations/0001_initial.py
swh/deposit/migrations/0002_depositrequest_archive.py
swh/deposit/migrations/0003_temporaryarchive.py
swh/deposit/migrations/0004_delete_temporaryarchive.py
swh/deposit/migrations/0005_auto_20171019_1436.py
swh/deposit/migrations/0006_depositclient_url.py
swh/deposit/migrations/0007_auto_20171129_1609.py
swh/deposit/migrations/0008_auto_20171130_1513.py
swh/deposit/migrations/0009_deposit_parent.py
swh/deposit/migrations/0010_auto_20180110_0953.py
swh/deposit/migrations/0011_auto_20180115_1510.py
swh/deposit/migrations/0012_deposit_status_detail.py
swh/deposit/migrations/0013_depositrequest_raw_metadata.py
swh/deposit/migrations/0014_auto_20180720_1221.py
swh/deposit/migrations/0015_depositrequest_typemigration.py
swh/deposit/migrations/0016_auto_20190507_1408.py
swh/deposit/migrations/0017_auto_20190925_0906.py
swh/deposit/migrations/__init__.py
swh/deposit/settings/__init__.py
swh/deposit/settings/common.py
swh/deposit/settings/development.py
swh/deposit/settings/production.py
swh/deposit/settings/testing.py
swh/deposit/static/robots.txt
swh/deposit/static/css/bootstrap-responsive.min.css
swh/deposit/static/css/style.css
swh/deposit/static/img/arrow-up-small.png
swh/deposit/static/img/swh-logo-deposit.png
swh/deposit/static/img/swh-logo-deposit.svg
swh/deposit/static/img/icons/swh-logo-32x32.png
swh/deposit/static/img/icons/swh-logo-deposit-180x180.png
swh/deposit/static/img/icons/swh-logo-deposit-192x192.png
swh/deposit/static/img/icons/swh-logo-deposit-270x270.png
swh/deposit/templates/__init__.py
swh/deposit/templates/api.html
swh/deposit/templates/homepage.html
swh/deposit/templates/layout.html
swh/deposit/templates/deposit/__init__.py
swh/deposit/templates/deposit/content.xml
swh/deposit/templates/deposit/deposit_receipt.xml
swh/deposit/templates/deposit/error.xml
swh/deposit/templates/deposit/service_document.xml
swh/deposit/templates/deposit/status.xml
swh/deposit/templates/rest_framework/api.html
swh/deposit/tests/__init__.py
swh/deposit/tests/common.py
swh/deposit/tests/conftest.py
swh/deposit/tests/test_common.py
swh/deposit/tests/test_gunicorn_config.py
swh/deposit/tests/test_utils.py
swh/deposit/tests/api/__init__.py
swh/deposit/tests/api/conftest.py
swh/deposit/tests/api/test_converters.py
swh/deposit/tests/api/test_deposit.py
swh/deposit/tests/api/test_deposit_atom.py
swh/deposit/tests/api/test_deposit_binary.py
swh/deposit/tests/api/test_deposit_delete.py
swh/deposit/tests/api/test_deposit_list.py
swh/deposit/tests/api/test_deposit_multipart.py
swh/deposit/tests/api/test_deposit_private_check.py
swh/deposit/tests/api/test_deposit_private_read_archive.py
swh/deposit/tests/api/test_deposit_private_read_metadata.py
swh/deposit/tests/api/test_deposit_private_update_status.py
swh/deposit/tests/api/test_deposit_status.py
swh/deposit/tests/api/test_deposit_update.py
swh/deposit/tests/api/test_parser.py
swh/deposit/tests/api/test_service_document.py
swh/deposit/tests/api/data/atom/codemeta-sample.xml
swh/deposit/tests/api/data/atom/entry-data-badly-formatted.xml
swh/deposit/tests/api/data/atom/entry-data-deposit-binary.xml
swh/deposit/tests/api/data/atom/entry-data-empty-body.xml
swh/deposit/tests/api/data/atom/entry-data-ko.xml
swh/deposit/tests/api/data/atom/entry-data-minimal.xml
swh/deposit/tests/api/data/atom/entry-data-parsing-error-prone.xml
swh/deposit/tests/api/data/atom/entry-data0.xml
swh/deposit/tests/api/data/atom/entry-data1.xml
swh/deposit/tests/api/data/atom/entry-data2.xml
swh/deposit/tests/api/data/atom/entry-data3.xml
swh/deposit/tests/api/data/atom/entry-update-in-place.xml
swh/deposit/tests/api/data/atom/error-with-decimal.xml
swh/deposit/tests/api/data/atom/metadata.xml
swh/deposit/tests/api/data/atom/tei-sample.xml
swh/deposit/tests/cli/__init__.py
swh/deposit/tests/cli/test_client.py
swh/deposit/tests/cli/data/atom/codemeta-sample.xml
swh/deposit/tests/cli/data/atom/entry-data-badly-formatted.xml
swh/deposit/tests/cli/data/atom/entry-data-deposit-binary.xml
swh/deposit/tests/cli/data/atom/entry-data-empty-body.xml
swh/deposit/tests/cli/data/atom/entry-data-ko.xml
swh/deposit/tests/cli/data/atom/entry-data-minimal.xml
swh/deposit/tests/cli/data/atom/entry-data-parsing-error-prone.xml
swh/deposit/tests/cli/data/atom/entry-data0.xml
swh/deposit/tests/cli/data/atom/entry-data1.xml
swh/deposit/tests/cli/data/atom/entry-data2.xml
swh/deposit/tests/cli/data/atom/entry-data3.xml
swh/deposit/tests/cli/data/atom/entry-update-in-place.xml
swh/deposit/tests/cli/data/atom/error-with-decimal.xml
swh/deposit/tests/cli/data/atom/metadata.xml
swh/deposit/tests/cli/data/atom/tei-sample.xml
swh/deposit/tests/loader/__init__.py
swh/deposit/tests/loader/common.py
swh/deposit/tests/loader/conftest.py
swh/deposit/tests/loader/test_checker.py
swh/deposit/tests/loader/test_client.py
swh/deposit/tests/loader/test_tasks.py
swh/deposit/tests/loader/data/http_example.org/hello.json
swh/deposit/tests/loader/data/http_example.org/hello_you
swh/deposit/tests/loader/data/https_deposit.softwareheritage.org/1_private_test_1_check
swh/deposit/tests/loader/data/https_deposit.softwareheritage.org/1_private_test_2_check
swh/deposit/tests/loader/data/https_deposit.softwareheritage.org/1_private_test_999_meta
swh/deposit/tests/loader/data/https_deposit.softwareheritage.org/1_private_test_999_raw
swh/deposit/tests/loader/data/https_deposit.softwareheritage.org/1_private_test_999_update
swh/deposit/tests/loader/data/https_nowhere.org/1_private_test_1_check
swh/deposit/tests/loader/data/https_nowhere.org/1_private_test_1_metadata
swh/deposit/tests/loader/data/https_nowhere.org/1_private_test_1_raw
\ No newline at end of file
diff --git a/swh.deposit.egg-info/requires.txt b/swh.deposit.egg-info/requires.txt
index 79807d04..1c419684 100644
--- a/swh.deposit.egg-info/requires.txt
+++ b/swh.deposit.egg-info/requires.txt
@@ -1,31 +1,30 @@
vcversioner
click
xmltodict
iso8601
requests
swh.core>=0.0.75
[server]
Django<2.0
djangorestframework
swh.core[http]
-swh.loader.tar>=0.0.39
-swh.loader.core>=0.0.43
+swh.loader.core>=0.0.71
swh.scheduler>=0.0.39
swh.model>=0.0.26
[testing]
pytest
pytest-django
pytest-mock
swh.scheduler[testing]
+swh.loader.core[testing]
pytest-postgresql>=2.1.0
requests_mock
-django-stubs<1.3.0
+django-stubs
Django<2.0
djangorestframework
swh.core[http]
-swh.loader.tar>=0.0.39
-swh.loader.core>=0.0.43
+swh.loader.core>=0.0.71
swh.scheduler>=0.0.39
swh.model>=0.0.26
diff --git a/swh/deposit/api/common.py b/swh/deposit/api/common.py
index f867d5b4..dbff46e0 100644
--- a/swh/deposit/api/common.py
+++ b/swh/deposit/api/common.py
@@ -1,886 +1,901 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import hashlib
from typing import Any, Tuple
from abc import ABCMeta, abstractmethod
from django.urls import reverse
from django.http import HttpResponse
from django.shortcuts import render
from django.utils import timezone
from rest_framework import status
from rest_framework.authentication import BasicAuthentication
from rest_framework.permissions import IsAuthenticated
from rest_framework.views import APIView
from swh.model import hashutil
+from swh.scheduler.utils import create_oneshot_task_dict
from ..config import (
SWHDefaultConfig, EDIT_SE_IRI, EM_IRI, CONT_FILE_IRI,
ARCHIVE_KEY, METADATA_KEY, RAW_METADATA_KEY, STATE_IRI,
DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_PARTIAL,
+ PRIVATE_CHECK_DEPOSIT,
DEPOSIT_STATUS_LOAD_SUCCESS, ARCHIVE_TYPE, METADATA_TYPE
)
from ..errors import (
MAX_UPLOAD_SIZE_EXCEEDED, BAD_REQUEST, ERROR_CONTENT,
CHECKSUM_MISMATCH, make_error_dict, MEDIATION_NOT_ALLOWED,
make_error_response_from_dict, FORBIDDEN,
NOT_FOUND, make_error_response, METHOD_NOT_ALLOWED,
ParserError, PARSING_ERROR
)
from ..models import (
Deposit, DepositRequest, DepositCollection,
DepositClient
)
from ..parsers import parse_xml
ACCEPT_PACKAGINGS = ['http://purl.org/net/sword/package/SimpleZip']
ACCEPT_ARCHIVE_CONTENT_TYPES = ['application/zip', 'application/x-tar']
class SWHAPIView(APIView):
"""Mixin intended as a based API view to enforce the basic
authentication check
"""
authentication_classes = (BasicAuthentication, ) # type: Tuple[Any, ...]
permission_classes = (IsAuthenticated, )
class SWHBaseDeposit(SWHDefaultConfig, SWHAPIView, metaclass=ABCMeta):
"""Base deposit request class sharing multiple common behaviors.
"""
def _read_headers(self, req):
"""Read and unify the necessary headers from the request (those are
not stored in the same location or not properly formatted).
Args:
req (Request): Input request
Returns:
Dictionary with the following keys (some associated values may be
None):
- content-type
- content-length
- in-progress
- content-disposition
- packaging
- slug
- on-behalf-of
"""
meta = req._request.META
content_type = req.content_type
content_length = meta.get('CONTENT_LENGTH')
if content_length and isinstance(content_length, str):
content_length = int(content_length)
# final deposit if not provided
in_progress = meta.get('HTTP_IN_PROGRESS', False)
content_disposition = meta.get('HTTP_CONTENT_DISPOSITION')
if isinstance(in_progress, str):
in_progress = in_progress.lower() == 'true'
content_md5sum = meta.get('HTTP_CONTENT_MD5')
if content_md5sum:
content_md5sum = bytes.fromhex(content_md5sum)
packaging = meta.get('HTTP_PACKAGING')
slug = meta.get('HTTP_SLUG')
on_behalf_of = meta.get('HTTP_ON_BEHALF_OF')
metadata_relevant = meta.get('HTTP_METADATA_RELEVANT')
return {
'content-type': content_type,
'content-length': content_length,
'in-progress': in_progress,
'content-disposition': content_disposition,
'content-md5sum': content_md5sum,
'packaging': packaging,
'slug': slug,
'on-behalf-of': on_behalf_of,
'metadata-relevant': metadata_relevant,
}
def _compute_md5(self, filehandler):
"""Compute uploaded file's md5 sum.
Args:
filehandler (InMemoryUploadedFile): the file to compute the md5
hash
Returns:
the md5 checksum (str)
"""
h = hashlib.md5()
for chunk in filehandler:
h.update(chunk)
return h.digest()
- def _deposit_put(self, deposit_id=None, in_progress=False,
+ def _deposit_put(self, req, deposit_id=None, in_progress=False,
external_id=None):
"""Save/Update a deposit in db.
Args:
deposit_id (int): deposit identifier
in_progress (dict): The deposit's status
external_id (str): The external identifier to associate to
the deposit
Returns:
The Deposit instance saved or updated.
"""
if in_progress is False:
complete_date = timezone.now()
status_type = DEPOSIT_STATUS_DEPOSITED
else:
complete_date = None
status_type = DEPOSIT_STATUS_PARTIAL
if not deposit_id:
try:
# find a deposit parent (same external id, status load
# to success)
deposit_parent = Deposit.objects.filter(
external_id=external_id,
status=DEPOSIT_STATUS_LOAD_SUCCESS).order_by('-id')[0:1].get() # noqa
except Deposit.DoesNotExist:
deposit_parent = None
deposit = Deposit(collection=self._collection,
external_id=external_id,
complete_date=complete_date,
status=status_type,
client=self._client,
parent=deposit_parent)
else:
deposit = Deposit.objects.get(pk=deposit_id)
# update metadata
deposit.complete_date = complete_date
deposit.status = status_type
+ if self.config['checks']:
+ deposit.save() # needed to have a deposit id
+ args = [deposit.collection.name, deposit.id]
+ scheduler = self.scheduler
+ if (deposit.status == DEPOSIT_STATUS_DEPOSITED and
+ not deposit.check_task_id):
+ check_url = req.build_absolute_uri(
+ reverse(PRIVATE_CHECK_DEPOSIT, args=args))
+ task = create_oneshot_task_dict(
+ 'check-deposit', deposit_check_url=check_url)
+ check_task_id = scheduler.create_tasks([task])[0]['id']
+ deposit.check_task_id = check_task_id
+
deposit.save()
return deposit
def _deposit_request_put(self, deposit, deposit_request_data,
replace_metadata=False, replace_archives=False):
"""Save a deposit request with metadata attached to a deposit.
Args:
deposit (Deposit): The deposit concerned by the request
deposit_request_data (dict): The dictionary with at most 2 deposit
request types (archive, metadata) to associate to the deposit
replace_metadata (bool): Flag defining if we add or update
existing metadata to the deposit
replace_archives (bool): Flag defining if we add or update
archives to existing deposit
Returns:
None
"""
if replace_metadata:
DepositRequest.objects.filter(
deposit=deposit,
type=METADATA_TYPE).delete()
if replace_archives:
DepositRequest.objects.filter(
deposit=deposit,
type=ARCHIVE_TYPE).delete()
deposit_request = None
archive_file = deposit_request_data.get(ARCHIVE_KEY)
if archive_file:
deposit_request = DepositRequest(
type=ARCHIVE_TYPE,
deposit=deposit,
archive=archive_file)
deposit_request.save()
metadata = deposit_request_data.get(METADATA_KEY)
if metadata:
raw_metadata = deposit_request_data.get(RAW_METADATA_KEY)
deposit_request = DepositRequest(
type=METADATA_TYPE,
deposit=deposit,
metadata=metadata,
raw_metadata=raw_metadata)
deposit_request.save()
assert deposit_request is not None
def _delete_archives(self, collection_name, deposit_id):
"""Delete archives reference from the deposit id.
"""
try:
deposit = Deposit.objects.get(pk=deposit_id)
except Deposit.DoesNotExist:
return make_error_dict(
NOT_FOUND,
'The deposit %s does not exist' % deposit_id)
DepositRequest.objects.filter(
deposit=deposit,
type=ARCHIVE_TYPE).delete()
return {}
def _delete_deposit(self, collection_name, deposit_id):
"""Delete deposit reference.
Args:
collection_name (str): Client's name
deposit_id (id): The deposit to delete
Returns
Empty dict when ok.
Dict with error key to describe the failure.
"""
try:
deposit = Deposit.objects.get(pk=deposit_id)
except Deposit.DoesNotExist:
return make_error_dict(
NOT_FOUND,
'The deposit %s does not exist' % deposit_id)
if deposit.collection.name != collection_name:
summary = 'Cannot delete a deposit from another collection'
description = "Deposit %s does not belong to the collection %s" % (
deposit_id, collection_name)
return make_error_dict(
BAD_REQUEST,
summary=summary,
verbose_description=description)
DepositRequest.objects.filter(deposit=deposit).delete()
deposit.delete()
return {}
def _check_preconditions_on(self, filehandler, md5sum,
content_length=None):
"""Check preconditions on provided file are respected. That is the
length and/or the md5sum hash match the file's content.
Args:
filehandler (InMemoryUploadedFile): The file to check
md5sum (hex str): md5 hash expected from the file's content
content_length (int): the expected length if provided.
Returns:
Either none if no error or a dictionary with a key error
detailing the problem.
"""
if content_length:
if content_length > self.config['max_upload_size']:
return make_error_dict(
MAX_UPLOAD_SIZE_EXCEEDED,
'Upload size limit exceeded (max %s bytes).' %
self.config['max_upload_size'],
'Please consider sending the archive in '
'multiple steps.')
length = filehandler.size
if length != content_length:
return make_error_dict(status.HTTP_412_PRECONDITION_FAILED,
'Wrong length')
if md5sum:
_md5sum = self._compute_md5(filehandler)
if _md5sum != md5sum:
return make_error_dict(
CHECKSUM_MISMATCH,
'Wrong md5 hash',
'The checksum sent %s and the actual checksum '
'%s does not match.' % (hashutil.hash_to_hex(md5sum),
hashutil.hash_to_hex(_md5sum)))
return None
def _binary_upload(self, req, headers, collection_name, deposit_id=None,
replace_metadata=False, replace_archives=False):
"""Binary upload routine.
Other than such a request, a 415 response is returned.
Args:
req (Request): the request holding information to parse
and inject in db
headers (dict): request headers formatted
collection_name (str): the associated client
deposit_id (id): deposit identifier if provided
replace_metadata (bool): 'Update or add' request to existing
deposit. If False (default), this adds new metadata request to
existing ones. Otherwise, this will replace existing metadata.
replace_archives (bool): 'Update or add' request to existing
deposit. If False (default), this adds new archive request to
existing ones. Otherwise, this will replace existing archives.
ones.
Returns:
In the optimal case a dict with the following keys:
- deposit_id (int): Deposit identifier
- deposit_date (date): Deposit date
- archive: None (no archive is provided here)
Otherwise, a dictionary with the key error and the
associated failures, either:
- 400 (bad request) if the request is not providing an external
identifier
- 413 (request entity too large) if the length of the
archive exceeds the max size configured
- 412 (precondition failed) if the length or md5 hash provided
mismatch the reality of the archive
- 415 (unsupported media type) if a wrong media type is provided
"""
content_length = headers['content-length']
if not content_length:
return make_error_dict(
BAD_REQUEST,
'CONTENT_LENGTH header is mandatory',
'For archive deposit, the '
'CONTENT_LENGTH header must be sent.')
content_disposition = headers['content-disposition']
if not content_disposition:
return make_error_dict(
BAD_REQUEST,
'CONTENT_DISPOSITION header is mandatory',
'For archive deposit, the '
'CONTENT_DISPOSITION header must be sent.')
packaging = headers['packaging']
if packaging and packaging not in ACCEPT_PACKAGINGS:
return make_error_dict(
BAD_REQUEST,
'Only packaging %s is supported' %
ACCEPT_PACKAGINGS,
'The packaging provided %s is not supported' % packaging)
filehandler = req.FILES['file']
precondition_status_response = self._check_preconditions_on(
filehandler, headers['content-md5sum'], content_length)
if precondition_status_response:
return precondition_status_response
external_id = headers['slug']
# actual storage of data
archive_metadata = filehandler
- deposit = self._deposit_put(deposit_id=deposit_id,
+ deposit = self._deposit_put(req, deposit_id=deposit_id,
in_progress=headers['in-progress'],
external_id=external_id)
self._deposit_request_put(
deposit, {ARCHIVE_KEY: archive_metadata},
replace_metadata=replace_metadata,
replace_archives=replace_archives)
return {
'deposit_id': deposit.id,
'deposit_date': deposit.reception_date,
'status': deposit.status,
'archive': filehandler.name,
}
def _read_metadata(self, metadata_stream):
"""Given a metadata stream, reads the metadata and returns both the
parsed and the raw metadata.
"""
raw_metadata = metadata_stream.read()
metadata = parse_xml(raw_metadata)
return raw_metadata, metadata
def _multipart_upload(self, req, headers, collection_name,
deposit_id=None, replace_metadata=False,
replace_archives=False):
"""Multipart upload supported with exactly:
- 1 archive (zip)
- 1 atom entry
Other than such a request, a 415 response is returned.
Args:
req (Request): the request holding information to parse
and inject in db
headers (dict): request headers formatted
collection_name (str): the associated client
deposit_id (id): deposit identifier if provided
replace_metadata (bool): 'Update or add' request to existing
deposit. If False (default), this adds new metadata request to
existing ones. Otherwise, this will replace existing metadata.
replace_archives (bool): 'Update or add' request to existing
deposit. If False (default), this adds new archive request to
existing ones. Otherwise, this will replace existing archives.
ones.
Returns:
In the optimal case a dict with the following keys:
- deposit_id (int): Deposit identifier
- deposit_date (date): Deposit date
- archive: None (no archive is provided here)
Otherwise, a dictionary with the key error and the
associated failures, either:
- 400 (bad request) if the request is not providing an external
identifier
- 412 (precondition failed) if the potentially md5 hash provided
mismatch the reality of the archive
- 413 (request entity too large) if the length of the
archive exceeds the max size configured
- 415 (unsupported media type) if a wrong media type is provided
"""
external_id = headers['slug']
content_types_present = set()
data = {
'application/zip': None, # expected either zip
'application/x-tar': None, # or x-tar
'application/atom+xml': None,
}
for key, value in req.FILES.items():
fh = value
if fh.content_type in content_types_present:
return make_error_dict(
ERROR_CONTENT,
'Only 1 application/zip (or application/x-tar) archive '
'and 1 atom+xml entry is supported (as per sword2.0 '
'specification)',
'You provided more than 1 application/(zip|x-tar) '
'or more than 1 application/atom+xml content-disposition '
'header in the multipart deposit')
content_types_present.add(fh.content_type)
data[fh.content_type] = fh
if len(content_types_present) != 2:
return make_error_dict(
ERROR_CONTENT,
'You must provide both 1 application/zip (or '
'application/x-tar) and 1 atom+xml entry for multipart '
'deposit',
'You need to provide only 1 application/(zip|x-tar) '
'and 1 application/atom+xml content-disposition header '
'in the multipart deposit')
filehandler = data['application/zip']
if not filehandler:
filehandler = data['application/x-tar']
precondition_status_response = self._check_preconditions_on(
filehandler,
headers['content-md5sum'])
if precondition_status_response:
return precondition_status_response
try:
raw_metadata, metadata = self._read_metadata(
data['application/atom+xml'])
except ParserError:
return make_error_dict(
PARSING_ERROR,
'Malformed xml metadata',
"The xml received is malformed. "
"Please ensure your metadata file is correctly formatted.")
# actual storage of data
- deposit = self._deposit_put(deposit_id=deposit_id,
+ deposit = self._deposit_put(req, deposit_id=deposit_id,
in_progress=headers['in-progress'],
external_id=external_id)
deposit_request_data = {
ARCHIVE_KEY: filehandler,
METADATA_KEY: metadata,
RAW_METADATA_KEY: raw_metadata,
}
self._deposit_request_put(
deposit, deposit_request_data, replace_metadata, replace_archives)
return {
'deposit_id': deposit.id,
'deposit_date': deposit.reception_date,
'archive': filehandler.name,
'status': deposit.status,
}
def _atom_entry(self, req, headers, collection_name,
deposit_id=None,
replace_metadata=False,
replace_archives=False):
"""Atom entry deposit.
Args:
req (Request): the request holding information to parse
and inject in db
headers (dict): request headers formatted
collection_name (str): the associated client
deposit_id (id): deposit identifier if provided
replace_metadata (bool): 'Update or add' request to existing
deposit. If False (default), this adds new metadata request to
existing ones. Otherwise, this will replace existing metadata.
replace_archives (bool): 'Update or add' request to existing
deposit. If False (default), this adds new archive request to
existing ones. Otherwise, this will replace existing archives.
ones.
Returns:
In the optimal case a dict with the following keys:
- deposit_id: deposit id associated to the deposit
- deposit_date: date of the deposit
- archive: None (no archive is provided here)
Otherwise, a dictionary with the key error and the
associated failures, either:
- 400 (bad request) if the request is not providing an external
identifier
- 400 (bad request) if the request's body is empty
- 415 (unsupported media type) if a wrong media type is provided
"""
try:
raw_metadata, metadata = self._read_metadata(req.data)
except ParserError:
return make_error_dict(
BAD_REQUEST,
'Malformed xml metadata',
"The xml received is malformed. "
"Please ensure your metadata file is correctly formatted.")
if not metadata:
return make_error_dict(
BAD_REQUEST,
'Empty body request is not supported',
'Atom entry deposit is supposed to send for metadata. '
'If the body is empty, there is no metadata.')
external_id = metadata.get('external_identifier', headers['slug'])
- deposit = self._deposit_put(deposit_id=deposit_id,
+ deposit = self._deposit_put(req, deposit_id=deposit_id,
in_progress=headers['in-progress'],
external_id=external_id)
self._deposit_request_put(
deposit, {METADATA_KEY: metadata, RAW_METADATA_KEY: raw_metadata},
replace_metadata, replace_archives)
return {
'deposit_id': deposit.id,
'deposit_date': deposit.reception_date,
'archive': None,
'status': deposit.status,
}
def _empty_post(self, req, headers, collection_name, deposit_id):
"""Empty post to finalize an empty deposit.
Args:
req (Request): the request holding information to parse
and inject in db
headers (dict): request headers formatted
collection_name (str): the associated client
deposit_id (id): deposit identifier
Returns:
Dictionary of result with the deposit's id, the date
it was completed and no archive.
"""
deposit = Deposit.objects.get(pk=deposit_id)
deposit.complete_date = timezone.now()
deposit.status = DEPOSIT_STATUS_DEPOSITED
deposit.save()
return {
'deposit_id': deposit_id,
'deposit_date': deposit.complete_date,
'status': deposit.status,
'archive': None,
}
def _make_iris(self, req, collection_name, deposit_id):
"""Define the IRI endpoints
Args:
req (Request): The initial request
collection_name (str): client/collection's name
deposit_id (id): Deposit identifier
Returns:
Dictionary of keys with the iris' urls.
"""
args = [collection_name, deposit_id]
return {
iri: req.build_absolute_uri(reverse(iri, args=args))
for iri in [EM_IRI, EDIT_SE_IRI, CONT_FILE_IRI, STATE_IRI]
}
def additional_checks(self, req, headers, collection_name,
deposit_id=None):
"""Permit the child class to enrich additional checks.
Returns:
dict with 'error' detailing the problem.
"""
return {}
def checks(self, req, collection_name, deposit_id=None):
try:
self._collection = DepositCollection.objects.get(
name=collection_name)
except DepositCollection.DoesNotExist:
return make_error_dict(
NOT_FOUND,
'Unknown collection name %s' % collection_name)
username = req.user.username
if username: # unauthenticated request can have the username empty
try:
self._client = DepositClient.objects.get(username=username)
except DepositClient.DoesNotExist:
return make_error_dict(NOT_FOUND,
'Unknown client name %s' % username)
if self._collection.id not in self._client.collections:
return make_error_dict(
FORBIDDEN,
'Client %s cannot access collection %s' % (
username, collection_name))
if deposit_id:
try:
deposit = Deposit.objects.get(pk=deposit_id)
except Deposit.DoesNotExist:
return make_error_dict(
NOT_FOUND,
'Deposit with id %s does not exist' %
deposit_id)
checks = self.restrict_access(req, deposit)
if checks:
return checks
headers = self._read_headers(req)
if headers['on-behalf-of']:
return make_error_dict(MEDIATION_NOT_ALLOWED,
'Mediation is not supported.')
checks = self.additional_checks(req, headers,
collection_name, deposit_id)
if 'error' in checks:
return checks
return {'headers': headers}
def restrict_access(self, req, deposit=None):
    """Refuse any non-GET action on a deposit that left 'partial' status.

    Returns:
        None when access is allowed, an error dict otherwise.

    """
    if not deposit:
        return
    read_only = req.method == 'GET'
    still_partial = deposit.status == DEPOSIT_STATUS_PARTIAL
    if read_only or still_partial:
        return
    summary = "You can only act on deposit with status '%s'" % (
        DEPOSIT_STATUS_PARTIAL, )
    description = "This deposit has status '%s'" % deposit.status
    return make_error_dict(
        BAD_REQUEST, summary=summary,
        verbose_description=description)
def _basic_not_allowed_method(self, req, method):
    """Build the standard 405 response for an unsupported HTTP method."""
    message = '%s method is not supported on this endpoint' % method
    return make_error_response(
        req, METHOD_NOT_ALLOWED, message)
def get(self, req, *args, **kwargs):
    """Default GET handler: not allowed (see SWHGetDepositAPI mixin)."""
    return self._basic_not_allowed_method(req, 'GET')

def post(self, req, *args, **kwargs):
    """Default POST handler: not allowed (see SWHPostDepositAPI mixin)."""
    return self._basic_not_allowed_method(req, 'POST')

def put(self, req, *args, **kwargs):
    """Default PUT handler: not allowed (see SWHPutDepositAPI mixin)."""
    return self._basic_not_allowed_method(req, 'PUT')

def delete(self, req, *args, **kwargs):
    """Default DELETE handler: not allowed (see SWHDeleteDepositAPI mixin)."""
    return self._basic_not_allowed_method(req, 'DELETE')
class SWHGetDepositAPI(SWHBaseDeposit, metaclass=ABCMeta):
    """Mixin for class to support GET method.

    """
    def get(self, req, collection_name, deposit_id, format=None):
        """Endpoint to read a deposit's resources.

        Returns:
            200 response when no error during routine occurred
            400 if the deposit does not belong to the collection
            404 if the deposit or the collection does not exist

        """
        checks = self.checks(req, collection_name, deposit_id)
        if 'error' in checks:
            return make_error_response_from_dict(req, checks['error'])

        r = self.process_get(
            req, collection_name, deposit_id)
        # process_get may either return a ready-made response object or
        # a (status, content, content-type) triple to wrap in a response
        if isinstance(r, tuple):
            status, content, content_type = r
            return HttpResponse(content,
                                status=status,
                                content_type=content_type)

        return r

    @abstractmethod
    def process_get(self, req, collection_name, deposit_id):
        """Routine to deal with the deposit's get processing.

        Returns:
            Tuple status, stream of content, content-type

        """
        pass
class SWHPostDepositAPI(SWHBaseDeposit, metaclass=ABCMeta):
    """Mixin for class to support POST method.

    """
    def post(self, req, collection_name, deposit_id=None, format=None):
        """Endpoint to create/add resources to deposit.

        Returns:
            204 response when no error during routine occurred.
            400 if the deposit does not belong to the collection
            404 if the deposit or the collection does not exist

        """
        checks = self.checks(req, collection_name, deposit_id)
        if 'error' in checks:
            return make_error_response_from_dict(req, checks['error'])

        headers = checks['headers']
        _status, _iri_key, data = self.process_post(
            req, headers, collection_name, deposit_id)

        error = data.get('error')
        if error:
            return make_error_response_from_dict(req, error)

        data['packagings'] = ACCEPT_PACKAGINGS
        iris = self._make_iris(req, collection_name, data['deposit_id'])
        data.update(iris)
        response = render(req, 'deposit/deposit_receipt.xml',
                          context=data,
                          content_type='application/xml',
                          status=_status)
        # set the header through the public HttpResponse item api rather
        # than the private _headers attribute (removed in Django >= 3.2)
        response['Location'] = data[_iri_key]
        return response

    @abstractmethod
    def process_post(self, req, headers, collection_name, deposit_id=None):
        """Routine to deal with the deposit's processing.

        Returns
            Tuple of:
            - response status code (200, 201, etc...)
            - key iri (EM_IRI, EDIT_SE_IRI, etc...)
            - dictionary of the processing result

        """
        pass
class SWHPutDepositAPI(SWHBaseDeposit, metaclass=ABCMeta):
    """Mixin for class to support PUT method.

    """
    def put(self, req, collection_name, deposit_id, format=None):
        """Endpoint to update deposit resources.

        Returns:
            204 response when no error during routine occurred.
            400 if the deposit does not belong to the collection
            404 if the deposit or the collection does not exist

        """
        checks = self.checks(req, collection_name, deposit_id)
        failed = checks.get('error')
        if failed:
            return make_error_response_from_dict(req, failed)

        outcome = self.process_put(
            req, checks['headers'], collection_name, deposit_id)
        if outcome.get('error'):
            return make_error_response_from_dict(req, outcome['error'])

        return HttpResponse(status=status.HTTP_204_NO_CONTENT)

    @abstractmethod
    def process_put(self, req, headers, collection_name, deposit_id):
        """Routine to deal with updating a deposit in some way.

        Returns
            dictionary of the processing result

        """
        pass
class SWHDeleteDepositAPI(SWHBaseDeposit, metaclass=ABCMeta):
    """Mixin for class to support DELETE method.

    """
    def delete(self, req, collection_name, deposit_id):
        """Endpoint to delete some deposit's resources (archives, deposit).

        Returns:
            204 response when no error during routine occurred.
            400 if the deposit does not belong to the collection
            404 if the deposit or the collection does not exist

        """
        checks = self.checks(req, collection_name, deposit_id)
        if 'error' in checks:
            return make_error_response_from_dict(req, checks['error'])

        outcome = self.process_delete(req, collection_name, deposit_id)
        failure = outcome.get('error')
        if failure:
            return make_error_response_from_dict(req, failure)

        return HttpResponse(status=status.HTTP_204_NO_CONTENT)

    @abstractmethod
    def process_delete(self, req, collection_name, deposit_id):
        """Routine to delete a resource.

        This is mostly not allowed except for the
        EM_IRI (cf. .api.deposit_update.SWHUpdateArchiveDeposit)

        """
        pass
diff --git a/swh/deposit/api/private/__init__.py b/swh/deposit/api/private/__init__.py
index 4ed1154d..f0f86945 100644
--- a/swh/deposit/api/private/__init__.py
+++ b/swh/deposit/api/private/__init__.py
@@ -1,94 +1,94 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.deposit import utils
-from ...config import METADATA_TYPE
+from ...config import METADATA_TYPE, SWHDefaultConfig
from ...models import DepositRequest, Deposit
from rest_framework.permissions import AllowAny
from swh.deposit.api.common import SWHAPIView
from swh.deposit.errors import make_error_dict, NOT_FOUND
class DepositReadMixin:
"""Deposit Read mixin
"""
def _deposit_requests(self, deposit, request_type):
"""Given a deposit, yields its associated deposit_request
Args:
deposit (Deposit): Deposit to list requests for
request_type (str): 'archive' or 'metadata'
Yields:
deposit requests of type request_type associated to the deposit
"""
if isinstance(deposit, int):
deposit = Deposit.objects.get(pk=deposit)
deposit_requests = DepositRequest.objects.filter(
type=request_type,
deposit=deposit).order_by('id')
for deposit_request in deposit_requests:
yield deposit_request
def _metadata_get(self, deposit):
"""Given a deposit, aggregate all metadata requests.
Args:
deposit (Deposit): The deposit instance to extract
metadata from.
Returns:
metadata dict from the deposit.
"""
metadata = (m.metadata for m in self._deposit_requests(
deposit, request_type=METADATA_TYPE))
return utils.merge(*metadata)
-class SWHPrivateAPIView(SWHAPIView):
+class SWHPrivateAPIView(SWHDefaultConfig, SWHAPIView):
"""Mixin intended as private api (so no authentication) based API view
(for the private ones).
"""
authentication_classes = ()
permission_classes = (AllowAny, )
def checks(self, req, collection_name, deposit_id=None):
"""Override default checks implementation to allow empty collection.
"""
if deposit_id:
try:
Deposit.objects.get(pk=deposit_id)
except Deposit.DoesNotExist:
return make_error_dict(
NOT_FOUND,
'Deposit with id %s does not exist' %
deposit_id)
headers = self._read_headers(req)
checks = self.additional_checks(
req, headers, collection_name, deposit_id)
if 'error' in checks:
return checks
return {'headers': headers}
def get(self, req, collection_name=None, deposit_id=None, format=None,
*args, **kwargs):
return super().get(req, collection_name, deposit_id, format)
def put(self, req, collection_name=None, deposit_id=None, format=None,
*args, **kwargs):
return super().put(req, collection_name, deposit_id, format)
diff --git a/swh/deposit/api/private/deposit_check.py b/swh/deposit/api/private/deposit_check.py
index 8961d914..e17d4eab 100644
--- a/swh/deposit/api/private/deposit_check.py
+++ b/swh/deposit/api/private/deposit_check.py
@@ -1,209 +1,229 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
import re
import tarfile
import zipfile
+from itertools import chain
+from shutil import get_unpack_formats
+
from rest_framework import status
+from swh.scheduler.utils import create_oneshot_task_dict
from . import DepositReadMixin, SWHPrivateAPIView
from ..common import SWHGetDepositAPI
from ...config import DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_REJECTED
from ...config import ARCHIVE_TYPE
from ...models import Deposit
MANDATORY_FIELDS_MISSING = 'Mandatory fields are missing'
ALTERNATE_FIELDS_MISSING = 'Mandatory alternate fields are missing'
MANDATORY_ARCHIVE_UNREADABLE = 'At least one of its associated archives is not readable' # noqa
MANDATORY_ARCHIVE_INVALID = 'Mandatory archive is invalid (i.e contains only one archive)' # noqa
MANDATORY_ARCHIVE_UNSUPPORTED = 'Mandatory archive type is not supported'
MANDATORY_ARCHIVE_MISSING = 'Deposit without archive is rejected'
ARCHIVE_EXTENSIONS = [
'zip', 'tar', 'tar.gz', 'xz', 'tar.xz', 'bz2',
'tar.bz2', 'Z', 'tar.Z', 'tgz', '7z'
]
PATTERN_ARCHIVE_EXTENSION = re.compile(
r'.*\.(%s)$' % '|'.join(ARCHIVE_EXTENSIONS))
+def known_archive_format(filename):
+ return any(filename.endswith(t) for t in
+ chain(*(x[1] for x in get_unpack_formats())))
+
+
class SWHChecksDeposit(SWHPrivateAPIView, SWHGetDepositAPI, DepositReadMixin):
"""Dedicated class to read a deposit's raw archives content.
Only GET is supported.
"""
def _check_deposit_archives(self, deposit):
"""Given a deposit, check each deposit request of type archive.
Args:
The deposit to check archives for
Returns
tuple (status, error_detail): True, None if all archives
are ok, (False, <detailed-error>) otherwise.
"""
requests = list(self._deposit_requests(
deposit, request_type=ARCHIVE_TYPE))
if len(requests) == 0: # no associated archive is refused
return False, {
'archive': [{
'summary': MANDATORY_ARCHIVE_MISSING,
}]
}
errors = []
for archive_request in requests:
check, error_message = self._check_archive(archive_request)
if not check:
errors.append({
'summary': error_message,
'fields': [archive_request.id]
})
if not errors:
return True, None
return False, {
'archive': errors
}
def _check_archive(self, archive_request):
"""Check that a deposit associated archive is ok:
- readable
- supported archive format
- valid content: the archive does not contain a single archive file
If any of those checks are not ok, return the corresponding
failing check.
Args:
archive_path (DepositRequest): Archive to check
Returns:
(True, None) if archive is check compliant, (False,
<detail-error>) otherwise.
"""
archive_path = archive_request.archive.path
+
+ if not known_archive_format(archive_path):
+ return False, MANDATORY_ARCHIVE_UNSUPPORTED
+
try:
if zipfile.is_zipfile(archive_path):
with zipfile.ZipFile(archive_path) as f:
files = f.namelist()
elif tarfile.is_tarfile(archive_path):
with tarfile.open(archive_path) as f:
files = f.getnames()
else:
return False, MANDATORY_ARCHIVE_UNSUPPORTED
except Exception:
return False, MANDATORY_ARCHIVE_UNREADABLE
if len(files) > 1:
return True, None
element = files[0]
if PATTERN_ARCHIVE_EXTENSION.match(element):
# archive in archive!
return False, MANDATORY_ARCHIVE_INVALID
return True, None
def _check_metadata(self, metadata):
"""Check to execute on all metadata for mandatory field presence.
Args:
metadata (dict): Metadata dictionary to check for mandatory fields
Returns:
tuple (status, error_detail): True, None if metadata are
ok (False, <detailed-error>) otherwise.
"""
required_fields = {
'author': False,
}
alternate_fields = {
('name', 'title'): False, # alternate field, at least one
# of them must be present
}
for field, value in metadata.items():
for name in required_fields:
if name in field:
required_fields[name] = True
for possible_names in alternate_fields:
for possible_name in possible_names:
if possible_name in field:
alternate_fields[possible_names] = True
continue
mandatory_result = [k for k, v in required_fields.items() if not v]
optional_result = [
' or '.join(k) for k, v in alternate_fields.items() if not v]
if mandatory_result == [] and optional_result == []:
return True, None
detail = []
if mandatory_result != []:
detail.append({
'summary': MANDATORY_FIELDS_MISSING,
'fields': mandatory_result
})
if optional_result != []:
detail.append({
'summary': ALTERNATE_FIELDS_MISSING,
'fields': optional_result,
})
return False, {
'metadata': detail
}
def process_get(self, req, collection_name, deposit_id):
"""Build a unique tarball from the multiple received and stream that
content to the client.
Args:
req (Request):
collection_name (str): Collection owning the deposit
deposit_id (id): Deposit concerned by the reading
Returns:
Tuple status, stream of content, content-type
"""
deposit = Deposit.objects.get(pk=deposit_id)
metadata = self._metadata_get(deposit)
problems = {}
# will check each deposit's associated request (both of type
# archive and metadata) for errors
archives_status, error_detail = self._check_deposit_archives(deposit)
if not archives_status:
problems.update(error_detail)
metadata_status, error_detail = self._check_metadata(metadata)
if not metadata_status:
problems.update(error_detail)
deposit_status = archives_status and metadata_status
# if any problems arose, the deposit is rejected
if not deposit_status:
deposit.status = DEPOSIT_STATUS_REJECTED
deposit.status_detail = problems
response = {
'status': deposit.status,
'details': deposit.status_detail,
}
else:
deposit.status = DEPOSIT_STATUS_VERIFIED
response = {
'status': deposit.status,
}
+ if not deposit.load_task_id and self.config['checks']:
+ url = deposit.origin_url
+ task = create_oneshot_task_dict(
+ 'load-deposit', url=url, deposit_id=deposit.id,
+ retries_left=3)
+ load_task_id = self.scheduler.create_tasks([task])[0]['id']
+ deposit.load_task_id = load_task_id
deposit.save()
return status.HTTP_200_OK, json.dumps(response), 'application/json'
diff --git a/swh/deposit/api/private/deposit_read.py b/swh/deposit/api/private/deposit_read.py
index 8b834b04..2ee5da91 100644
--- a/swh/deposit/api/private/deposit_read.py
+++ b/swh/deposit/api/private/deposit_read.py
@@ -1,234 +1,227 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
import os
import shutil
import tempfile
from contextlib import contextmanager
from django.http import FileResponse
from rest_framework import status
from swh.core import tarball
from swh.model import identifiers
from swh.deposit.utils import normalize_date
-from swh.deposit import utils
from . import DepositReadMixin, SWHPrivateAPIView
from ...config import SWH_PERSON, ARCHIVE_TYPE
from ..common import SWHGetDepositAPI
from ...models import Deposit
@contextmanager
def aggregate_tarballs(extraction_dir, archive_paths):
"""Aggregate multiple tarballs into one and returns this new archive's
path.
Args:
extraction_dir (path): Path to use for the tarballs computation
archive_paths ([str]): Deposit's archive paths
Returns:
Tuple (directory to clean up, archive path (aggregated or not))
"""
- if len(archive_paths) > 1:
- # need to rebuild one archive from multiple ones
- os.makedirs(extraction_dir, 0o755, exist_ok=True)
- dir_path = tempfile.mkdtemp(prefix='swh.deposit-',
- dir=extraction_dir)
- # root folder to build an aggregated tarball
- aggregated_tarball_rootdir = os.path.join(dir_path, 'aggregate')
- os.makedirs(aggregated_tarball_rootdir, 0o755, exist_ok=True)
+ # rebuild one zip archive from (possibly) multiple ones
+ os.makedirs(extraction_dir, 0o755, exist_ok=True)
+ dir_path = tempfile.mkdtemp(prefix='swh.deposit-', dir=extraction_dir)
- # uncompress in a temporary location all archives
- for archive_path in archive_paths:
- tarball.uncompress(archive_path, aggregated_tarball_rootdir)
+ # root folder to build an aggregated tarball
+ aggregated_tarball_rootdir = os.path.join(dir_path, 'aggregate')
+ os.makedirs(aggregated_tarball_rootdir, 0o755, exist_ok=True)
- # Aggregate into one big tarball the multiple smaller ones
- temp_tarpath = tarball.compress(
- aggregated_tarball_rootdir + '.zip',
- nature='zip',
- dirpath_or_files=aggregated_tarball_rootdir)
+ # uncompress in a temporary location all archives
+ for archive_path in archive_paths:
+ tarball.uncompress(archive_path, aggregated_tarball_rootdir)
- # can already clean up temporary directory
- shutil.rmtree(aggregated_tarball_rootdir)
+ # Aggregate into one big tarball the multiple smaller ones
+ temp_tarpath = shutil.make_archive(
+ aggregated_tarball_rootdir, 'zip',
+ aggregated_tarball_rootdir)
+ # can already clean up temporary directory
+ shutil.rmtree(aggregated_tarball_rootdir)
- try:
- yield temp_tarpath
- finally:
- shutil.rmtree(dir_path)
-
- else: # only 1 archive, no need to do fancy actions (and no cleanup step)
- yield archive_paths[0]
+ try:
+ yield temp_tarpath
+ finally:
+ shutil.rmtree(dir_path)
class SWHDepositReadArchives(SWHPrivateAPIView, SWHGetDepositAPI,
DepositReadMixin):
"""Dedicated class to read a deposit's raw archives content.
Only GET is supported.
"""
ADDITIONAL_CONFIG = {
'extraction_dir': ('str', '/tmp/swh-deposit/archive/'),
}
def __init__(self):
super().__init__()
self.extraction_dir = self.config['extraction_dir']
if not os.path.exists(self.extraction_dir):
os.makedirs(self.extraction_dir)
def process_get(self, req, collection_name, deposit_id):
"""Build a unique tarball from the multiple received and stream that
content to the client.
Args:
req (Request):
collection_name (str): Collection owning the deposit
deposit_id (id): Deposit concerned by the reading
Returns:
Tuple status, stream of content, content-type
"""
archive_paths = [r.archive.path for r in self._deposit_requests(
deposit_id, request_type=ARCHIVE_TYPE)]
with aggregate_tarballs(self.extraction_dir, archive_paths) as path:
return FileResponse(open(path, 'rb'),
status=status.HTTP_200_OK,
- content_type='application/octet-stream')
+ content_type='application/zip')
class SWHDepositReadMetadata(SWHPrivateAPIView, SWHGetDepositAPI,
DepositReadMixin):
"""Class in charge of aggregating metadata on a deposit.
"""
ADDITIONAL_CONFIG = {
'provider': ('dict', {
# 'provider_name': '', # those are not set since read from the
# 'provider_url': '', # deposit's client
'provider_type': 'deposit_client',
'metadata': {}
}),
'tool': ('dict', {
'name': 'swh-deposit',
'version': '0.0.1',
'configuration': {
'sword_version': '2'
}
})
}
def __init__(self):
super().__init__()
self.provider = self.config['provider']
self.tool = self.config['tool']
def _normalize_dates(self, deposit, metadata):
"""Normalize the date to use as a tuple of author date, committer date
from the incoming metadata.
Args:
deposit (Deposit): Deposit model representation
metadata (Dict): Metadata dict representation
Returns:
Tuple of author date, committer date. Those dates are
swh normalized.
"""
commit_date = metadata.get('codemeta:datePublished')
author_date = metadata.get('codemeta:dateCreated')
if author_date and commit_date:
pass
elif commit_date:
author_date = commit_date
elif author_date:
commit_date = author_date
else:
author_date = deposit.complete_date
commit_date = deposit.complete_date
return (
normalize_date(author_date),
normalize_date(commit_date)
)
def metadata_read(self, deposit):
"""Read and aggregate multiple data on deposit into one unified data
dictionary.
Args:
deposit (Deposit): Deposit concerned by the data aggregation.
Returns:
Dictionary of data representing the deposit to inject in swh.
"""
metadata = self._metadata_get(deposit)
# Read information metadata
data = {
'origin': {
'type': 'deposit',
- 'url': utils.origin_url_from(deposit),
+ 'url': deposit.origin_url,
}
}
# revision
fullname = deposit.client.username
author_committer = SWH_PERSON
# metadata provider
self.provider['provider_name'] = deposit.client.last_name
self.provider['provider_url'] = deposit.client.provider_url
revision_type = 'tar'
revision_msg = '%s: Deposit %s in collection %s' % (
fullname, deposit.id, deposit.collection.name)
author_date, commit_date = self._normalize_dates(deposit, metadata)
data['revision'] = {
'synthetic': True,
'date': author_date,
'committer_date': commit_date,
'author': author_committer,
'committer': author_committer,
'type': revision_type,
'message': revision_msg,
'metadata': metadata,
}
if deposit.parent:
swh_persistent_id = deposit.parent.swh_id
persistent_identifier = identifiers.parse_persistent_identifier(
swh_persistent_id)
parent_revision = persistent_identifier.object_id
data['revision']['parents'] = [parent_revision]
data['branch_name'] = 'master'
data['origin_metadata'] = {
'provider': self.provider,
'tool': self.tool,
'metadata': metadata
}
return data
def process_get(self, req, collection_name, deposit_id):
deposit = Deposit.objects.get(pk=deposit_id)
data = self.metadata_read(deposit)
d = {}
if data:
d = json.dumps(data)
return status.HTTP_200_OK, d, 'application/json'
diff --git a/swh/deposit/apps.py b/swh/deposit/apps.py
index f45146bb..16dffec6 100644
--- a/swh/deposit/apps.py
+++ b/swh/deposit/apps.py
@@ -1,16 +1,10 @@
# Copyright (C) 2017 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from django.apps import AppConfig
class DepositConfig(AppConfig):
name = 'swh.deposit'
-
- def ready(self):
- super().ready()
-
- # install the signal permitting to trigger the status' check
- from .signals import post_deposit_save # noqa
diff --git a/swh/deposit/cli/client.py b/swh/deposit/cli/client.py
index 8cc0f609..fc945fa0 100644
--- a/swh/deposit/cli/client.py
+++ b/swh/deposit/cli/client.py
@@ -1,381 +1,397 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
import logging
import sys
import tempfile
import uuid
+import json
+import yaml
import click
import xmltodict
from swh.deposit.client import PublicApiDepositClient
from swh.deposit.cli import deposit
logger = logging.getLogger(__name__)
class InputError(ValueError):
"""Input script error
"""
pass
def generate_slug():
"""Generate a slug (sample purposes).
"""
return str(uuid.uuid4())
def _url(url):
"""Force the /1 api version at the end of the url (avoiding confusing
issues without it).
Args:
url (str): api url used by cli users
Returns:
Top level api url to actually request
"""
if not url.endswith('/1'):
url = '%s/1' % url
return url
def generate_metadata_file(name, external_id, authors, temp_dir):
"""Generate a temporary metadata file with the minimum required metadata
This generates a xml file in a temporary location and returns the
path to that file.
This is up to the client of that function to clean up the
temporary file.
Args:
name (str): Software's name
external_id (str): External identifier (slug) or generated one
authors (List[str]): List of author names
Returns:
Filepath to the metadata generated file
"""
path = os.path.join(temp_dir, 'metadata.xml')
# generate a metadata file with the minimum required metadata
codemetadata = {
'entry': {
'@xmlns': "http://www.w3.org/2005/Atom",
'@xmlns:codemeta': "https://doi.org/10.5063/SCHEMA/CODEMETA-2.0",
'codemeta:name': name,
'codemeta:identifier': external_id,
'codemeta:author': [{
'codemeta:name': author_name
} for author_name in authors],
},
}
logging.debug('Temporary file: %s', path)
logging.debug('Metadata dict to generate as xml: %s', codemetadata)
s = xmltodict.unparse(codemetadata, pretty=True)
logging.debug('Metadata dict as xml generated: %s', s)
with open(path, 'w') as fp:
fp.write(s)
return path
def _client(url, username, password):
"""Instantiate a client to access the deposit api server
Args:
url (str): Deposit api server
username (str): User
password (str): User's password
"""
client = PublicApiDepositClient({
'url': url,
'auth': {
'username': username,
'password': password
},
})
return client
def _collection(client):
"""Retrieve the client's collection
"""
# retrieve user's collection
sd_content = client.service_document()
if 'error' in sd_content:
raise InputError('Service document retrieval: %s' % (
sd_content['error'], ))
collection = sd_content[
'service']['workspace']['collection']['sword:name']
return collection
def client_command_parse_input(
username, password, archive, metadata,
archive_deposit, metadata_deposit,
collection, slug, partial, deposit_id, replace,
url, name, authors, temp_dir):
"""Parse the client subcommand options and make sure the combination
is acceptable*. If not, an InputError exception is raised
explaining the issue.
By acceptable, we mean:
- A multipart deposit (create or update) requires:
- an existing software archive
- an existing metadata file or author(s) and name provided in
params
- A binary deposit (create/update) requires an existing software
archive
- A metadata deposit (create/update) requires an existing metadata
file or author(s) and name provided in params
- A deposit update requires a deposit_id
This will not prevent all failure cases though. The remaining
errors are already dealt with by the underlying api client.
Raises:
InputError explaining the issue
Returns:
dict with the following keys:
'archive': the software archive to deposit
'username': username
'password': associated password
'metadata': the metadata file to deposit
'collection': the username's associated client
'slug': the slug or external id identifying the deposit to make
'partial': if the deposit is partial or not
'client': instantiated class
'url': deposit's server main entry point
'deposit_type': deposit's type (binary, multipart, metadata)
'deposit_id': optional deposit identifier
"""
if archive_deposit and metadata_deposit:
# too many flags use, remove redundant ones (-> multipart deposit)
archive_deposit = False
metadata_deposit = False
if not slug: # generate one as this is mandatory
slug = generate_slug()
if not metadata:
if name and authors:
metadata = generate_metadata_file(name, slug, authors, temp_dir)
elif not archive_deposit and not partial and not deposit_id:
# If we meet all the following conditions:
# * there is not an archive-only deposit
# * it is not part of a multipart deposit (either create/update
# or finish)
# * it misses either name or authors
raise InputError(
"Either a metadata file (--metadata) or both --author and "
"--name must be provided, unless this is an archive-only "
"deposit.")
elif name or authors:
# If we are generating metadata, then all mandatory metadata
# must be present
raise InputError(
"Either a metadata file (--metadata) or both --author and "
"--name must be provided.")
else:
# TODO: this is a multipart deposit, we might want to check that
# metadata are deposited at some point
pass
elif name or authors:
raise InputError(
"Using a metadata file (--metadata) is incompatible with "
"--author and --name, which are used to generate one.")
if metadata_deposit:
archive = None
if archive_deposit:
metadata = None
if metadata_deposit and not metadata:
raise InputError(
"Metadata deposit must be provided for metadata "
"deposit (either a filepath or --name and --author)")
if not archive and not metadata and partial:
raise InputError(
'Please provide an actionable command. See --help for more '
'information')
if replace and not deposit_id:
raise InputError(
'To update an existing deposit, you must provide its id')
client = _client(url, username, password)
if not collection:
collection = _collection(client)
return {
'archive': archive,
'username': username,
'password': password,
'metadata': metadata,
'collection': collection,
'slug': slug,
'in_progress': partial,
'client': client,
'url': url,
'deposit_id': deposit_id,
'replace': replace,
}
def _subdict(d, keys):
'return a dict from d with only given keys'
return {k: v for k, v in d.items() if k in keys}
def deposit_create(config, logger):
"""Delegate the actual deposit to the deposit client.
"""
logger.debug('Create deposit')
client = config['client']
keys = ('collection', 'archive', 'metadata', 'slug', 'in_progress')
return client.deposit_create(
**_subdict(config, keys))
def deposit_update(config, logger):
"""Delegate the actual deposit to the deposit client.
"""
logger.debug('Update deposit')
client = config['client']
keys = ('collection', 'deposit_id', 'archive', 'metadata',
'slug', 'in_progress', 'replace')
return client.deposit_update(
**_subdict(config, keys))
@deposit.command()
@click.option('--username', required=True,
help="(Mandatory) User's name")
@click.option('--password', required=True,
help="(Mandatory) User's associated password")
@click.option('--archive', type=click.Path(exists=True),
help='(Optional) Software archive to deposit')
@click.option('--metadata', type=click.Path(exists=True),
help="(Optional) Path to xml metadata file. If not provided, this will use a file named <archive>.metadata.xml") # noqa
@click.option('--archive-deposit/--no-archive-deposit', default=False,
help='(Optional) Software archive only deposit')
@click.option('--metadata-deposit/--no-metadata-deposit', default=False,
help='(Optional) Metadata only deposit')
@click.option('--collection',
help="(Optional) User's collection. If not provided, this will be fetched.") # noqa
@click.option('--slug',
help="""(Optional) External system information identifier. If not provided, it will be generated""") # noqa
@click.option('--partial/--no-partial', default=False,
help='(Optional) The deposit will be partial, other deposits will have to take place to finalize it.') # noqa
@click.option('--deposit-id', default=None,
help='(Optional) Update an existing partial deposit with its identifier') # noqa
@click.option('--replace/--no-replace', default=False,
help='(Optional) Update by replacing existing metadata to a deposit') # noqa
@click.option('--url', default='https://deposit.softwareheritage.org',
help="(Optional) Deposit server api endpoint. By default, https://deposit.softwareheritage.org/1") # noqa
@click.option('--verbose/--no-verbose', default=False,
help='Verbose mode')
@click.option('--name',
help='Software name')
@click.option('--author', multiple=True,
help='Software author(s), this can be repeated as many times'
' as there are authors')
+@click.option('-f', '--format', 'output_format', default='logging',
+ type=click.Choice(['logging', 'yaml', 'json']),
+ help='Output format results.')
@click.pass_context
def upload(ctx,
username, password, archive=None, metadata=None,
archive_deposit=False, metadata_deposit=False,
collection=None, slug=None, partial=False, deposit_id=None,
replace=False, url='https://deposit.softwareheritage.org',
- verbose=False, name=None, author=None):
+ verbose=False, name=None, author=None, output_format=None):
"""Software Heritage Public Deposit Client
Create/Update deposit through the command line.
More documentation can be found at
https://docs.softwareheritage.org/devel/swh-deposit/getting-started.html.
"""
url = _url(url)
config = {}
with tempfile.TemporaryDirectory() as temp_dir:
try:
logger.debug('Parsing cli options')
config = client_command_parse_input(
username, password, archive, metadata, archive_deposit,
metadata_deposit, collection, slug, partial, deposit_id,
replace, url, name, author, temp_dir)
except InputError as e:
logger.error('Problem during parsing options: %s', e)
sys.exit(1)
if verbose:
logger.info("Parsed configuration: %s" % (
config, ))
deposit_id = config['deposit_id']
if deposit_id:
r = deposit_update(config, logger)
else:
r = deposit_create(config, logger)
-
- logger.info(r)
+ print_result(r, output_format)
@deposit.command()
@click.option('--url', default='https://deposit.softwareheritage.org',
              help="(Optional) Deposit server api endpoint. By default, "
                   "https://deposit.softwareheritage.org/1")
@click.option('--username', required=True,
              help="(Mandatory) User's name")
@click.option('--password', required=True,
              help="(Mandatory) User's associated password")
@click.option('--deposit-id', default=None,
              required=True,
              help="Deposit identifier.")
@click.option('-f', '--format', 'output_format', default='logging',
              type=click.Choice(['logging', 'yaml', 'json']),
              help='Output format results.')
@click.pass_context
def status(ctx, url, username, password, deposit_id, output_format):
    """Deposit's status
    """
    url = _url(url)
    logger.debug('Status deposit')
    try:
        client = _client(url, username, password)
        collection = _collection(client)
    except InputError as e:
        logger.error('Problem during parsing options: %s', e)
        sys.exit(1)

    # Query the deposit server, then render according to the chosen format
    result = client.deposit_status(
        collection=collection, deposit_id=deposit_id)
    print_result(result, output_format)
+
+
def print_result(data, output_format):
    """Render ``data`` on stdout according to ``output_format``.

    'json' and 'yaml' serialize the data and echo it through click;
    any other value (the default 'logging') goes through the module
    logger instead.

    """
    serializers = {'json': json.dumps, 'yaml': yaml.dump}
    serialize = serializers.get(output_format)
    if serialize is None:
        logger.info(data)
    else:
        click.echo(serialize(data))
diff --git a/swh/deposit/client/__init__.py b/swh/deposit/client.py
similarity index 98%
rename from swh/deposit/client/__init__.py
rename to swh/deposit/client.py
index b4b7f7e2..58517c3c 100644
--- a/swh/deposit/client/__init__.py
+++ b/swh/deposit/client.py
@@ -1,580 +1,581 @@
-# Copyright (C) 2017-2018 The Software Heritage developers
+# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Module in charge of defining an swh-deposit client
"""
import hashlib
import os
import requests
import xmltodict
import logging
from abc import ABCMeta, abstractmethod
+from urllib.parse import urljoin
from swh.core.config import SWHConfig
logger = logging.getLogger(__name__)
def _parse(stream, encoding='utf-8'):
    """Parse an xml stream into a plain dict.

    Args:
        stream (bytes/text): The stream to parse
        encoding (str): The encoding to use if to decode the bytes
            stream

    Returns:
        A dict of values corresponding to the parsed xml, unwrapped
        from its 'entry' or 'sword:error' envelope when present

    """
    text = stream.decode(encoding) if isinstance(stream, bytes) else stream
    parsed = xmltodict.parse(
        text, encoding=encoding, process_namespaces=False)
    # unwrap the well-known sword envelopes, in this order
    for envelope in ('entry', 'sword:error'):
        if envelope in parsed:
            parsed = parsed[envelope]
    return dict(parsed)
def _parse_with_filter(stream, encoding='utf-8', keys=()):
    """Given a xml stream, parse the result and filter with keys.

    Args:
        stream (bytes/text): The stream to parse
        encoding (str): The encoding to use if to decode the bytes
            stream
        keys (iterable of str): Keys to filter the parsed result

    Returns:
        A dict mapping each requested key to its parsed value (None
        for keys absent from the document).

    """
    # Note: the default is an immutable tuple on purpose; the previous
    # mutable default ([]) was shared across every call of the function.
    data = _parse(stream, encoding=encoding)
    return {key: data.get(key) for key in keys}
class BaseApiDepositClient(SWHConfig):
    """Deposit client base class, holding the configuration and the http
    plumbing shared by every concrete client.

    """
    CONFIG_BASE_FILENAME = 'deposit/client'
    DEFAULT_CONFIG = {
        'url': ('str', 'http://localhost:5006'),
        'auth': ('dict', {}),  # with optional 'username'/'password' keys
    }

    def __init__(self, config=None, _client=requests):
        super().__init__()
        if config is None:
            config = super().parse_config_file()
        self.config = config
        self._client = _client
        # normalize with a single trailing slash so urljoin treats the
        # configured url as a directory
        self.base_url = self.config['url'].strip('/') + '/'
        auth = self.config['auth']
        self.auth = None if auth == {} else (
            auth['username'], auth['password'])

    def do(self, method, url, *args, **kwargs):
        """Internal method to deal with requests, possibly with basic http
           authentication.

        Args:
            method (str): supported http methods as in self._methods' keys

        Returns:
            The request's execution

        Raises:
            ValueError when the underlying http client has no such method

        """
        if not hasattr(self._client, method):
            raise ValueError('Development error, unsupported method %s' % (
                method))
        method_fn = getattr(self._client, method)
        if self.auth:
            kwargs['auth'] = self.auth
        full_url = urljoin(self.base_url, url.lstrip('/'))
        return method_fn(full_url, *args, **kwargs)
class PrivateApiDepositClient(BaseApiDepositClient):
    """Private API deposit client to:

    - read a given deposit's archive(s)
    - read a given deposit's metadata
    - update a given deposit's status

    """
    def archive_get(self, archive_update_url, archive):
        """Retrieve the archive from the deposit to a local directory.

        Args:
            archive_update_url (str): The full deposit archive(s)'s raw
                content to retrieve locally

            archive (str): the local archive's path where to store
                the raw content

        Returns:
            The archive path to the local archive to load.

        Raises:
            ValueError when the download failed.

        """
        response = self.do('get', archive_update_url, stream=True)
        if not response.ok:
            msg = 'Problem when retrieving deposit archive at %s' % (
                archive_update_url, )
            logger.error(msg)
            raise ValueError(msg)
        with open(archive, 'wb') as f:
            for chunk in response.iter_content():
                f.write(chunk)
        return archive

    def metadata_get(self, metadata_url):
        """Retrieve the metadata information on a given deposit.

        Args:
            metadata_url (str): The full deposit metadata url to retrieve
                locally

        Returns:
            The dictionary of metadata for that deposit.

        Raises:
            ValueError when the retrieval failed.

        """
        response = self.do('get', metadata_url)
        if not response.ok:
            msg = 'Problem when retrieving metadata at %s' % metadata_url
            logger.error(msg)
            raise ValueError(msg)
        return response.json()

    def status_update(self, update_status_url, status,
                      revision_id=None, directory_id=None, origin_url=None):
        """Update the deposit's status.

        Args:
            update_status_url (str): the full deposit's archive
            status (str): The status to update the deposit with
            revision_id (str/None): the revision's identifier to update to
            directory_id (str/None): the directory's identifier to update to
            origin_url (str/None): deposit's associated origin url

        """
        payload = {'status': status}
        # only include the optional fields actually provided
        for key, value in (('revision_id', revision_id),
                           ('directory_id', directory_id),
                           ('origin_url', origin_url)):
            if value:
                payload[key] = value
        self.do('put', update_status_url, json=payload)

    def check(self, check_url):
        """Check the deposit's associated data (metadata, archive(s))

        Args:
            check_url (str): the full deposit's check url

        Raises:
            ValueError when the check query failed.

        """
        response = self.do('get', check_url)
        if not response.ok:
            msg = 'Problem when checking deposit %s' % check_url
            logger.error(msg)
            raise ValueError(msg)
        return response.json()['status']
class BaseDepositClient(BaseApiDepositClient, metaclass=ABCMeta):
    """Base Deposit client to access the public api.

    """
    def __init__(self, config, error_msg=None, empty_result=None):
        """
        Args:
            config (dict): deposit client configuration
            error_msg (str): '%s'-style template used to build the error
                message (filled with the url and the exception)
            empty_result (dict): default keys to report on failure

        """
        super().__init__(config)
        self.error_msg = error_msg
        # Fix: the previous mutable default argument ({}) was shared
        # across instances, and `execute` used to mutate it in place,
        # leaking error keys into every subsequent call.
        self.empty_result = {} if empty_result is None else empty_result

    @abstractmethod
    def compute_url(self, *args, **kwargs):
        """Compute api url endpoint to query."""
        pass

    @abstractmethod
    def compute_method(self, *args, **kwargs):
        """Http method to use on the url"""
        pass

    @abstractmethod
    def parse_result_ok(self, xml_content):
        """Given an xml result from the api endpoint, parse it and returns a
           dict.

        """
        pass

    def compute_information(self, *args, **kwargs):
        """Compute some more information given the inputs (e.g http headers,
           ...)

        """
        return {}

    def parse_result_error(self, xml_content):
        """Given an error response in xml, parse it into a dict.

        Returns:
            dict with following keys:

                'error': The error message
                'detail': Some more detail about the error if any

        """
        return _parse_with_filter(xml_content, keys=[
            'summary', 'detail', 'sword:verboseDescription'])

    def do_execute(self, method, url, info):
        """Execute the http query to url using method and info information.

        By default, execute a simple query to url with the http
        method.  Override this in daughter class to improve the
        default behavior if needed.

        """
        return self.do(method, url)

    def execute(self, *args, **kwargs):
        """Main endpoint to prepare and execute the http query to the api.

        Returns:
            dict: parsed server answer on success; on failure, a copy of
            ``empty_result`` augmented with 'error' (and 'status') keys.

        """
        url = self.compute_url(*args, **kwargs)
        method = self.compute_method(*args, **kwargs)
        info = self.compute_information(*args, **kwargs)

        try:
            r = self.do_execute(method, url, info)
        except Exception as e:
            msg = self.error_msg % (url, e)
            # work on a copy so self.empty_result stays pristine
            result = dict(self.empty_result)
            result['error'] = msg
            return result
        else:
            if r.ok:
                if int(r.status_code) == 204:  # 204 returns no body
                    return {'status': r.status_code}
                else:
                    return self.parse_result_ok(r.text)
            else:
                error = self.parse_result_error(r.text)
                # copy before merging for the same reason as above
                error.update(dict(self.empty_result))
                error['status'] = r.status_code
                return error
class ServiceDocumentDepositClient(BaseDepositClient):
    """Service Document information retrieval.

    """
    def __init__(self, config):
        defaults = {'collection': None}
        super().__init__(
            config,
            error_msg='Service document failure at %s: %s',
            empty_result=defaults)

    def compute_url(self, *args, **kwargs):
        """Fixed sword service-document endpoint."""
        return '/servicedocument/'

    def compute_method(self, *args, **kwargs):
        return 'get'

    def parse_result_ok(self, xml_content):
        """Parse service document's success response.

        """
        return _parse(xml_content)
class StatusDepositClient(BaseDepositClient):
    """Status information on a deposit.

    """
    # keys extracted from the server's xml answer
    _RESULT_KEYS = [
        'deposit_id',
        'deposit_status',
        'deposit_status_detail',
        'deposit_swh_id',
        'deposit_swh_id_context',
        'deposit_swh_anchor_id',
        'deposit_swh_anchor_id_context',
        'deposit_external_id',
    ]

    def __init__(self, config):
        super().__init__(config,
                         error_msg='Status check failure at %s: %s',
                         empty_result={
                             'deposit_status': None,
                             'deposit_status_detail': None,
                             'deposit_swh_id': None,
                         })

    def compute_url(self, collection, deposit_id):
        """Status endpoint for the deposit in the collection."""
        return '/%s/%s/status/' % (collection, deposit_id)

    def compute_method(self, *args, **kwargs):
        return 'get'

    def parse_result_ok(self, xml_content):
        """Given an xml content as string, returns a deposit dict.

        """
        return _parse_with_filter(xml_content, keys=self._RESULT_KEYS)
class BaseCreateDepositClient(BaseDepositClient):
    """Deposit client base class to post new deposit.

    """
    def __init__(self, config):
        super().__init__(config,
                         error_msg='Post Deposit failure at %s: %s',
                         empty_result={
                             'deposit_id': None,
                             'deposit_status': None,
                         })

    def compute_url(self, collection, *args, **kwargs):
        """Collection endpoint used to create new deposits."""
        return '/%s/' % collection

    def compute_method(self, *args, **kwargs):
        return 'post'

    def parse_result_ok(self, xml_content):
        """Given an xml content as string, returns a deposit dict.

        """
        return _parse_with_filter(xml_content, keys=['deposit_id',
                                                     'deposit_status',
                                                     'deposit_status_detail',
                                                     'deposit_date'])

    def _compute_information(self, collection, filepath, in_progress, slug,
                             is_archive=True):
        """Given a filepath, compute necessary information on that file.

        Args:
            filepath (str): Path to a file
            is_archive (bool): is it an archive or not?

        Returns:
            dict with keys:
                'content-type': content type associated
                'md5sum': md5 sum
                'filename': filename

        """
        filename = os.path.basename(filepath)

        if is_archive:
            # Fix: use a context manager so the file handle is closed
            # deterministically instead of leaking until gc
            with open(filepath, 'rb') as f:
                md5sum = hashlib.md5(f.read()).hexdigest()
            extension = filename.split('.')[-1]
            if 'zip' in extension:
                content_type = 'application/zip'
            else:
                content_type = 'application/x-tar'
        else:
            content_type = None
            md5sum = None

        return {
            'slug': slug,
            'in_progress': in_progress,
            'content-type': content_type,
            'md5sum': md5sum,
            'filename': filename,
            'filepath': filepath,
        }

    def compute_information(self, collection, filepath, in_progress, slug,
                            is_archive=True, **kwargs):
        info = self._compute_information(collection, filepath, in_progress,
                                         slug, is_archive=is_archive)
        info['headers'] = self.compute_headers(info)
        return info

    def do_execute(self, method, url, info):
        # the file is streamed as the request body
        with open(info['filepath'], 'rb') as f:
            return self.do(method, url, data=f, headers=info['headers'])
class CreateArchiveDepositClient(BaseCreateDepositClient):
    """Post an archive (binary) deposit client."""
    def compute_headers(self, info):
        """Build the sword headers for a binary upload."""
        disposition = 'attachment; filename=%s' % (info['filename'], )
        return {
            'SLUG': info['slug'],
            'CONTENT_MD5': info['md5sum'],
            'IN-PROGRESS': str(info['in_progress']),
            'CONTENT-TYPE': info['content-type'],
            'CONTENT-DISPOSITION': disposition,
        }
class UpdateArchiveDepositClient(CreateArchiveDepositClient):
    """Update (add/replace) an archive (binary) deposit client."""
    def compute_url(self, collection, *args, deposit_id=None, **kwargs):
        """Media endpoint of an existing deposit."""
        return '/%s/%s/media/' % (collection, deposit_id)

    def compute_method(self, *args, replace=False, **kwargs):
        # 'put' replaces existing content, 'post' adds to it
        if replace:
            return 'put'
        return 'post'
class CreateMetadataDepositClient(BaseCreateDepositClient):
    """Post a metadata deposit client."""
    def compute_headers(self, info):
        """Build the sword headers for an atom metadata upload."""
        return {
            'SLUG': info['slug'],
            'IN-PROGRESS': str(info['in_progress']),
            'CONTENT-TYPE': 'application/atom+xml;type=entry',
        }
class UpdateMetadataDepositClient(CreateMetadataDepositClient):
    """Update (add/replace) a metadata deposit client."""
    def compute_url(self, collection, *args, deposit_id=None, **kwargs):
        """Metadata endpoint of an existing deposit."""
        return '/%s/%s/metadata/' % (collection, deposit_id)

    def compute_method(self, *args, replace=False, **kwargs):
        # 'put' replaces existing metadata, 'post' adds to it
        if replace:
            return 'put'
        return 'post'
class CreateMultipartDepositClient(BaseCreateDepositClient):
    """Create a multipart deposit client."""
    def _multipart_info(self, info, info_meta):
        """Build the (files, headers) pair for a multipart upload.

        The file handles are intentionally left open here: they are
        consumed later by the http client when the request is sent.
        """
        archive_part = ('file',
                        (info['filename'],
                         open(info['filepath'], 'rb'),
                         info['content-type']))
        atom_part = ('atom',
                     (info_meta['filename'],
                      open(info_meta['filepath'], 'rb'),
                      'application/atom+xml'))
        headers = {
            'SLUG': info['slug'],
            'CONTENT_MD5': info['md5sum'],
            'IN-PROGRESS': str(info['in_progress']),
        }
        return [archive_part, atom_part], headers

    def compute_information(self, collection, archive, metadata,
                            in_progress, slug, **kwargs):
        info_archive = self._compute_information(
            collection, archive, in_progress, slug)
        info_metadata = self._compute_information(
            collection, metadata, in_progress, slug, is_archive=False)
        files, headers = self._multipart_info(info_archive, info_metadata)
        return {'files': files, 'headers': headers}

    def do_execute(self, method, url, info):
        return self.do(
            method, url, files=info['files'], headers=info['headers'])
class UpdateMultipartDepositClient(CreateMultipartDepositClient):
    """Update a multipart deposit client."""
    def compute_url(self, collection, *args, deposit_id=None, **kwargs):
        """Metadata endpoint of an existing deposit."""
        return '/%s/%s/metadata/' % (collection, deposit_id)

    def compute_method(self, *args, replace=False, **kwargs):
        # 'put' replaces existing content, 'post' adds to it
        if replace:
            return 'put'
        return 'post'
class PublicApiDepositClient(BaseApiDepositClient):
    """Public api deposit client."""
    def service_document(self):
        """Retrieve service document endpoint's information."""
        return ServiceDocumentDepositClient(self.config).execute()

    def deposit_status(self, collection, deposit_id):
        """Retrieve status information on a deposit."""
        return StatusDepositClient(self.config).execute(
            collection, deposit_id)

    def deposit_create(self, collection, slug, archive=None,
                       metadata=None, in_progress=False):
        """Create a new deposit (archive, metadata, both as multipart)."""
        if archive and not metadata:
            return CreateArchiveDepositClient(self.config).execute(
                collection, archive, in_progress, slug)
        if metadata and not archive:
            return CreateMetadataDepositClient(self.config).execute(
                collection, metadata, in_progress, slug,
                is_archive=False)
        return CreateMultipartDepositClient(self.config).execute(
            collection, archive, metadata, in_progress, slug)

    def deposit_update(self, collection, deposit_id, slug, archive=None,
                       metadata=None, in_progress=False,
                       replace=False):
        """Update (add/replace) existing deposit (archive, metadata, both)."""
        # the deposit must exist and still be in the 'partial' state
        status_result = self.deposit_status(collection, deposit_id)
        if 'error' in status_result:
            return status_result
        status = status_result['deposit_status']
        if status != 'partial':
            return {
                'error': "You can only act on deposit with status 'partial'",
                'detail': "The deposit %s has status '%s'" % (
                    deposit_id, status),
                'deposit_status': status,
                'deposit_id': deposit_id,
            }
        if archive and not metadata:
            result = UpdateArchiveDepositClient(self.config).execute(
                collection, archive, in_progress, slug,
                deposit_id=deposit_id, replace=replace)
        elif metadata and not archive:
            result = UpdateMetadataDepositClient(self.config).execute(
                collection, metadata, in_progress, slug,
                deposit_id=deposit_id, replace=replace)
        else:
            result = UpdateMultipartDepositClient(self.config).execute(
                collection, archive, metadata, in_progress,
                slug, deposit_id=deposit_id, replace=replace)

        if 'error' in result:
            return result
        # report the deposit's state after the update
        return self.deposit_status(collection, deposit_id)
diff --git a/swh/deposit/config.py b/swh/deposit/config.py
index b7a1b6a0..912fb602 100644
--- a/swh/deposit/config.py
+++ b/swh/deposit/config.py
@@ -1,111 +1,111 @@
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
import logging
from typing import Any, Dict, Tuple
from swh.core.config import SWHConfig
from swh.scheduler import get_scheduler
# IRIs (Internationalized Resource identifier) sword 2.0 specified
EDIT_SE_IRI = 'edit_se_iri'
EM_IRI = 'em_iri'
CONT_FILE_IRI = 'cont_file_iri'
SD_IRI = 'servicedocument'
COL_IRI = 'upload'
STATE_IRI = 'state_iri'
PRIVATE_GET_RAW_CONTENT = 'private-download'
PRIVATE_CHECK_DEPOSIT = 'check-deposit'
PRIVATE_PUT_DEPOSIT = 'private-update'
PRIVATE_GET_DEPOSIT_METADATA = 'private-read'
PRIVATE_LIST_DEPOSITS = 'private-deposit-list'
ARCHIVE_KEY = 'archive'
METADATA_KEY = 'metadata'
RAW_METADATA_KEY = 'raw-metadata'
ARCHIVE_TYPE = 'archive'
METADATA_TYPE = 'metadata'
AUTHORIZED_PLATFORMS = ['development', 'production', 'testing']
DEPOSIT_STATUS_REJECTED = 'rejected'
DEPOSIT_STATUS_PARTIAL = 'partial'
DEPOSIT_STATUS_DEPOSITED = 'deposited'
DEPOSIT_STATUS_VERIFIED = 'verified'
DEPOSIT_STATUS_LOAD_SUCCESS = 'done'
DEPOSIT_STATUS_LOAD_FAILURE = 'failed'
# Revision author for deposit
SWH_PERSON = {
'name': 'Software Heritage',
'fullname': 'Software Heritage',
'email': 'robot@softwareheritage.org'
}
def setup_django_for(platform=None, config_file=None):
    """Setup function for command line tools (swh.deposit.create_user) to
       initialize the needed db access.

    Note:
        Do not import any django related module prior to this function
        call. Otherwise, this will raise an
        django.core.exceptions.ImproperlyConfigured error message.

    Args:
        platform (str): the platform the scheduling is running
        config_file (str): Extra configuration file (typically for the
                           production platform)

    Raises:
        ValueError in case of wrong platform inputs.

    """
    if platform is not None:
        if platform not in AUTHORIZED_PLATFORMS:
            raise ValueError('Platform should be one of %s' %
                             AUTHORIZED_PLATFORMS)
        # keep any value already forced through the environment
        os.environ.setdefault(
            'DJANGO_SETTINGS_MODULE', 'swh.deposit.settings.%s' % platform)

    if config_file:
        os.environ.setdefault('SWH_CONFIG_FILENAME', config_file)

    # deferred import: django must only be loaded once the settings
    # module environment variable is in place
    import django
    django.setup()
class SWHDefaultConfig(SWHConfig):
    """Mixin intended to enrich views with SWH configuration.

    """
    CONFIG_BASE_FILENAME = 'deposit/server'
    DEFAULT_CONFIG = {
        'max_upload_size': ('int', 209715200),
        'checks': ('bool', True),
        'scheduler': ('dict', {
            'cls': 'remote',
            'args': {
                'url': 'http://localhost:5008/'
            }
        })
    }

    ADDITIONAL_CONFIG = {}  # type: Dict[str, Tuple[str, Any]]

    def __init__(self, **config):
        super().__init__()
        parsed = self.parse_config_file(
            additional_configs=[self.ADDITIONAL_CONFIG])
        # explicit keyword arguments override the file configuration
        self.config = {**parsed, **config}
        self.log = logging.getLogger('swh.deposit')
        if self.config.get('scheduler'):
            self.scheduler = get_scheduler(**self.config['scheduler'])
diff --git a/swh/deposit/models.py b/swh/deposit/models.py
index e7c5440d..1a7a78ea 100644
--- a/swh/deposit/models.py
+++ b/swh/deposit/models.py
@@ -1,223 +1,228 @@
# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
# Generated from:
# cd swh_deposit && \
# python3 -m manage inspectdb
from django.contrib.postgres.fields import JSONField, ArrayField
from django.contrib.auth.models import User, UserManager
from django.db import models
from django.utils.timezone import now
from .config import (
DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_PARTIAL,
DEPOSIT_STATUS_LOAD_SUCCESS, DEPOSIT_STATUS_LOAD_FAILURE,
DEPOSIT_STATUS_REJECTED, ARCHIVE_TYPE, METADATA_TYPE
)
class Dbversion(models.Model):
    """Db version

    """
    version = models.IntegerField(primary_key=True)
    release = models.DateTimeField(default=now, null=True)
    description = models.TextField(blank=True, null=True)

    class Meta:
        db_table = 'dbversion'

    def __str__(self):
        info = {
            'version': self.version,
            'release': self.release,
            'description': self.description
        }
        return str(info)
"""Possible status"""
# (value, label) pairs; the stored value and the human label are the same
DEPOSIT_STATUS = [
    (_status, _status) for _status in (
        DEPOSIT_STATUS_PARTIAL,
        'expired',
        DEPOSIT_STATUS_DEPOSITED,
        DEPOSIT_STATUS_VERIFIED,
        DEPOSIT_STATUS_REJECTED,
        'loading',
        DEPOSIT_STATUS_LOAD_SUCCESS,
        DEPOSIT_STATUS_LOAD_FAILURE,
    )
]


"""Possible status and the detailed meaning."""
DEPOSIT_STATUS_DETAIL = {
    DEPOSIT_STATUS_PARTIAL: 'Deposit is partially received. To finalize it, '
                            'In-Progress header should be false',
    'expired': 'Deposit has been there too long and is now '
               'deemed ready to be garbage collected',
    DEPOSIT_STATUS_DEPOSITED: 'Deposit is ready for additional checks '
                              '(tarball ok, metadata, etc...)',
    DEPOSIT_STATUS_VERIFIED: 'Deposit is fully received, checked, and '
                             'ready for loading',
    DEPOSIT_STATUS_REJECTED: 'Deposit failed the checks',
    'loading': "Loading is ongoing on swh's side",
    DEPOSIT_STATUS_LOAD_SUCCESS: 'The deposit has been successfully '
                                 'loaded into the Software Heritage archive',
    DEPOSIT_STATUS_LOAD_FAILURE: 'The deposit loading into the '
                                 'Software Heritage archive failed',
}
class DepositClient(User):
    """Deposit client

    """
    collections = ArrayField(models.IntegerField(), null=True)
    objects = UserManager()  # type: ignore
    # this typing hint is due to a mypy/django-stubs limitation,
    # see https://github.com/typeddjango/django-stubs/issues/174
    provider_url = models.TextField(null=False)
    domain = models.TextField(null=False)

    class Meta:
        db_table = 'deposit_client'

    def __str__(self):
        info = {
            'id': self.id,
            'collections': self.collections,
            'username': super().username,
            'domain': self.domain,
            'provider_url': self.provider_url,
        }
        return str(info)
class Deposit(models.Model):
    """Deposit reception table

    """
    id = models.BigAutoField(primary_key=True)

    # First deposit reception date
    reception_date = models.DateTimeField(auto_now_add=True)
    # Date when the deposit is deemed complete and ready for loading
    complete_date = models.DateTimeField(null=True)
    # collection concerned by the deposit
    collection = models.ForeignKey(
        'DepositCollection', models.DO_NOTHING)
    # Deposit's external identifier
    external_id = models.TextField()
    # Deposit client
    client = models.ForeignKey('DepositClient', models.DO_NOTHING)
    # SWH's loading result identifier
    swh_id = models.TextField(blank=True, null=True)
    swh_id_context = models.TextField(blank=True, null=True)
    swh_anchor_id = models.TextField(blank=True, null=True)
    swh_anchor_id_context = models.TextField(blank=True, null=True)
    # Deposit's status regarding loading
    status = models.TextField(
        choices=DEPOSIT_STATUS,
        default=DEPOSIT_STATUS_PARTIAL)
    status_detail = JSONField(null=True)
    # deposit can have one parent
    parent = models.ForeignKey('self', on_delete=models.PROTECT, null=True)
    check_task_id = models.TextField(
        blank=True, null=True,
        verbose_name="Scheduler's associated checking task id"
    )
    load_task_id = models.TextField(
        blank=True, null=True,
        verbose_name="Scheduler's associated loading task id"
    )

    class Meta:
        db_table = 'deposit'

    def __str__(self):
        d = {
            'id': self.id,
            'reception_date': self.reception_date,
            'collection': self.collection.name,
            'external_id': self.external_id,
            'client': self.client.username,
            'status': self.status,
        }
        # Fix: the previous `in (DEPOSIT_STATUS_REJECTED)` was a
        # substring test on the status string ((x) is not a tuple);
        # use a plain equality check instead.
        if self.status == DEPOSIT_STATUS_REJECTED:
            d['status_detail'] = self.status_detail
        return str(d)

    @property
    def origin_url(self):
        # origin url is the client's provider url followed by the
        # deposit's external identifier
        return '%s/%s' % (self.client.provider_url.rstrip('/'),
                          self.external_id)
+
def client_directory_path(instance, filename):
    """Callable to upload archive in MEDIA_ROOT/user_<id>/<filename>

    Args:
        instance (DepositRequest): DepositRequest concerned by the upload
        filename (str): Filename of the uploaded file

    Returns:
        A path to be prefixed by the MEDIA_ROOT to access physically
        to the file uploaded.

    """
    client_id = instance.deposit.client.id
    return 'client_{0}/{1}'.format(client_id, filename)
REQUEST_TYPES = [(ARCHIVE_TYPE, ARCHIVE_TYPE),
(METADATA_TYPE, METADATA_TYPE)]
class DepositRequest(models.Model):
    """Deposit request associated to one deposit.

    """
    id = models.BigAutoField(primary_key=True)
    # Deposit concerned by the request
    deposit = models.ForeignKey(Deposit, models.DO_NOTHING)
    date = models.DateTimeField(auto_now_add=True)
    # Deposit request information on the data to inject
    # this can be null when type is 'archive'
    metadata = JSONField(null=True)
    raw_metadata = models.TextField(null=True)
    # this can be null when type is 'metadata'
    archive = models.FileField(null=True, upload_to=client_directory_path)
    type = models.CharField(max_length=8,
                            choices=REQUEST_TYPES,
                            null=True)

    class Meta:
        db_table = 'deposit_request'

    def __str__(self):
        from json import dumps
        meta = dumps(self.metadata) if self.metadata else None
        archive_name = self.archive.name if self.archive else None
        return str({
            'id': self.id,
            'deposit': self.deposit,
            'metadata': meta,
            'archive': archive_name
        })
class DepositCollection(models.Model):
    """Collection a deposit is filed into (e.g HAL, arXiv, etc...)."""
    id = models.BigAutoField(primary_key=True)
    # Human readable name for the collection type e.g HAL, arXiv, etc...
    name = models.TextField()

    class Meta:
        db_table = 'deposit_collection'

    def __str__(self):
        return str({'id': self.id, 'name': self.name})
diff --git a/swh/deposit/signals.py b/swh/deposit/signals.py
deleted file mode 100644
index fa43da00..00000000
--- a/swh/deposit/signals.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# Copyright (C) 2017-2019 The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-"""Module in charge of defining some uncoupled actions on deposit.
-
- Typically, checking that the archives deposited are ok are not
- directly testing in the request/answer to avoid too long
- computations.
-
- So this is done in the deposit_on_status_ready_for_check callback.
-
-"""
-
-from swh.deposit import utils
-
-from django.db.models.signals import post_save
-from django.dispatch import receiver
-
-from .models import Deposit
-from .config import SWHDefaultConfig, DEPOSIT_STATUS_VERIFIED
-from .config import DEPOSIT_STATUS_DEPOSITED
-
-
-def schedule_task(scheduler, task):
- """Schedule the task and return its identifier
-
- Args:
- task (dict): Task to schedule
-
- Returns:
- The task identifier
-
- """
- tasks = scheduler.create_tasks([task])
- if tasks:
- created_task = tasks[0]
- return created_task['id']
-
-
-@receiver(post_save, sender=Deposit)
-def post_deposit_save(sender, instance, created, raw, using,
- update_fields, **kwargs):
- """When a deposit is saved, check for the deposit's status change and
- schedule actions accordingly.
-
- When the status passes to deposited, schedule checks.
- When the status pass to ready, schedule loading. Otherwise, do
- nothing.
-
- Args:
- sender (Deposit): The model class
- instance (Deposit): The actual instance being saved
- created (bool): True if a new record was created
- raw (bool): True if the model is saved exactly as presented
- (i.e. when loading a fixture). One should not
- query/modify other records in the database as the
- database might not be in a consistent state yet
- using: The database alias being used
- update_fields: The set of fields to update as passed to
- Model.save(), or None if update_fields wasn’t
- passed to save()
-
- """
- default_config = SWHDefaultConfig()
- if not default_config.config['checks']:
- return
-
- if instance.status not in {DEPOSIT_STATUS_DEPOSITED,
- DEPOSIT_STATUS_VERIFIED}:
- return
-
- from django.urls import reverse
- from swh.scheduler.utils import create_oneshot_task_dict
-
- args = [instance.collection.name, instance.id]
-
- # In the following, we are checking the instance.*task_id are not already
- # populated because the `instance.save()` call will also trigger a call to
- # that very function.
-
- if (instance.status == DEPOSIT_STATUS_DEPOSITED and
- not instance.check_task_id):
- # schedule deposit's checks
- from swh.deposit.config import PRIVATE_CHECK_DEPOSIT
- check_url = reverse(PRIVATE_CHECK_DEPOSIT, args=args)
- task = create_oneshot_task_dict('check-deposit',
- deposit_check_url=check_url)
- check_task_id = schedule_task(default_config.scheduler, task)
- instance.check_task_id = check_task_id
- instance.save()
-
- elif (instance.status == DEPOSIT_STATUS_VERIFIED and
- not instance.load_task_id):
-
- url = utils.origin_url_from(instance)
- task = create_oneshot_task_dict(
- 'load-deposit',
- url=url, deposit_id=instance.id)
-
- load_task_id = schedule_task(default_config.scheduler, task)
- instance.load_task_id = load_task_id
- instance.save()
diff --git a/swh/deposit/tests/__init__.py b/swh/deposit/tests/__init__.py
index 2b34b26e..e69de29b 100644
--- a/swh/deposit/tests/__init__.py
+++ b/swh/deposit/tests/__init__.py
@@ -1,42 +0,0 @@
-# Copyright (C) 2017-2019 The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-from swh.deposit.config import setup_django_for
-from swh.deposit.config import SWHDefaultConfig # noqa
-
-
-TEST_CONFIG = {
- 'max_upload_size': 500,
- 'extraction_dir': '/tmp/swh-deposit/test/extraction-dir',
- 'checks': False,
- 'provider': {
- 'provider_name': '',
- 'provider_type': 'deposit_client',
- 'provider_url': '',
- 'metadata': {
- }
- },
- 'tool': {
- 'name': 'swh-deposit',
- 'version': '0.0.1',
- 'configuration': {
- 'sword_version': '2'
- }
- }
-}
-
-
-def parse_deposit_config_file(base_filename=None, config_filename=None,
- additional_configs=None, global_config=True):
- return TEST_CONFIG
-
-
-# monkey patch classes method permits to override, for tests purposes,
-# the default configuration without side-effect, i.e do not load the
-# configuration from disk
-SWHDefaultConfig.parse_config_file = parse_deposit_config_file # type: ignore
-
-
-setup_django_for('testing')
diff --git a/swh/deposit/tests/api/test_deposit_binary.py b/swh/deposit/tests/api/test_deposit_binary.py
index 7d3eac5d..8f1cc763 100644
--- a/swh/deposit/tests/api/test_deposit_binary.py
+++ b/swh/deposit/tests/api/test_deposit_binary.py
@@ -1,544 +1,543 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import pytest
from django.core.files.uploadedfile import InMemoryUploadedFile
from django.urls import reverse
from io import BytesIO
from rest_framework import status
-from swh.deposit.tests import TEST_CONFIG
from swh.deposit.config import (
COL_IRI, EM_IRI, DEPOSIT_STATUS_DEPOSITED,
)
from swh.deposit.models import Deposit, DepositRequest
from swh.deposit.parsers import parse_xml
from swh.deposit.tests.common import create_arborescence_archive, check_archive
def test_post_deposit_binary_no_slug(
        authenticated_client, deposit_collection, sample_archive):
    """Posting a binary deposit without slug header should return 400

    """
    url = reverse(COL_IRI, args=[deposit_collection.name])

    # no HTTP_SLUG header on purpose
    extra_headers = {
        'CONTENT_LENGTH': sample_archive['length'],
        'HTTP_CONTENT_MD5': sample_archive['md5sum'],
        'HTTP_PACKAGING': 'http://purl.org/net/sword/package/SimpleZip',
        'HTTP_IN_PROGRESS': 'false',
        'HTTP_CONTENT_DISPOSITION': 'attachment; filename=filename0',
    }
    response = authenticated_client.post(
        url,
        content_type='application/zip',  # as zip
        data=sample_archive['data'],
        **extra_headers)

    assert b'Missing SLUG header' in response.content
    assert response.status_code == status.HTTP_400_BAD_REQUEST
def test_post_deposit_binary_support(
        authenticated_client, deposit_collection, sample_archive):
    """Binary upload with content-type not in [zip,x-tar] should return 415
    """
    # given
    url = reverse(COL_IRI, args=[deposit_collection.name])
    external_id = 'some-external-id-1'
    # when: content-type is neither application/zip nor application/x-tar
    response = authenticated_client.post(
        url,
        content_type='application/octet-stream',
        data=sample_archive['data'],
        # + headers
        CONTENT_LENGTH=sample_archive['length'],
        HTTP_SLUG=external_id,
        HTTP_CONTENT_MD5=sample_archive['md5sum'],
        HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip',
        HTTP_IN_PROGRESS='false',
        HTTP_CONTENT_DISPOSITION='attachment; filename=filename0')
    # then: rejected, and no deposit row was created
    assert response.status_code == status.HTTP_415_UNSUPPORTED_MEDIA_TYPE
    with pytest.raises(Deposit.DoesNotExist):
        Deposit.objects.get(external_id=external_id)
def test_post_deposit_binary_upload_ok(
        authenticated_client, deposit_collection, sample_archive):
    """Binary upload with correct headers should return 201 with receipt
    """
    # given
    url = reverse(COL_IRI, args=[deposit_collection.name])
    external_id = 'some-external-id-1'
    # when
    response = authenticated_client.post(
        url,
        content_type='application/zip', # as zip
        data=sample_archive['data'],
        # + headers
        CONTENT_LENGTH=sample_archive['length'],
        # other headers needs HTTP_ prefix to be taken into account
        HTTP_SLUG=external_id,
        HTTP_CONTENT_MD5=sample_archive['md5sum'],
        HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip',
        HTTP_IN_PROGRESS='false',
        HTTP_CONTENT_DISPOSITION='attachment; filename=%s' % (
            sample_archive['name'], ))
    # then: a deposit in 'deposited' state exists with one archive request
    response_content = parse_xml(BytesIO(response.content))
    assert response.status_code == status.HTTP_201_CREATED
    deposit_id = response_content['deposit_id']
    deposit = Deposit.objects.get(pk=deposit_id)
    assert deposit.status == DEPOSIT_STATUS_DEPOSITED
    assert deposit.external_id == external_id
    assert deposit.collection == deposit_collection
    assert deposit.swh_id is None
    deposit_request = DepositRequest.objects.get(deposit=deposit)
    check_archive(sample_archive['name'], deposit_request.archive.name)
    # a pure binary deposit carries no metadata
    assert deposit_request.metadata is None
    assert deposit_request.raw_metadata is None
    response_content = parse_xml(BytesIO(response.content))
    assert response_content['deposit_archive'] == sample_archive['name']
    assert int(response_content['deposit_id']) == deposit.id
    assert response_content['deposit_status'] == deposit.status
    # the receipt's Location header points at the edit-se IRI of the deposit
    edit_se_iri = reverse('edit_se_iri',
                          args=[deposit_collection.name, deposit.id])
    assert response._headers['location'] == (
        'Location', 'http://testserver' + edit_se_iri)
def test_post_deposit_binary_failure_unsupported_packaging_header(
        authenticated_client, deposit_collection, sample_archive):
    """Bin deposit without supported content_disposition header returns 400
    """
    # given
    url = reverse(COL_IRI, args=[deposit_collection.name])
    external_id = 'some-external-id'
    # when: HTTP_PACKAGING is not a supported SWORD packaging value
    response = authenticated_client.post(
        url,
        content_type='application/zip',
        data=sample_archive['data'],
        # + headers
        CONTENT_LENGTH=sample_archive['length'],
        HTTP_SLUG=external_id,
        HTTP_CONTENT_MD5=sample_archive['md5sum'],
        HTTP_PACKAGING='something-unsupported',
        HTTP_CONTENT_DISPOSITION='attachment; filename=filename0')
    # then: rejected, and no deposit row was created
    assert response.status_code == status.HTTP_400_BAD_REQUEST
    with pytest.raises(Deposit.DoesNotExist):
        Deposit.objects.get(external_id=external_id)
def test_post_deposit_binary_upload_no_content_disposition_header(
        authenticated_client, deposit_collection, sample_archive):
    """Binary upload without content_disposition header should return 400
    """
    # given
    url = reverse(COL_IRI, args=[deposit_collection.name])
    external_id = 'some-external-id'
    # when: HTTP_CONTENT_DISPOSITION is deliberately omitted
    response = authenticated_client.post(
        url,
        content_type='application/zip',
        data=sample_archive['data'],
        # + headers
        CONTENT_LENGTH=sample_archive['length'],
        HTTP_SLUG=external_id,
        HTTP_CONTENT_MD5=sample_archive['md5sum'],
        HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip',
        HTTP_IN_PROGRESS='false')
    # then: rejected, and no deposit row was created
    assert response.status_code == status.HTTP_400_BAD_REQUEST
    with pytest.raises(Deposit.DoesNotExist):
        Deposit.objects.get(external_id=external_id)
def test_post_deposit_mediation_not_supported(
        authenticated_client, deposit_collection, sample_archive):
    """Binary upload with mediation should return a 412 response
    """
    # given
    url = reverse(COL_IRI, args=[deposit_collection.name])
    external_id = 'some-external-id-1'
    # when: On-Behalf-Of requests SWORD mediation, which is not supported
    response = authenticated_client.post(
        url,
        content_type='application/zip',
        data=sample_archive['data'],
        # + headers
        CONTENT_LENGTH=sample_archive['length'],
        HTTP_SLUG=external_id,
        HTTP_CONTENT_MD5=sample_archive['md5sum'],
        HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip',
        HTTP_IN_PROGRESS='false',
        HTTP_ON_BEHALF_OF='someone',
        HTTP_CONTENT_DISPOSITION='attachment; filename=filename0')
    # then: precondition failed, and no deposit row was created
    assert response.status_code == status.HTTP_412_PRECONDITION_FAILED
    with pytest.raises(Deposit.DoesNotExist):
        Deposit.objects.get(external_id=external_id)
def test_post_deposit_binary_upload_fail_if_upload_size_limit_exceeded(
authenticated_client, deposit_collection, sample_archive, tmp_path):
"""Binary upload must not exceed the limit set up...
"""
tmp_path = str(tmp_path)
url = reverse(COL_IRI, args=[deposit_collection.name])
archive = create_arborescence_archive(
tmp_path, 'archive2', 'file2', b'some content in file',
- up_to_size=TEST_CONFIG['max_upload_size'])
+ up_to_size=500)
external_id = 'some-external-id'
# when
response = authenticated_client.post(
url,
content_type='application/zip',
data=archive['data'],
# + headers
CONTENT_LENGTH=archive['length'],
HTTP_SLUG=external_id,
HTTP_CONTENT_MD5=archive['md5sum'],
HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip',
HTTP_IN_PROGRESS='false',
HTTP_CONTENT_DISPOSITION='attachment; filename=filename0')
# then
assert response.status_code == status.HTTP_413_REQUEST_ENTITY_TOO_LARGE
assert b'Upload size limit exceeded' in response.content
with pytest.raises(Deposit.DoesNotExist):
Deposit.objects.get(external_id=external_id)
def test_post_deposit_2_post_2_different_deposits(
        authenticated_client, deposit_collection, sample_archive):
    """2 posting deposits should return 2 different 201 with receipt

    Posting the same archive twice under two different external ids
    (slugs) must create two distinct deposits in the collection.
    """
    url = reverse(COL_IRI, args=[deposit_collection.name])
    # when: first deposit, as zip
    response = authenticated_client.post(
        url,
        content_type='application/zip', # as zip
        data=sample_archive['data'],
        # + headers
        CONTENT_LENGTH=sample_archive['length'],
        HTTP_SLUG='some-external-id-1',
        HTTP_CONTENT_MD5=sample_archive['md5sum'],
        HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip',
        HTTP_IN_PROGRESS='false',
        HTTP_CONTENT_DISPOSITION='attachment; filename=filename0')
    # then
    assert response.status_code == status.HTTP_201_CREATED
    response_content = parse_xml(BytesIO(response.content))
    deposit_id = response_content['deposit_id']
    deposit = Deposit.objects.get(pk=deposit_id)
    deposits = Deposit.objects.all()
    assert len(deposits) == 1
    assert deposits[0] == deposit
    # second post, same archive but a different slug, as tarball
    response = authenticated_client.post(
        url,
        content_type='application/x-tar', # as tarball
        data=sample_archive['data'],
        # + headers
        CONTENT_LENGTH=sample_archive['length'],
        HTTP_SLUG='another-external-id',
        HTTP_CONTENT_MD5=sample_archive['md5sum'],
        HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip',
        HTTP_IN_PROGRESS='false',
        HTTP_CONTENT_DISPOSITION='attachment; filename=filename1')
    assert response.status_code == status.HTTP_201_CREATED
    response_content = parse_xml(BytesIO(response.content))
    deposit_id2 = response_content['deposit_id']
    deposit2 = Deposit.objects.get(pk=deposit_id2)
    assert deposit != deposit2
    deposits = Deposit.objects.all().order_by('id')
    assert len(deposits) == 2
    # bugfix: the original `assert list(deposits), [deposit == deposit2]`
    # asserted a non-empty list (always true) with a bogus "message" list,
    # so it could never fail; actually compare against the expected order.
    assert list(deposits) == [deposit, deposit2]
def test_post_deposit_binary_and_post_to_add_another_archive(
        authenticated_client, deposit_collection, sample_archive, tmp_path):
    """Updating a deposit should return a 201 with receipt
    """
    tmp_path = str(tmp_path)
    url = reverse(COL_IRI, args=[deposit_collection.name])
    external_id = 'some-external-id-1'
    # when: first upload with In-Progress=true keeps the deposit open
    response = authenticated_client.post(
        url,
        content_type='application/zip', # as zip
        data=sample_archive['data'],
        # + headers
        CONTENT_LENGTH=sample_archive['length'],
        HTTP_SLUG=external_id,
        HTTP_CONTENT_MD5=sample_archive['md5sum'],
        HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip',
        HTTP_IN_PROGRESS='true',
        HTTP_CONTENT_DISPOSITION='attachment; filename=%s' % (
            sample_archive['name'], ))
    # then: the deposit is created but still 'partial'
    assert response.status_code == status.HTTP_201_CREATED
    response_content = parse_xml(BytesIO(response.content))
    deposit_id = response_content['deposit_id']
    deposit = Deposit.objects.get(pk=deposit_id)
    assert deposit.status == 'partial'
    assert deposit.external_id == external_id
    assert deposit.collection == deposit_collection
    assert deposit.swh_id is None
    deposit_request = DepositRequest.objects.get(deposit=deposit)
    assert deposit_request.deposit == deposit
    assert deposit_request.type == 'archive'
    check_archive(sample_archive['name'], deposit_request.archive.name)
    # 2nd archive to upload
    archive2 = create_arborescence_archive(
        tmp_path, 'archive2', 'file2', b'some other content in file')
    # uri to update the content
    update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit_id])
    # adding another archive for the deposit and finalizing it
    # (no In-Progress header here, so the deposit becomes 'deposited')
    response = authenticated_client.post(
        update_uri,
        content_type='application/zip', # as zip
        data=archive2['data'],
        # + headers
        CONTENT_LENGTH=archive2['length'],
        HTTP_SLUG=external_id,
        HTTP_CONTENT_MD5=archive2['md5sum'],
        HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip',
        HTTP_CONTENT_DISPOSITION='attachment; filename=%s' % (
            archive2['name']))
    assert response.status_code == status.HTTP_201_CREATED
    response_content = parse_xml(BytesIO(response.content))
    deposit = Deposit.objects.get(pk=deposit_id)
    assert deposit.status == DEPOSIT_STATUS_DEPOSITED
    assert deposit.external_id == external_id
    assert deposit.collection == deposit_collection
    assert deposit.swh_id is None
    deposit_requests = list(DepositRequest.objects.filter(deposit=deposit).
                            order_by('id'))
    # 2 deposit requests for the same deposit
    assert len(deposit_requests) == 2
    assert deposit_requests[0].deposit == deposit
    assert deposit_requests[0].type == 'archive'
    check_archive(sample_archive['name'], deposit_requests[0].archive.name)
    assert deposit_requests[1].deposit == deposit
    assert deposit_requests[1].type == 'archive'
    check_archive(archive2['name'], deposit_requests[1].archive.name)
    # only 1 deposit in db
    deposits = Deposit.objects.all()
    assert len(deposits) == 1
def test_post_deposit_then_update_refused(
        authenticated_client, deposit_collection,
        sample_archive, atom_dataset, tmp_path):
    """Updating a deposit with status 'ready' should return a 400

    Once a deposit reaches the 'deposited' status, every update/add
    endpoint (EM-IRI and Edit-SE-IRI, via PUT and POST, binary, atom
    and multipart) must refuse further changes with a 400.
    """
    tmp_path = str(tmp_path)
    url = reverse(COL_IRI, args=[deposit_collection.name])
    external_id = 'some-external-id-1'
    # when: create a finalized (In-Progress=false) deposit
    response = authenticated_client.post(
        url,
        content_type='application/zip', # as zip
        data=sample_archive['data'],
        # + headers
        CONTENT_LENGTH=sample_archive['length'],
        HTTP_SLUG=external_id,
        HTTP_CONTENT_MD5=sample_archive['md5sum'],
        HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip',
        HTTP_IN_PROGRESS='false',
        HTTP_CONTENT_DISPOSITION='attachment; filename=filename0')
    # then
    assert response.status_code == status.HTTP_201_CREATED
    response_content = parse_xml(BytesIO(response.content))
    deposit_id = response_content['deposit_id']
    deposit = Deposit.objects.get(pk=deposit_id)
    assert deposit.status == DEPOSIT_STATUS_DEPOSITED
    assert deposit.external_id == external_id
    assert deposit.collection == deposit_collection
    assert deposit.swh_id is None
    deposit_request = DepositRequest.objects.get(deposit=deposit)
    assert deposit_request.deposit == deposit
    check_archive('filename0', deposit_request.archive.name)
    # updating/adding is forbidden
    # uri to update the content
    edit_se_iri = reverse(
        'edit_se_iri', args=[deposit_collection.name, deposit_id])
    em_iri = reverse(
        'em_iri', args=[deposit_collection.name, deposit_id])
    # Testing all update/add endpoint should fail
    # since the status is ready
    archive2 = create_arborescence_archive(
        tmp_path, 'archive2', 'file2', b'some content in file 2')
    # replacing file is no longer possible since the deposit's
    # status is ready
    r = authenticated_client.put(
        em_iri,
        content_type='application/zip',
        data=archive2['data'],
        CONTENT_LENGTH=archive2['length'],
        HTTP_SLUG=external_id,
        HTTP_CONTENT_MD5=archive2['md5sum'],
        HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip',
        HTTP_IN_PROGRESS='false',
        HTTP_CONTENT_DISPOSITION='attachment; filename=filename0')
    assert r.status_code == status.HTTP_400_BAD_REQUEST
    # adding file is no longer possible since the deposit's status
    # is ready
    r = authenticated_client.post(
        em_iri,
        content_type='application/zip',
        data=archive2['data'],
        CONTENT_LENGTH=archive2['length'],
        HTTP_SLUG=external_id,
        HTTP_CONTENT_MD5=archive2['md5sum'],
        HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip',
        HTTP_IN_PROGRESS='false',
        HTTP_CONTENT_DISPOSITION='attachment; filename=filename0')
    assert r.status_code == status.HTTP_400_BAD_REQUEST
    # replacing metadata is no longer possible since the deposit's
    # status is ready
    r = authenticated_client.put(
        edit_se_iri,
        content_type='application/atom+xml;type=entry',
        data=atom_dataset['entry-data-deposit-binary'],
        CONTENT_LENGTH=len(atom_dataset['entry-data-deposit-binary']),
        HTTP_SLUG=external_id)
    assert r.status_code == status.HTTP_400_BAD_REQUEST
    # adding new metadata is no longer possible since the
    # deposit's status is ready
    r = authenticated_client.post(
        edit_se_iri,
        content_type='application/atom+xml;type=entry',
        data=atom_dataset['entry-data-deposit-binary'],
        CONTENT_LENGTH=len(atom_dataset['entry-data-deposit-binary']),
        HTTP_SLUG=external_id)
    assert r.status_code == status.HTTP_400_BAD_REQUEST
    # build in-memory parts for the multipart variants of the same checks
    archive_content = b'some content representing archive'
    archive = InMemoryUploadedFile(
        BytesIO(archive_content),
        field_name='archive0',
        name='archive0',
        content_type='application/zip',
        size=len(archive_content),
        charset=None)
    atom_entry = InMemoryUploadedFile(
        BytesIO(atom_dataset['entry-data-deposit-binary']),
        field_name='atom0',
        name='atom0',
        content_type='application/atom+xml; charset="utf-8"',
        size=len(atom_dataset['entry-data-deposit-binary']),
        charset='utf-8')
    # replacing multipart metadata is no longer possible since the
    # deposit's status is ready
    r = authenticated_client.put(
        edit_se_iri,
        format='multipart',
        data={
            'archive': archive,
            'atom_entry': atom_entry,
        })
    assert r.status_code == status.HTTP_400_BAD_REQUEST
    # adding new metadata is no longer possible since the
    # deposit's status is ready
    r = authenticated_client.post(
        edit_se_iri,
        format='multipart',
        data={
            'archive': archive,
            'atom_entry': atom_entry,
        })
    assert r.status_code == status.HTTP_400_BAD_REQUEST
diff --git a/swh/deposit/tests/api/test_deposit_private_check.py b/swh/deposit/tests/api/test_deposit_private_check.py
index 1c90113a..a6bab004 100644
--- a/swh/deposit/tests/api/test_deposit_private_check.py
+++ b/swh/deposit/tests/api/test_deposit_private_check.py
@@ -1,263 +1,266 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from django.urls import reverse
+import pytest
from rest_framework import status
from swh.deposit.config import (
DEPOSIT_STATUS_VERIFIED, PRIVATE_CHECK_DEPOSIT,
DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_REJECTED, COL_IRI
)
from swh.deposit.api.private.deposit_check import (
MANDATORY_ARCHIVE_INVALID, MANDATORY_FIELDS_MISSING,
MANDATORY_ARCHIVE_UNSUPPORTED, ALTERNATE_FIELDS_MISSING,
MANDATORY_ARCHIVE_MISSING
)
from swh.deposit.models import Deposit
from swh.deposit.parsers import parse_xml
from swh.deposit.tests.common import (
create_arborescence_archive, create_archive_with_archive
)
PRIVATE_CHECK_DEPOSIT_NC = PRIVATE_CHECK_DEPOSIT + '-nc'
def private_check_url_endpoints(collection, deposit):
    """There are 2 endpoints to check (one with collection, one without)

    Returns the two private-check URLs for *deposit*: the collection-scoped
    one and the collection-less ('-nc') one.
    """
    return [
        reverse(PRIVATE_CHECK_DEPOSIT, args=[collection.name, deposit.id]),
        reverse(PRIVATE_CHECK_DEPOSIT_NC, args=[deposit.id])
    ]
+@pytest.mark.parametrize(
+ "extension", ['zip', 'tar', 'tar.gz', 'tar.bz2', 'tar.xz'])
def test_deposit_ok(
- authenticated_client, deposit_collection, ready_deposit_ok):
+ authenticated_client, deposit_collection, ready_deposit_ok, extension):
"""Proper deposit should succeed the checks (-> status ready)
"""
deposit = ready_deposit_ok
for url in private_check_url_endpoints(deposit_collection, deposit):
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
data = response.json()
assert data['status'] == DEPOSIT_STATUS_VERIFIED
deposit = Deposit.objects.get(pk=deposit.id)
assert deposit.status == DEPOSIT_STATUS_VERIFIED
deposit.status = DEPOSIT_STATUS_DEPOSITED
deposit.save()
-
+@pytest.mark.parametrize(
+ "extension", ['zip', 'tar', 'tar.gz', 'tar.bz2', 'tar.xz'])
def test_deposit_invalid_tarball(
- tmp_path, authenticated_client, deposit_collection):
+ tmp_path, authenticated_client, deposit_collection, extension):
"""Deposit with tarball (of 1 tarball) should fail the checks: rejected
"""
- for archive_extension in ['zip', 'tar', 'tar.gz', 'tar.bz2', 'tar.xz']:
- deposit = create_deposit_archive_with_archive(
- tmp_path, archive_extension,
- authenticated_client,
- deposit_collection.name)
- for url in private_check_url_endpoints(deposit_collection, deposit):
- response = authenticated_client.get(url)
- assert response.status_code == status.HTTP_200_OK
- data = response.json()
- assert data['status'] == DEPOSIT_STATUS_REJECTED
- details = data['details']
- # archive checks failure
- assert len(details['archive']) == 1
- assert details['archive'][0]['summary'] == \
- MANDATORY_ARCHIVE_INVALID
-
- deposit = Deposit.objects.get(pk=deposit.id)
- assert deposit.status == DEPOSIT_STATUS_REJECTED
+ deposit = create_deposit_archive_with_archive(
+ tmp_path, extension,
+ authenticated_client,
+ deposit_collection.name)
+ for url in private_check_url_endpoints(deposit_collection, deposit):
+ response = authenticated_client.get(url)
+ assert response.status_code == status.HTTP_200_OK
+ data = response.json()
+ assert data['status'] == DEPOSIT_STATUS_REJECTED
+ details = data['details']
+ # archive checks failure
+ assert len(details['archive']) == 1
+ assert details['archive'][0]['summary'] == \
+ MANDATORY_ARCHIVE_INVALID
+
+ deposit = Deposit.objects.get(pk=deposit.id)
+ assert deposit.status == DEPOSIT_STATUS_REJECTED
def test_deposit_ko_missing_tarball(
        authenticated_client, deposit_collection, ready_deposit_only_metadata):
    """Deposit without archive should fail the checks: rejected
    """
    deposit = ready_deposit_only_metadata
    assert deposit.status == DEPOSIT_STATUS_DEPOSITED
    for url in private_check_url_endpoints(deposit_collection, deposit):
        response = authenticated_client.get(url)
        assert response.status_code == status.HTTP_200_OK
        data = response.json()
        assert data['status'] == DEPOSIT_STATUS_REJECTED
        details = data['details']
        # archive checks failure
        assert len(details['archive']) == 1
        assert details['archive'][0]['summary'] == MANDATORY_ARCHIVE_MISSING
        deposit = Deposit.objects.get(pk=deposit.id)
        assert deposit.status == DEPOSIT_STATUS_REJECTED
        # reset to 'deposited' so the second endpoint re-runs the check
        deposit.status = DEPOSIT_STATUS_DEPOSITED
        deposit.save()
def test_deposit_ko_unsupported_tarball(
        tmp_path, authenticated_client, deposit_collection,
        ready_deposit_invalid_archive):
    """Deposit with an unsupported tarball should fail the checks: rejected
    """
    deposit = ready_deposit_invalid_archive
    assert DEPOSIT_STATUS_DEPOSITED == deposit.status
    for url in private_check_url_endpoints(deposit_collection, deposit):
        response = authenticated_client.get(url)
        assert response.status_code == status.HTTP_200_OK
        data = response.json()
        assert data['status'] == DEPOSIT_STATUS_REJECTED
        details = data['details']
        # archive checks failure
        assert len(details['archive']) == 1
        assert details['archive'][0]['summary'] == \
            MANDATORY_ARCHIVE_UNSUPPORTED
        # metadata check failure
        assert len(details['metadata']) == 2
        mandatory = details['metadata'][0]
        assert mandatory['summary'] == MANDATORY_FIELDS_MISSING
        assert set(mandatory['fields']) == set(['author'])
        alternate = details['metadata'][1]
        assert alternate['summary'] == ALTERNATE_FIELDS_MISSING
        assert alternate['fields'] == ['name or title']
        deposit = Deposit.objects.get(pk=deposit.id)
        assert deposit.status == DEPOSIT_STATUS_REJECTED
        # reset to 'deposited' so the second endpoint re-runs the check
        deposit.status = DEPOSIT_STATUS_DEPOSITED
        deposit.save()
def test_check_deposit_metadata_ok(
        authenticated_client, deposit_collection, ready_deposit_ok):
    """Proper deposit should succeed the checks (-> status ready)
       with all **MUST** metadata
       using the codemeta metadata test set
    """
    deposit = ready_deposit_ok
    assert deposit.status == DEPOSIT_STATUS_DEPOSITED
    for url in private_check_url_endpoints(deposit_collection, deposit):
        response = authenticated_client.get(url)
        assert response.status_code == status.HTTP_200_OK
        data = response.json()
        assert data['status'] == DEPOSIT_STATUS_VERIFIED
        deposit = Deposit.objects.get(pk=deposit.id)
        assert deposit.status == DEPOSIT_STATUS_VERIFIED
        # reset to 'deposited' so the second endpoint re-runs the check
        deposit.status = DEPOSIT_STATUS_DEPOSITED
        deposit.save()
def test_check_metadata_ok(swh_checks_deposit):
    """Metadata with url, external id, name and author passes the check
    """
    actual_check, detail = swh_checks_deposit._check_metadata({
        'url': 'something',
        'external_identifier': 'something-else',
        'name': 'foo',
        'author': 'someone',
    })
    assert actual_check is True
    assert detail is None
def test_check_metadata_ok2(swh_checks_deposit):
    """'title' is accepted as an alternate to 'name' in the metadata check
    """
    actual_check, detail = swh_checks_deposit._check_metadata({
        'url': 'something',
        'external_identifier': 'something-else',
        'title': 'bar',
        'author': 'someone',
    })
    assert actual_check is True
    assert detail is None
def test_check_metadata_ko(swh_checks_deposit):
    """Missing optional field should be caught
    """
    # neither 'name' nor 'title' is provided
    actual_check, error_detail = swh_checks_deposit._check_metadata({
        'url': 'something',
        'external_identifier': 'something-else',
        'author': 'someone',
    })
    expected_error = {
        'metadata': [{
            'summary': 'Mandatory alternate fields are missing',
            'fields': ['name or title'],
        }]
    }
    assert actual_check is False
    assert error_detail == expected_error
def test_check_metadata_ko2(swh_checks_deposit):
    """Missing mandatory fields should be caught
    """
    # 'author' is mandatory and not provided
    actual_check, error_detail = swh_checks_deposit._check_metadata({
        'url': 'something',
        'external_identifier': 'something-else',
        'title': 'foobar',
    })
    expected_error = {
        'metadata': [{
            'summary': 'Mandatory fields are missing',
            'fields': ['author'],
        }]
    }
    assert actual_check is False
    assert error_detail == expected_error
def create_deposit_archive_with_archive(
root_path, archive_extension, client, collection_name):
# we create the holding archive to a given extension
archive = create_arborescence_archive(
root_path, 'archive1', 'file1', b'some content in file',
extension=archive_extension)
# now we create an archive holding the first created archive
invalid_archive = create_archive_with_archive(
- root_path, 'invalid.tar.gz', archive)
+ root_path, 'invalid.tgz', archive)
# we deposit it
response = client.post(
reverse(COL_IRI, args=[collection_name]),
content_type='application/x-tar',
data=invalid_archive['data'],
CONTENT_LENGTH=invalid_archive['length'],
HTTP_MD5SUM=invalid_archive['md5sum'],
HTTP_SLUG='external-id',
HTTP_IN_PROGRESS=False,
HTTP_CONTENT_DISPOSITION='attachment; filename=%s' % (
invalid_archive['name'], ))
# then
assert response.status_code == status.HTTP_201_CREATED
response_content = parse_xml(response.content)
deposit_status = response_content['deposit_status']
assert deposit_status == DEPOSIT_STATUS_DEPOSITED
deposit_id = int(response_content['deposit_id'])
deposit = Deposit.objects.get(pk=deposit_id)
assert DEPOSIT_STATUS_DEPOSITED == deposit.status
return deposit
diff --git a/swh/deposit/tests/api/test_deposit_private_read_archive.py b/swh/deposit/tests/api/test_deposit_private_read_archive.py
index d41ebef1..b2d9a234 100644
--- a/swh/deposit/tests/api/test_deposit_private_read_archive.py
+++ b/swh/deposit/tests/api/test_deposit_private_read_archive.py
@@ -1,111 +1,86 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import hashlib
-import shutil
+import io
+import zipfile
from django.urls import reverse
-from os import listdir, path, mkdir
from rest_framework import status
-from swh.core import tarball
from swh.deposit.config import PRIVATE_GET_RAW_CONTENT, EM_IRI
-
from swh.deposit.tests.common import create_arborescence_archive
PRIVATE_GET_RAW_CONTENT_NC = PRIVATE_GET_RAW_CONTENT + '-nc'
def private_get_raw_url_endpoints(collection, deposit):
"""There are 2 endpoints to check (one with collection, one without)"""
return [
reverse(PRIVATE_GET_RAW_CONTENT, args=[collection.name, deposit.id]),
reverse(PRIVATE_GET_RAW_CONTENT_NC, args=[deposit.id])
]
def test_access_to_existing_deposit_with_one_archive(
authenticated_client, deposit_collection, complete_deposit,
sample_archive):
"""Access to deposit should stream a 200 response with its raw content
"""
deposit = complete_deposit
for url in private_get_raw_url_endpoints(deposit_collection, deposit):
r = authenticated_client.get(url)
assert r.status_code == status.HTTP_200_OK
- assert r._headers['content-type'][1] == 'application/octet-stream'
+ assert r._headers['content-type'][1] == 'application/zip'
# read the stream
data = b''.join(r.streaming_content)
- actual_sha1 = hashlib.sha1(data).hexdigest()
- assert actual_sha1 == sample_archive['sha1sum']
+ # extract the file from the zip
+ zfile = zipfile.ZipFile(io.BytesIO(data))
+ assert zfile.namelist() == ['file1']
+ assert zfile.open('file1').read() == b'some content in file'
def test_access_to_existing_deposit_with_multiple_archives(
tmp_path, authenticated_client, deposit_collection, partial_deposit,
sample_archive):
"""Access to deposit should stream a 200 response with its raw contents
"""
deposit = partial_deposit
archive2 = create_arborescence_archive(
- tmp_path, 'archive2', 'file2', b'some content in file')
+ tmp_path, 'archive2', 'file2', b'some other content in file')
# Add a second archive to deposit
update_uri = reverse(EM_IRI, args=[deposit_collection.name, deposit.id])
response = authenticated_client.post(
update_uri,
content_type='application/zip', # as zip
data=archive2['data'],
# + headers
CONTENT_LENGTH=archive2['length'],
HTTP_SLUG=deposit.external_id,
HTTP_CONTENT_MD5=archive2['md5sum'],
HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip',
HTTP_IN_PROGRESS='false',
HTTP_CONTENT_DISPOSITION='attachment; filename=%s' % (
archive2['name'], ))
assert response.status_code == status.HTTP_201_CREATED
for url in private_get_raw_url_endpoints(deposit_collection, deposit):
r = authenticated_client.get(url)
assert r.status_code == status.HTTP_200_OK
- assert r._headers['content-type'][1] == 'application/octet-stream'
+ assert r._headers['content-type'][1] == 'application/zip'
# read the stream
data = b''.join(r.streaming_content)
- actual_sha1 = hashlib.sha1(data).hexdigest()
- check_tarball_consistency(
- tmp_path, sample_archive, archive2, actual_sha1)
-
-
-def check_tarball_consistency(tmp_path, archive, archive2, actual_sha1):
- """Check the tarballs are ok
-
- """
- workdir = path.join(tmp_path, 'workdir')
- mkdir(workdir)
- lst = set(listdir(workdir))
- assert lst == set()
- tarball.uncompress(archive['path'], dest=workdir)
- assert listdir(workdir) == ['file1']
- tarball.uncompress(archive2['path'], dest=workdir)
- lst = set(listdir(workdir))
- assert lst == {'file1', 'file2'}
-
- new_path = workdir + '.zip'
- tarball.compress(new_path, 'zip', workdir)
- with open(new_path, 'rb') as f:
- h = hashlib.sha1(f.read()).hexdigest()
-
- assert actual_sha1 == h
- assert actual_sha1 != archive['sha1sum']
- assert actual_sha1 != archive2['sha1sum']
-
- shutil.rmtree(workdir)
+ # extract the file from the zip
+ zfile = zipfile.ZipFile(io.BytesIO(data))
+ assert zfile.namelist() == ['file1', 'file2']
+ assert zfile.open('file1').read() == b'some content in file'
+ assert zfile.open('file2').read() == b'some other content in file'
diff --git a/swh/deposit/tests/api/test_service_document.py b/swh/deposit/tests/api/test_service_document.py
index 558c7598..dda59a88 100644
--- a/swh/deposit/tests/api/test_service_document.py
+++ b/swh/deposit/tests/api/test_service_document.py
@@ -1,87 +1,86 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from django.urls import reverse
from rest_framework import status
-from swh.deposit.tests import TEST_CONFIG
from swh.deposit.config import SD_IRI
def test_service_document_no_auth_fails(client):
    """Without authentication, service document endpoint should return 401
    """
    url = reverse(SD_IRI)
    response = client.get(url)
    assert response.status_code == status.HTTP_401_UNAUTHORIZED
def test_service_document_no_auth_with_http_auth_should_not_break(client):
    """Without auth, sd endpoint through browser should return 401
    """
    url = reverse(SD_IRI)
    # browser-like Accept header must not change the auth behavior
    response = client.get(
        url,
        HTTP_ACCEPT='text/html,application/xml;q=9,*/*,q=8')
    assert response.status_code == status.HTTP_401_UNAUTHORIZED
def test_service_document(authenticated_client, deposit_user):
    """With authentication, service document list user's collection
    """
    url = reverse(SD_IRI)
    response = authenticated_client.get(url)
    # check_response compares the full XML service document
    check_response(response, deposit_user.username)
def test_service_document_with_http_accept_header(
        authenticated_client, deposit_user):
    """With authentication, with browser, sd list user's collection
    """
    url = reverse(SD_IRI)
    # browser-like Accept header must not change the served document
    response = authenticated_client.get(
        url,
        HTTP_ACCEPT='text/html,application/xml;q=9,*/*,q=8')
    check_response(response, deposit_user.username)
def check_response(response, username):
assert response.status_code == status.HTTP_200_OK
assert response.content.decode('utf-8') == \
'''<?xml version="1.0" ?>
<service xmlns:dcterms="http://purl.org/dc/terms/"
xmlns:sword="http://purl.org/net/sword/terms/"
xmlns:atom="http://www.w3.org/2005/Atom"
xmlns="http://www.w3.org/2007/app">
<sword:version>2.0</sword:version>
<sword:maxUploadSize>%s</sword:maxUploadSize>
<workspace>
<atom:title>The Software Heritage (SWH) Archive</atom:title>
<collection href="http://testserver/1/%s/">
<atom:title>%s Software Collection</atom:title>
<accept>application/zip</accept>
<accept>application/x-tar</accept>
<sword:collectionPolicy>Collection Policy</sword:collectionPolicy>
<dcterms:abstract>Software Heritage Archive</dcterms:abstract>
<sword:treatment>Collect, Preserve, Share</sword:treatment>
<sword:mediation>false</sword:mediation>
<sword:metadataRelevantHeader>false</sword:metadataRelevantHeader>
<sword:acceptPackaging>http://purl.org/net/sword/package/SimpleZip</sword:acceptPackaging>
<sword:service>http://testserver/1/%s/</sword:service>
<sword:name>%s</sword:name>
</collection>
</workspace>
</service>
-''' % (TEST_CONFIG['max_upload_size'],
+''' % (500,
username,
username,
username,
username) # noqa
diff --git a/swh/deposit/tests/conftest.py b/swh/deposit/tests/conftest.py
index e95fad54..6346896d 100644
--- a/swh/deposit/tests/conftest.py
+++ b/swh/deposit/tests/conftest.py
@@ -1,338 +1,390 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
import base64
import pytest
import psycopg2
from django.urls import reverse
+from django.test.utils import setup_databases # type: ignore
+# mypy is asked to ignore the import statement above because setup_databases
+# is not part of the d.t.utils.__all__ variable.
+
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
from rest_framework import status
from rest_framework.test import APIClient
from typing import Mapping
+from swh.scheduler import get_scheduler
from swh.scheduler.tests.conftest import * # noqa
+from swh.deposit.config import setup_django_for
from swh.deposit.parsers import parse_xml
+from swh.deposit.config import SWHDefaultConfig
from swh.deposit.config import (
COL_IRI, EDIT_SE_IRI, DEPOSIT_STATUS_DEPOSITED, DEPOSIT_STATUS_REJECTED,
DEPOSIT_STATUS_PARTIAL, DEPOSIT_STATUS_LOAD_SUCCESS,
DEPOSIT_STATUS_VERIFIED, DEPOSIT_STATUS_LOAD_FAILURE
)
from swh.deposit.tests.common import create_arborescence_archive
# Credentials and client metadata for the deposit user shared across the
# test suite; deposit_collection_factory below reuses the collection name.
TEST_USER = {
    'username': 'test',
    'password': 'password',
    'email': 'test@example.org',
    'provider_url': 'https://hal-test.archives-ouvertes.fr/',
    'domain': 'archives-ouvertes.fr/',
    'collection': {
        'name': 'test'
    },
}
+TEST_CONFIG = {
+ 'max_upload_size': 500,
+ 'extraction_dir': '/tmp/swh-deposit/test/extraction-dir',
+ 'checks': False,
+ 'provider': {
+ 'provider_name': '',
+ 'provider_type': 'deposit_client',
+ 'provider_url': '',
+ 'metadata': {
+ }
+ },
+ 'tool': {
+ 'name': 'swh-deposit',
+ 'version': '0.0.1',
+ 'configuration': {
+ 'sword_version': '2'
+ }
+ },
+}
+
+
+def pytest_configure():
+ setup_django_for('testing')
+
+
+@pytest.fixture()
+def deposit_config():
+ return TEST_CONFIG
+
+
+@pytest.fixture(autouse=True)
+def deposit_autoconfig(monkeypatch, deposit_config, swh_scheduler_config):
+ """Enforce config for deposit classes inherited from SWHDefaultConfig."""
+ def mock_parse_config(*args, **kw):
+ config = deposit_config.copy()
+ config['scheduler'] = {
+ 'cls': 'local',
+ 'args': swh_scheduler_config,
+ }
+ return config
+ monkeypatch.setattr(
+ SWHDefaultConfig, "parse_config_file",
+ mock_parse_config)
+
+ scheduler = get_scheduler('local', swh_scheduler_config)
+ task_type = {
+ 'type': 'load-deposit',
+ 'backend_name': 'swh.loader.packages.deposit.tasks.LoadDeposit',
+ 'description': 'why does this have not-null constraint?'}
+ scheduler.create_task_type(task_type)
+
+
+@pytest.fixture(scope='session')
+def django_db_setup(
+ request,
+ django_db_blocker,
+ postgresql_proc):
+ from django.conf import settings
+ settings.DATABASES['default'].update({
+ ('ENGINE', 'django.db.backends.postgresql'),
+ ('NAME', 'tests'),
+ ('USER', postgresql_proc.user), # noqa
+ ('HOST', postgresql_proc.host), # noqa
+ ('PORT', postgresql_proc.port), # noqa
+ })
+ with django_db_blocker.unblock():
+ setup_databases(
+ verbosity=request.config.option.verbose,
+ interactive=False,
+ keepdb=False)
+
+
def execute_sql(sql):
    """Run a SQL statement against the local 'postgres' database.

    Autocommit isolation is used so DDL statements (CREATE/DROP DATABASE)
    can run outside a transaction.
    """
    with psycopg2.connect(database='postgres') as db_conn:
        db_conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
        cursor = db_conn.cursor()
        cursor.execute(sql)
-@pytest.hookimpl(tryfirst=True)
-def pytest_load_initial_conftests(early_config, parser, args):
- """This hook is done prior to django loading.
- Used to initialize the deposit's server db.
-
- """
- import project.app.signals # type: ignore
-
- def prepare_db(*args, **kwargs):
- from django.conf import settings
- db_name = 'tests'
- # work around db settings for django
- for k, v in [
- ('ENGINE', 'django.db.backends.postgresql'),
- ('NAME', 'tests'),
- ('USER', postgresql_proc.user), # noqa
- ('HOST', postgresql_proc.host), # noqa
- ('PORT', postgresql_proc.port), # noqa
- ]:
- settings.DATABASES['default'][k] = v
-
- execute_sql('DROP DATABASE IF EXISTS %s' % db_name)
- execute_sql('CREATE DATABASE %s TEMPLATE template0' % db_name)
-
- project.app.signals.something = prepare_db
-
-
@pytest.fixture(autouse=True, scope='session')
def swh_proxy():
    """Automatically inject this fixture in all tests to ensure no outside
    connection takes place.

    Both proxy variables point at an unreachable local port, so any
    accidental outbound HTTP(S) request fails fast.
    """
    unreachable = 'http://localhost:999'
    os.environ['http_proxy'] = unreachable
    os.environ['https_proxy'] = unreachable
def create_deposit_collection(collection_name: str):
    """Return the DepositCollection named ``collection_name``, creating and
    saving it first if it does not exist yet.
    """
    from swh.deposit.models import DepositCollection
    manager = DepositCollection._default_manager
    try:
        # EAFP: reuse an existing collection from a previous test
        return manager.get(name=collection_name)
    except DepositCollection.DoesNotExist:
        new_collection = DepositCollection(name=collection_name)
        new_collection.save()
        return new_collection
def deposit_collection_factory(
        collection_name=TEST_USER['collection']['name']):
    # Factory producing a pytest fixture bound to a given collection name.
    # The name is captured as a default argument of the inner fixture so
    # each generated fixture keeps its own value (avoids late-binding
    # closure issues).
    @pytest.fixture
    def _deposit_collection(db, collection_name=collection_name):
        return create_deposit_collection(collection_name)
    return _deposit_collection


# Default collection (name from TEST_USER) plus a second, distinct one
deposit_collection = deposit_collection_factory()
deposit_another_collection = deposit_collection_factory('another-collection')
@pytest.fixture
def deposit_user(db, deposit_collection):
    """Create/Return the test_user "test"

    """
    from swh.deposit.models import DepositClient
    try:
        # Reuse the client if an earlier test already created it
        user = DepositClient._default_manager.get(
            username=TEST_USER['username'])
    except DepositClient.DoesNotExist:
        user = DepositClient._default_manager.create_user(
            username=TEST_USER['username'],
            email=TEST_USER['email'],
            password=TEST_USER['password'],
            provider_url=TEST_USER['provider_url'],
            domain=TEST_USER['domain'],
        )
    # Always (re)attach the user to the test collection and persist
    user.collections = [deposit_collection.id]
    user.save()
    return user
@pytest.fixture
def client():
    """Override pytest-django one which does not work for djangorestframework.

    Returns Django REST framework's APIClient, which provides the
    ``credentials()`` helper used by authenticated_client.
    """
    return APIClient()
@pytest.yield_fixture
def authenticated_client(client, deposit_user):
    """Yield a client logged in (HTTP Basic auth) as the test deposit user,
    logging it out on teardown.
    """
    raw_credentials = '%s:%s' % (deposit_user.username, TEST_USER['password'])
    encoded = base64.b64encode(raw_credentials.encode('utf-8')).decode('utf-8')
    client.credentials(HTTP_AUTHORIZATION='Basic %s' % encoded)
    yield client
    client.logout()
@pytest.fixture
def sample_archive(tmp_path):
    """Returns a sample archive

    """
    # older pytest provides a py.path object; the helper expects a str
    root = str(tmp_path)
    return create_arborescence_archive(
        root, 'archive1', 'file1', b'some content in file')
@pytest.fixture
def atom_dataset(datadir) -> Mapping[str, bytes]:
    """Compute the paths to atom files.

    Returns:
        Dict of atom name per content (bytes), keyed by the filename
        without its extension
    """
    atom_dir = os.path.join(datadir, 'atom')
    contents = {}
    for entry in os.listdir(atom_dir):
        with open(os.path.join(atom_dir, entry), 'rb') as f:
            # keep the filename without extension as the key
            contents[entry.split('.')[0]] = f.read()
    return contents
def create_deposit(
        authenticated_client, collection_name: str, sample_archive,
        external_id: str, deposit_status=DEPOSIT_STATUS_DEPOSITED):
    """Create a skeleton shell deposit

    Posts the sample archive to the collection IRI, then forces the
    resulting deposit's status to ``deposit_status`` when it differs.

    Args:
        authenticated_client: logged-in API client
        collection_name: collection receiving the deposit
        sample_archive: dict with 'data', 'length', 'md5sum', 'name' keys
        external_id: deposit external identifier (sent as the SLUG header)
        deposit_status: status the returned deposit should end up in

    Returns:
        The Deposit model instance, in status ``deposit_status``

    """
    url = reverse(COL_IRI, args=[collection_name])
    # when
    response = authenticated_client.post(
        url,
        content_type='application/zip',  # as zip
        data=sample_archive['data'],
        # + headers
        CONTENT_LENGTH=sample_archive['length'],
        HTTP_SLUG=external_id,
        HTTP_CONTENT_MD5=sample_archive['md5sum'],
        HTTP_PACKAGING='http://purl.org/net/sword/package/SimpleZip',
        HTTP_IN_PROGRESS='false',
        HTTP_CONTENT_DISPOSITION='attachment; filename=%s' % (
            sample_archive['name']))
    # then
    assert response.status_code == status.HTTP_201_CREATED
    from swh.deposit.models import Deposit
    deposit = Deposit._default_manager.get(external_id=external_id)
    # a fresh deposit lands as 'deposited'; override when the test asked
    # for a different lifecycle status
    if deposit.status != deposit_status:
        deposit.status = deposit_status
        deposit.save()
    assert deposit.status == deposit_status
    return deposit
def create_binary_deposit(
        authenticated_client, collection_name: str, sample_archive,
        external_id: str, deposit_status: str = DEPOSIT_STATUS_DEPOSITED,
        atom_dataset: Mapping[str, bytes] = {}):
    """Create a deposit with both metadata and archive set. Then alters its status
    to `deposit_status`.

    NOTE(review): ``atom_dataset={}`` is a mutable default; it is never
    mutated here but callers are expected to pass the atom_dataset fixture.

    """
    # first, a partial deposit carrying only the archive...
    deposit = create_deposit(
        authenticated_client, collection_name, sample_archive,
        external_id=external_id, deposit_status=DEPOSIT_STATUS_PARTIAL)
    # ...then attach atom metadata through the SE-IRI endpoint
    response = authenticated_client.post(
        reverse(EDIT_SE_IRI, args=[collection_name, deposit.id]),
        content_type='application/atom+xml;type=entry',
        data=atom_dataset['entry-data0'] % deposit.external_id.encode('utf-8'),
        HTTP_SLUG=deposit.external_id,
        HTTP_IN_PROGRESS='true')
    assert response.status_code == status.HTTP_201_CREATED
    assert deposit.status == DEPOSIT_STATUS_PARTIAL
    # reload from the db, then force the requested final status if needed
    from swh.deposit.models import Deposit
    deposit = Deposit._default_manager.get(pk=deposit.id)
    if deposit.status != deposit_status:
        deposit.status = deposit_status
        deposit.save()
    assert deposit.status == deposit_status
    return deposit
def deposit_factory(deposit_status=DEPOSIT_STATUS_DEPOSITED):
    """Build deposit with a specific status

    """
    @pytest.fixture()
    def _deposit(sample_archive, deposit_collection, authenticated_client,
                 deposit_status=deposit_status):
        # deposit_status is bound as a default argument so every generated
        # fixture keeps its own status (no late-binding closure issue)
        external_id = 'external-id-%s' % deposit_status
        return create_deposit(
            authenticated_client, deposit_collection.name, sample_archive,
            external_id=external_id, deposit_status=deposit_status
        )
    return _deposit


# One ready-made fixture per deposit lifecycle status
deposited_deposit = deposit_factory()
rejected_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_REJECTED)
partial_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_PARTIAL)
verified_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_VERIFIED)
completed_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_LOAD_SUCCESS)
failed_deposit = deposit_factory(deposit_status=DEPOSIT_STATUS_LOAD_FAILURE)
@pytest.fixture
def partial_deposit_with_metadata(
        sample_archive, deposit_collection, authenticated_client,
        atom_dataset):
    """Returns deposit with archive and metadata provided, status 'partial'

    """
    # delegates the two-step creation (archive, then atom metadata)
    return create_binary_deposit(
        authenticated_client, deposit_collection.name, sample_archive,
        external_id='external-id-partial',
        deposit_status=DEPOSIT_STATUS_PARTIAL,
        atom_dataset=atom_dataset
    )
@pytest.fixture
def partial_deposit_only_metadata(
        deposit_collection, authenticated_client,
        atom_dataset):
    """Returns a deposit created from atom metadata only (no archive),
    left in status 'partial'.

    """
    # posting with in-progress=true keeps the deposit in 'partial' state
    response = authenticated_client.post(
        reverse(COL_IRI, args=[deposit_collection.name]),
        content_type='application/atom+xml;type=entry',
        data=atom_dataset['entry-data1'],
        HTTP_SLUG='external-id-partial',
        HTTP_IN_PROGRESS=True)
    assert response.status_code == status.HTTP_201_CREATED
    response_content = parse_xml(response.content)
    deposit_id = response_content['deposit_id']
    from swh.deposit.models import Deposit
    deposit = Deposit._default_manager.get(pk=deposit_id)
    assert deposit.status == DEPOSIT_STATUS_PARTIAL
    return deposit
@pytest.fixture
def complete_deposit(sample_archive, deposit_collection, authenticated_client):
    """Returns a completed deposit (load success)

    The deposit is decorated with the swh identifiers a successful load
    would have attached.
    """
    deposit = create_deposit(
        authenticated_client, deposit_collection.name, sample_archive,
        external_id='external-id-complete',
        deposit_status=DEPOSIT_STATUS_LOAD_SUCCESS
    )
    origin_context = 'https://hal.archives-ouvertes.fr/hal-01727745'
    deposit.swh_id = 'swh:1:dir:42a13fc721c8716ff695d0d62fc851d641f3a12b'
    deposit.swh_id_context = '{};{}'.format(deposit.swh_id, origin_context)
    deposit.swh_anchor_id = (
        'swh:rev:1:548b3c0a2bb43e1fca191e24b5803ff6b3bc7c10')
    deposit.swh_anchor_id_context = '{};{}'.format(
        deposit.swh_anchor_id, origin_context)
    deposit.save()
    return deposit
@pytest.fixture()
def tmp_path(tmp_path):
    # Shadow pytest's builtin tmp_path: oldstable's pytest returns a
    # py.path object, so normalize to a plain string for the helpers here
    return str(tmp_path)  # issue with oldstable's pytest version
diff --git a/swh/deposit/tests/test_utils.py b/swh/deposit/tests/test_utils.py
index f44710a6..a7486f7e 100644
--- a/swh/deposit/tests/test_utils.py
+++ b/swh/deposit/tests/test_utils.py
@@ -1,215 +1,178 @@
# Copyright (C) 2018-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import pytest
from unittest.mock import patch
from swh.deposit import utils
-from swh.deposit.models import Deposit, DepositClient
-
-
-def test_origin_url_from():
- """With correctly setup-ed deposit, all is fine
-
- """
- for provider_url, external_id in (
- ('http://somewhere.org', 'uuid'),
- ('http://overthejungle.org', 'diuu'),
- ):
- deposit = Deposit(
- client=DepositClient(provider_url=provider_url),
- external_id=external_id
- )
-
- actual_origin_url = utils.origin_url_from(deposit)
-
- assert actual_origin_url == '%s/%s' % (
- provider_url.rstrip('/'), external_id)
-
-
-def test_origin_url_from_ko():
- """Badly configured deposit should raise
-
- """
- for provider_url, external_id in (
- (None, 'uuid'),
- ('http://overthejungle.org', None),
- ):
- deposit = Deposit(
- client=DepositClient(provider_url=provider_url),
- external_id=None
- )
-
- with pytest.raises(AssertionError):
- utils.origin_url_from(deposit)
def test_merge():
    """Calling utils.merge on dicts should merge without losing information

    """
    d0 = {'author': 'someone', 'license': [['gpl2']], 'a': 1}
    d1 = {
        'author': ['author0', {'name': 'author1'}],
        'license': [['gpl3']],
        'b': {'1': '2'},
    }
    d2 = {
        'author': map(lambda x: x, ['else']),
        'license': 'mit',
        'b': {'2': '3'},
    }
    d3 = {'author': (v for v in ['no one'])}

    expected = {
        'a': 1,
        'license': [['gpl2'], ['gpl3'], 'mit'],
        'author': [
            'someone', 'author0', {'name': 'author1'}, 'else', 'no one'],
        'b': {'1': '2', '2': '3'},
    }
    assert utils.merge(d0, d1, d2, d3) == expected
def test_merge_2():
    """Scalar and dict values under the same keys merge into lists"""
    left = {
        'license': 'gpl2',
        'runtime': {'os': 'unix derivative'},
    }
    right = {
        'license': 'gpl3',
        'runtime': 'GNU/Linux',
    }

    assert utils.merge(left, right) == {
        'license': ['gpl2', 'gpl3'],
        'runtime': [{'os': 'unix derivative'}, 'GNU/Linux'],
    }
def test_merge_edge_cases():
    """Merging with the empty dict or with itself is the identity"""
    data = {
        'license': ['gpl2', 'gpl3'],
        'runtime': [{'os': 'unix derivative'}, 'GNU/Linux'],
    }
    # against empty dict
    assert utils.merge(data, {}) == data
    # against oneself
    assert utils.merge(data, data, data) == data
def test_merge_one_dict():
    """Merge one dict should result in the same dict value

    """
    sole = {'anything': 'really'}
    assert utils.merge(sole) == sole
def test_merge_raise():
    """Calling utils.merge with any no dict argument should raise

    """
    valid = {'author': 'someone', 'a': 1}
    invalid = ['not a dict']

    # any non-dict argument, in any position, raises
    for args in [(valid, invalid), (invalid, valid), (invalid,)]:
        with pytest.raises(ValueError):
            utils.merge(*args)

    assert utils.merge(valid) == valid
@patch('swh.deposit.utils.normalize_timestamp', side_effect=lambda x: x)
def test_normalize_date_0(mock_normalize):
    """When date is a list, choose the first date and normalize it

    Note: We do not test swh.model.identifiers which is already tested
    in swh.model

    """
    result = utils.normalize_date(['2017-10-12', 'date1'])
    assert str(result) == '2017-10-12 00:00:00+00:00'
@patch('swh.deposit.utils.normalize_timestamp', side_effect=lambda x: x)
def test_normalize_date_1(mock_normalize):
    """Providing a date in a reasonable format, everything is fine

    Note: We do not test swh.model.identifiers which is already tested
    in swh.model

    """
    result = utils.normalize_date('2018-06-11 17:02:02')
    assert str(result) == '2018-06-11 17:02:02+00:00'
@patch('swh.deposit.utils.normalize_timestamp', side_effect=lambda x: x)
def test_normalize_date_doing_irrelevant_stuff(mock_normalize):
    """Providing a date with only the year results in a reasonable date

    Note: We do not test swh.model.identifiers which is already tested
    in swh.model

    """
    result = utils.normalize_date('2017')
    assert str(result) == '2017-01-01 00:00:00+00:00'
diff --git a/swh/deposit/utils.py b/swh/deposit/utils.py
index beb31ef6..86775ac3 100644
--- a/swh/deposit/utils.py
+++ b/swh/deposit/utils.py
@@ -1,108 +1,83 @@
# Copyright (C) 2018-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import iso8601
from types import GeneratorType
from swh.model.identifiers import normalize_timestamp
-def origin_url_from(deposit):
- """Given a deposit instance, return the associated origin url.
-
- This expects a deposit and the associated client to be correctly
- configured.
-
- Args:
- deposit (Deposit): The deposit from which derives the origin url
-
- Raises:
- AssertionError if:
- - the client's provider_url field is not configured.
- - the deposit's external_id field is not configured.
-
- Returns
- The associated origin url
-
- """
- external_id = deposit.external_id
- assert external_id is not None
- base_url = deposit.client.provider_url
- assert base_url is not None
- return '%s/%s' % (base_url.rstrip('/'), external_id)
-
-
def merge(*dicts):
    """Given an iterator of dicts, merge them losing no information.

    Values sharing a key across dicts are collected into a list (in
    argument order, skipping duplicates); dict values sharing a key are
    merged recursively.

    Args:
        *dicts: arguments are all supposed to be dict to merge into one

    Returns:
        dict merged without losing information

    Raises:
        ValueError: if any argument is not a dict

    """
    def _extend(existing_val, value):
        """Given an existing value and a value (as potential lists), merge
        them together without repetition.

        """
        # lazily-produced values (map objects, generators) are consumed here
        if isinstance(value, (list, map, GeneratorType)):
            vals = value
        else:
            vals = [value]
        for v in vals:
            if v in existing_val:
                continue
            existing_val.append(v)
        return existing_val

    d = {}
    for data in dicts:
        if not isinstance(data, dict):
            raise ValueError(
                'dicts is supposed to be a variable arguments of dict')

        for key, value in data.items():
            # Fix: test key *presence* rather than value truthiness --
            # the previous `if not existing_val` silently overwrote falsy
            # existing values (0, '', [], ...), losing information.
            if key not in d:
                d[key] = value
                continue

            existing_val = d[key]
            if isinstance(existing_val, (list, map, GeneratorType)):
                new_val = _extend(existing_val, value)
            elif isinstance(existing_val, dict):
                if isinstance(value, dict):
                    # recursive merge of nested dicts
                    new_val = merge(existing_val, value)
                else:
                    new_val = _extend([existing_val], value)
            else:
                new_val = _extend([existing_val], value)
            d[key] = new_val
    return d
def normalize_date(date):
    """Normalize date fields as expected by swh workers.

    If date is a list, elect arbitrarily the first element of that
    list

    If date is (then) a string, parse it through
    dateutil.parser.parse to extract a datetime.

    Then normalize it through
    swh.model.identifiers.normalize_timestamp.

    Returns
        The swh date object

    """
    candidate = date[0] if isinstance(date, list) else date
    if isinstance(candidate, str):
        candidate = iso8601.parse_date(candidate)
    return normalize_timestamp(candidate)
diff --git a/version.txt b/version.txt
index b0d69697..180421f6 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-v0.0.79-0-g76fc968
\ No newline at end of file
+v0.0.80-0-g030a342
\ No newline at end of file

File Metadata

Mime Type
text/x-diff
Expires
Thu, Jul 3, 12:26 PM (2 d, 6 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3390557

Event Timeline