Limit the /api/1/known/ endpoint to 1000 persistent identifiers per request.

- add LargePayloadExc and map it to HTTP 413 (Payload Too Large) in the
  API error response
- reject POST requests to /api/1/known/ carrying more than 1000 PIDs
- add a test checking the 413 response and its error payload

diff --git a/swh/web/api/apiresponse.py b/swh/web/api/apiresponse.py
index 45a93e22..5b69e9e9 100644
--- a/swh/web/api/apiresponse.py
+++ b/swh/web/api/apiresponse.py
@@ -1,186 +1,191 @@
 # Copyright (C) 2017-2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import json
 import traceback
 
 from django.utils.html import escape
 
 from rest_framework.response import Response
 
 from swh.storage.exc import StorageDBError, StorageAPIError
 
 from swh.web.api import utils
-from swh.web.common.exc import NotFoundExc, ForbiddenExc, BadInputExc
+from swh.web.common.exc import (
+    NotFoundExc, ForbiddenExc,
+    BadInputExc, LargePayloadExc
+)
 from swh.web.common.utils import shorten_path, gen_path_info
 from swh.web.config import get_config
 
 
 def compute_link_header(rv, options):
     """Add Link header in returned value results.
 
     Args:
         request: a DRF Request object
         rv (dict): dictionary with keys:
 
             - headers: potential headers with 'link-next' and 'link-prev'
               keys
             - results: containing the result to return
         options (dict): the initial dict to update with result if any
 
     Returns:
         dict: dictionary with optional keys 'link-next' and 'link-prev'
 
     """
     link_headers = []
 
     if 'headers' not in rv:
         return {}
 
     rv_headers = rv['headers']
 
     if 'link-next' in rv_headers:
         link_headers.append('<%s>; rel="next"' % rv_headers['link-next'])
     if 'link-prev' in rv_headers:
         link_headers.append('<%s>; rel="previous"' % rv_headers['link-prev'])
 
     if link_headers:
         link_header_str = ','.join(link_headers)
         headers = options.get('headers', {})
         headers.update({
             'Link': link_header_str
         })
         return headers
 
     return {}
 
 
 def filter_by_fields(request, data):
     """Extract a request parameter 'fields' if it exists to permit the
     filtering on the data dict's keys.
 
     If such field is not provided, returns the data as is.
 
     """
     fields = request.query_params.get('fields')
     if fields:
         fields = set(fields.split(','))
         data = utils.filter_field_keys(data, fields)
 
     return data
 
 
 def transform(rv):
     """Transform an eventual returned value with multiple layer of
     information with only what's necessary.
 
     If the returned value rv contains the 'results' key, this is the
     associated value which is returned.
 
     Otherwise, return the initial dict without the potential 'headers'
     key.
 
     """
     if 'results' in rv:
         return rv['results']
 
     if 'headers' in rv:
         rv.pop('headers')
 
     return rv
 
 
 def make_api_response(request, data, doc_data={}, options={}):
     """Generates an API response based on the requested mimetype.
 
     Args:
         request: a DRF Request object
         data: raw data to return in the API response
         doc_data: documentation data for HTML response
         options: optional data that can be used to generate the response
 
     Returns:
         a DRF Response a object
 
     """
     if data:
         options['headers'] = compute_link_header(data, options)
         data = transform(data)
         data = filter_by_fields(request, data)
     doc_env = doc_data
     headers = {}
     if 'headers' in options:
         doc_env['headers_data'] = options['headers']
         headers = options['headers']
 
     # get request status code
     doc_env['status_code'] = options.get('status', 200)
 
     response_args = {'status': doc_env['status_code'],
                      'headers': headers,
                      'content_type': request.accepted_media_type}
 
     # when requesting HTML, typically when browsing the API through its
     # documented views, we need to enrich the input data with documentation
     # related ones and inform DRF that we request HTML template rendering
     if request.accepted_media_type == 'text/html':
 
         if data:
             data = json.dumps(data, sort_keys=True,
                               indent=4,
                               separators=(',', ': '))
         doc_env['response_data'] = data
         doc_env['heading'] = shorten_path(str(request.path))
 
         if 'route' in doc_env:
             doc_env['endpoint_path'] = gen_path_info(doc_env['route'])
 
         response_args['data'] = doc_env
         response_args['template_name'] = 'api/apidoc.html'
 
     # otherwise simply return the raw data and let DRF picks
     # the correct renderer (JSON or YAML)
     else:
         response_args['data'] = data
 
     return Response(**response_args)
 
 
 def error_response(request, error, doc_data):
     """Private function to create a custom error response.
 
     Args:
         request: a DRF Request object
         error: the exception that caused the error
         doc_data: documentation data for HTML response
 
     """
     error_code = 500
     if isinstance(error, BadInputExc):
         error_code = 400
     elif isinstance(error, NotFoundExc):
         error_code = 404
     elif isinstance(error, ForbiddenExc):
         error_code = 403
+    elif isinstance(error, LargePayloadExc):
+        error_code = 413
     elif isinstance(error, StorageDBError):
         error_code = 503
     elif isinstance(error, StorageAPIError):
         error_code = 503
 
     error_opts = {'status': error_code}
     error_data = {
         'exception': error.__class__.__name__,
         'reason': str(error),
     }
 
     if request.accepted_media_type == 'text/html':
         error_data['reason'] = escape(error_data['reason'])
 
     if get_config()['debug']:
         error_data['traceback'] = traceback.format_exc()
 
     return make_api_response(request, error_data, doc_data,
                              options=error_opts)
diff --git a/swh/web/api/views/identifiers.py b/swh/web/api/views/identifiers.py
index cc8c6567..bc9d8cf2 100644
--- a/swh/web/api/views/identifiers.py
+++ b/swh/web/api/views/identifiers.py
@@ -1,101 +1,107 @@
 # Copyright (C) 2018-2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from swh.web.common import service, utils
 from swh.web.common.utils import (
     resolve_swh_persistent_id,
     get_persistent_identifier
 )
 from swh.web.api.apidoc import api_doc, format_docstring
 from swh.web.api.apiurls import api_route
+from swh.web.common.exc import LargePayloadExc
 
 
 @api_route(r'/resolve/(?P<swh_id>.*)/',
            'api-1-resolve-swh-pid')
 @api_doc('/resolve/')
 @format_docstring()
 def api_resolve_swh_pid(request, swh_id):
     """
     .. http:get:: /api/1/resolve/(swh_id)/
 
         Resolve a Software Heritage persistent identifier.
 
         Try to resolve a provided `persistent identifier
         <https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html>`_
         into an url for browsing the pointed archive object. If the provided
         identifier is valid, the existence of the object in the archive
         will also be checked.
 
         :param string swh_id: a Software Heritage persistent identifier
 
         :>json string browse_url: the url for browsing the pointed object
         :>json object metadata: object holding optional parts of the persistent identifier
         :>json string namespace: the persistent identifier namespace
         :>json string object_id: the hash identifier of the pointed object
         :>json string object_type: the type of the pointed object
         :>json number scheme_version: the scheme version of the persistent identifier
 
         {common_headers}
 
         **Allowed HTTP Methods:** :http:method:`get`, :http:method:`head`,
         :http:method:`options`
 
         :statuscode 200: no error
         :statuscode 400: an invalid persistent identifier has been provided
         :statuscode 404: the pointed object does not exist in the archive
 
         **Example:**
 
         .. parsed-literal::
 
             :swh_web_api:`resolve/swh:1:rev:96db9023b881d7cd9f379b0c154650d6c108e9a3;origin=https://github.com/openssl/openssl/`
     """ # noqa
     # try to resolve the provided pid
     swh_id_resolved = resolve_swh_persistent_id(swh_id)
     # id is well-formed, now check that the pointed
     # object is present in the archive, NotFoundExc
     # will be raised otherwise
     swh_id_parsed = swh_id_resolved['swh_id_parsed']
     object_type = swh_id_parsed.object_type
     object_id = swh_id_parsed.object_id
     service.lookup_object(object_type, object_id)
     # id is well-formed and the pointed object exists
     swh_id_data = swh_id_parsed._asdict()
     swh_id_data['browse_url'] = request.build_absolute_uri(
         swh_id_resolved['browse_url'])
     return swh_id_data
 
 
 @api_route(r'/known/',
            'api-1-swh-pid-known', methods=['POST'])
 @api_doc('/known/', noargs=True, tags=['hidden'])
 @format_docstring()
 def api_swh_pid_known(request):
     """
     .. http:post:: /api/1/known/
 
         Check if a list of Software Heritage persistent identifier is present
         in the archive depending on their id (sha1_git).
 
         Returns:
             A dictionary with:
                 keys(str): Persistent identifier
                 values(dict): A dictionary containing the key 'known'. (true if
                 the pid is present, False otherwise)
 
     """
+    limit = 1000
+    if len(request.data) > limit:
+        raise LargePayloadExc('The maximum number of PIDs this endpoint can '
+                              'receive is %s' % limit)
+
     persistent_ids = [get_persistent_identifier(pid)
                       for pid in request.data]
 
     response = {str(pid): {'known': False} for pid in persistent_ids}
 
     # group pids by their type
     pids_by_type = utils.group_swh_persistent_identifiers(persistent_ids)
     # search for hashes not present in the storage
     missing_hashes = service.lookup_missing_hashes(pids_by_type)
 
     for pid in persistent_ids:
         if pid.object_id not in missing_hashes:
             response[str(pid)]['known'] = True
 
     return response
diff --git a/swh/web/common/exc.py b/swh/web/common/exc.py
index fea79b8d..fc94316d 100644
--- a/swh/web/common/exc.py
+++ b/swh/web/common/exc.py
@@ -1,125 +1,135 @@
 # Copyright (C) 2015-2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import traceback
 
 from django.http import HttpResponse
 from django.shortcuts import render
 from django.utils.safestring import mark_safe
 from django.utils.html import escape
 
 import sentry_sdk
 
 from swh.web.config import get_config
 
 
 class BadInputExc(ValueError):
     """Wrong request to the api.
 
     Example: Asking a content with the wrong identifier format.
 
     """
     pass
 
 
 class NotFoundExc(Exception):
     """Good request to the api but no result were found.
 
     Example: Asking a content with the right identifier format but
     that content does not exist.
 
     """
     pass
 
 
 class ForbiddenExc(Exception):
     """Good request to the api, forbidden result to return due to enforce
        policy.
 
     Example: Asking for a raw content which exists but whose mimetype
     is not text.
 
     """
     pass
 
 
+class LargePayloadExc(Exception):
+    """The input size is too large.
+
+    Example: Asking to resolve 10000 persistent identifier when the limit
+    is 1000.
+    """
+    pass
+
+
 http_status_code_message = {
     400: 'Bad Request',
     401: 'Unauthorized',
     403: 'Access Denied',
     404: 'Resource not found',
+    413: 'Payload Too Large',
     500: 'Internal Server Error',
     501: 'Not Implemented',
     502: 'Bad Gateway',
     503: 'Service unavailable'
 }
 
 
 def _generate_error_page(request, error_code, error_description):
     return render(request, 'error.html',
                   {'error_code': error_code,
                    'error_message': http_status_code_message[error_code],
                    'error_description': mark_safe(error_description)},
                   status=error_code)
 
 
 def swh_handle400(request, exception=None):
     """
     Custom Django HTTP error 400 handler for swh-web.
     """
     error_description = ('The server cannot process the request to %s due to '
                          'something that is perceived to be a client error.' %
                          escape(request.META['PATH_INFO']))
     return _generate_error_page(request, 400, error_description)
 
 
 def swh_handle403(request, exception=None):
     """
     Custom Django HTTP error 403 handler for swh-web.
     """
     error_description = ('The resource %s requires an authentication.' %
                          escape(request.META['PATH_INFO']))
     return _generate_error_page(request, 403, error_description)
 
 
 def swh_handle404(request, exception=None):
     """
     Custom Django HTTP error 404 handler for swh-web.
     """
     error_description = ('The resource %s could not be found on the server.' %
                          escape(request.META['PATH_INFO']))
     return _generate_error_page(request, 404, error_description)
 
 
 def swh_handle500(request):
     """
     Custom Django HTTP error 500 handler for swh-web.
     """
     error_description = ('An unexpected condition was encountered when '
                          'requesting resource %s.' %
                          escape(request.META['PATH_INFO']))
     return _generate_error_page(request, 500, error_description)
 
 
 def handle_view_exception(request, exc, html_response=True):
     """
     Function used to generate an error page when an exception
     was raised inside a swh-web browse view.
     """
     sentry_sdk.capture_exception(exc)
     error_code = 500
     error_description = '%s: %s' % (type(exc).__name__, str(exc))
     if get_config()['debug']:
         error_description = traceback.format_exc()
     if isinstance(exc, BadInputExc):
         error_code = 400
     if isinstance(exc, ForbiddenExc):
         error_code = 403
     if isinstance(exc, NotFoundExc):
         error_code = 404
     if html_response:
         return _generate_error_page(request, error_code, error_description)
     else:
         return HttpResponse(error_description, content_type='text/plain',
                             status=error_code)
diff --git a/swh/web/tests/api/views/test_identifiers.py b/swh/web/tests/api/views/test_identifiers.py
index 5d6edb0e..12e7a90b 100644
--- a/swh/web/tests/api/views/test_identifiers.py
+++ b/swh/web/tests/api/views/test_identifiers.py
@@ -1,142 +1,161 @@
 # Copyright (C) 2018-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 from hypothesis import given
 
 from swh.model.identifiers import (
     CONTENT, DIRECTORY, RELEASE, REVISION, SNAPSHOT
 )
 
 from swh.web.common.utils import reverse
 from swh.web.tests.data import random_sha1
 from swh.web.tests.strategies import (
     content, directory, origin, release, revision, snapshot,
     unknown_content, unknown_directory, unknown_release,
     unknown_revision, unknown_snapshot
 )
 
 
 @given(origin(), content(), directory(), release(), revision(), snapshot())
 def test_swh_id_resolve_success(api_client, origin, content, directory,
                                 release, revision, snapshot):
 
     for obj_type_short, obj_type, obj_id in (
             ('cnt', CONTENT, content['sha1_git']),
             ('dir', DIRECTORY, directory),
             ('rel', RELEASE, release),
             ('rev', REVISION, revision),
             ('snp', SNAPSHOT, snapshot)):
 
         swh_id = 'swh:1:%s:%s;origin=%s' % (obj_type_short, obj_id,
                                            origin['url'])
         url = reverse('api-1-resolve-swh-pid', url_args={'swh_id': swh_id})
 
         resp = api_client.get(url)
 
         if obj_type == CONTENT:
             url_args = {'query_string': 'sha1_git:%s' % obj_id}
         elif obj_type == SNAPSHOT:
             url_args = {'snapshot_id': obj_id}
         else:
             url_args = {'sha1_git': obj_id}
 
         browse_rev_url = reverse('browse-%s' % obj_type,
                                  url_args=url_args,
                                  query_params={'origin': origin['url']},
                                  request=resp.wsgi_request)
 
         expected_result = {
             'browse_url': browse_rev_url,
             'metadata': {'origin': origin['url']},
             'namespace': 'swh',
             'object_id': obj_id,
             'object_type': obj_type,
             'scheme_version': 1
         }
 
         assert resp.status_code == 200, resp.data
         assert resp.data == expected_result
 
 
 def test_swh_id_resolve_invalid(api_client):
     rev_id_invalid = '96db9023b8_foo_50d6c108e9a3'
     swh_id = 'swh:1:rev:%s' % rev_id_invalid
     url = reverse('api-1-resolve-swh-pid', url_args={'swh_id': swh_id})
 
     resp = api_client.get(url)
 
     assert resp.status_code == 400, resp.data
 
 
 @given(unknown_content(), unknown_directory(), unknown_release(),
        unknown_revision(), unknown_snapshot())
 def test_swh_id_resolve_not_found(api_client, unknown_content,
                                   unknown_directory, unknown_release,
                                   unknown_revision, unknown_snapshot):
 
     for obj_type_short, obj_id in (('cnt', unknown_content['sha1_git']),
                                    ('dir', unknown_directory),
                                    ('rel', unknown_release),
                                    ('rev', unknown_revision),
                                    ('snp', unknown_snapshot)):
 
         swh_id = 'swh:1:%s:%s' % (obj_type_short, obj_id)
 
         url = reverse('api-1-resolve-swh-pid', url_args={'swh_id': swh_id})
 
         resp = api_client.get(url)
 
         assert resp.status_code == 404, resp.data
 
 
 def test_swh_origin_id_not_resolvable(api_client):
     ori_pid = 'swh:1:ori:8068d0075010b590762c6cb5682ed53cb3c13deb'
     url = reverse('api-1-resolve-swh-pid', url_args={'swh_id': ori_pid})
     resp = api_client.get(url)
     assert resp.status_code == 400, resp.data
 
 
 @given(content(), directory())
 def test_api_known_swhpid_some_present(api_client, content, directory):
     content_ = 'swh:1:cnt:%s' % content['sha1_git']
     directory_ = 'swh:1:dir:%s' % directory
     unknown_revision_ = 'swh:1:rev:%s' % random_sha1()
     unknown_release_ = 'swh:1:rel:%s' % random_sha1()
     unknown_snapshot_ = 'swh:1:snp:%s' % random_sha1()
 
     input_pids = [content_, directory_, unknown_revision_,
                   unknown_release_, unknown_snapshot_]
 
     url = reverse('api-1-swh-pid-known')
 
     resp = api_client.post(url, data=input_pids, format='json',
                            HTTP_ACCEPT='application/json')
 
     assert resp.status_code == 200, resp.data
     assert resp['Content-Type'] == 'application/json'
     assert resp.data == {
         content_: {'known': True},
         directory_: {'known': True},
         unknown_revision_: {'known': False},
         unknown_release_: {'known': False},
         unknown_snapshot_: {'known': False}
     }
 
 
 def test_api_known_invalid_swhpid(api_client):
     invalid_pid_sha1 = ['swh:1:cnt:8068d0075010b590762c6cb5682ed53cb3c13de;']
     invalid_pid_type = ['swh:1:cnn:8068d0075010b590762c6cb5682ed53cb3c13deb']
 
     url = reverse('api-1-swh-pid-known')
 
     resp = api_client.post(url, data=invalid_pid_sha1, format='json',
                            HTTP_ACCEPT='application/json')
 
     assert resp.status_code == 400, resp.data
 
     resp2 = api_client.post(url, data=invalid_pid_type, format='json',
                             HTTP_ACCEPT='application/json')
 
-    assert resp2.status_code == 400, resp.data
+    assert resp2.status_code == 400, resp2.data
+
+
+def test_api_known_raises_large_payload_error(api_client):
+    random_pid = 'swh:1:cnt:8068d0075010b590762c6cb5682ed53cb3c13deb'
+    limit = 1000
+    err_msg = 'The maximum number of PIDs this endpoint can receive is 1000'
+
+    pids = [random_pid] * (limit + 1)
+
+    url = reverse('api-1-swh-pid-known')
+    resp = api_client.post(url, data=pids, format='json',
+                           HTTP_ACCEPT='application/json')
+
+    assert resp.status_code == 413, resp.data
+    assert resp['Content-Type'] == 'application/json'
+    assert resp.data == {
+        'exception': 'LargePayloadExc',
+        'reason': err_msg
+    }