diff --git a/swh/web/api/throttling.py b/swh/web/api/throttling.py index faad552e..b054a070 100644 --- a/swh/web/api/throttling.py +++ b/swh/web/api/throttling.py @@ -1,218 +1,223 @@ # Copyright (C) 2017-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from ipaddress import IPv4Network, IPv6Network, ip_address, ip_network from typing import Callable, List, TypeVar, Union from django.core.exceptions import ImproperlyConfigured import rest_framework from rest_framework.throttling import ScopedRateThrottle -from swh.web.auth.utils import API_SAVE_ORIGIN_PERMISSION +from swh.web.auth.utils import API_RAW_OBJECT_PERMISSION, API_SAVE_ORIGIN_PERMISSION from swh.web.common.exc import sentry_capture_exception from swh.web.config import get_config APIView = TypeVar("APIView", bound="rest_framework.views.APIView") Request = rest_framework.request.Request API_THROTTLING_EXEMPTED_PERM = "swh.web.api.throttling_exempted" class SwhWebRateThrottle(ScopedRateThrottle): """Custom DRF request rate limiter for anonymous users Requests are grouped into scopes. It enables to apply different requests rate limiting based on the scope name but also the input HTTP request types. To associate a scope to requests, one must add a 'throttle_scope' attribute when using a class based view, or call the 'throttle_scope' decorator when using a function based view. By default, requests do not have an associated scope and are not rate limited. Rate limiting can also be configured according to the type of the input HTTP requests for fine grained tuning. For instance, the following YAML configuration section sets a rate of: - 1 per minute for POST requests - 60 per minute for other request types for the 'swh_api' scope while exempting those coming from the 127.0.0.0/8 ip network. .. code-block:: yaml throttling: scopes: swh_api: limiter_rate: default: 60/m POST: 1/m exempted_networks: - 127.0.0.0/8 """ scope = None def __init__(self): super().__init__() self.exempted_networks = None self.num_requests = 0 self.duration = 0 def get_cache_key(self, request, view): # do not handle throttling if user is authenticated if request.user.is_authenticated: return None else: return super().get_cache_key(request, view) def get_exempted_networks( self, scope_name: str ) -> List[Union[IPv4Network, IPv6Network]]: if not self.exempted_networks: scopes = get_config()["throttling"]["scopes"] scope = scopes.get(scope_name) if scope: networks = scope.get("exempted_networks") if networks: self.exempted_networks = [ ip_network(network) for network in networks ] return self.exempted_networks def get_scope(self, view: APIView): if not self.scope: # class based view case return getattr(view, self.scope_attr, None) else: # function based view case return self.scope def allow_request(self, request: Request, view: APIView) -> bool: # class based view case if not self.scope: default_scope = getattr(view, self.scope_attr, None) request_allowed = None if default_scope is not None: # check if there is a specific rate limiting associated # to the request type assert request.method is not None request_scope = f"{default_scope}_{request.method.lower()}" setattr(view, self.scope_attr, request_scope) try: request_allowed = super().allow_request(request, view) # use default rate limiting otherwise except ImproperlyConfigured as exc: sentry_capture_exception(exc) setattr(view, self.scope_attr, default_scope) if request_allowed is None: request_allowed = super().allow_request(request, view) # function based view case else: default_scope = self.scope # check if there is a specific rate limiting associated # to the request type self.scope = default_scope + "_" + request.method.lower() try: self.rate = self.get_rate() # use default rate limiting otherwise except ImproperlyConfigured: self.scope = default_scope self.rate = self.get_rate() self.num_requests, self.duration = self.parse_rate(self.rate) request_allowed = super(ScopedRateThrottle, self).allow_request( request, view ) self.scope = default_scope exempted_networks = self.get_exempted_networks(default_scope) exempted_ip = False if exempted_networks: remote_address = ip_address(self.get_ident(request)) exempted_ip = any( remote_address in network for network in exempted_networks ) request_allowed = exempted_ip or request_allowed # set throttling related data in the request metadata # in order for the ThrottlingHeadersMiddleware to # add X-RateLimit-* headers in the HTTP response if not exempted_ip and hasattr(self, "history"): hit_count = len(self.history) request.META["RateLimit-Limit"] = self.num_requests request.META["RateLimit-Remaining"] = self.num_requests - hit_count wait = self.wait() if wait is not None: request.META["RateLimit-Reset"] = int(self.now + wait) return request_allowed class SwhWebUserRateThrottle(SwhWebRateThrottle): """Custom DRF request rate limiter for authenticated users It has the same behavior than :class:`swh.web.api.throttling.SwhWebRateThrottle` except the number of allowed requests for each throttle scope is increased by a 1Ox factor. """ NUM_REQUESTS_FACTOR = 10 def get_cache_key(self, request, view): # do not handle throttling if user is not authenticated if request.user.is_authenticated: return super(SwhWebRateThrottle, self).get_cache_key(request, view) else: return None def parse_rate(self, rate): # increase number of allowed requests num_requests, duration = super().parse_rate(rate) return (num_requests * self.NUM_REQUESTS_FACTOR, duration) def allow_request(self, request: Request, view: APIView) -> bool: if request.user.is_staff or request.user.has_perm(API_THROTTLING_EXEMPTED_PERM): # no throttling for staff users or users with adequate permission return True scope = self.get_scope(view) if scope == "swh_save_origin" and request.user.has_perm( API_SAVE_ORIGIN_PERMISSION ): # no throttling on save origin endpoint for users with adequate permission return True + if scope == "swh_raw_object" and request.user.has_perm( + API_RAW_OBJECT_PERMISSION + ): + # no throttling on raw object endpoint for users with adequate permission + return True return super().allow_request(request, view) def throttle_scope(scope: str) -> Callable[..., APIView]: """Decorator that allows the throttle scope of a DRF function based view to be set:: @api_view(['GET', ]) @throttle_scope('scope') def view(request): ... """ def decorator(func: APIView) -> APIView: SwhScopeRateThrottle = type( "SwhWebScopeRateThrottle", (SwhWebRateThrottle,), {"scope": scope} ) SwhScopeUserRateThrottle = type( "SwhWebScopeUserRateThrottle", (SwhWebUserRateThrottle,), {"scope": scope}, ) func.throttle_classes = (SwhScopeRateThrottle, SwhScopeUserRateThrottle) return func return decorator diff --git a/swh/web/api/views/raw.py b/swh/web/api/views/raw.py index dbe4d7c8..b6a91cc7 100644 --- a/swh/web/api/views/raw.py +++ b/swh/web/api/views/raw.py @@ -1,109 +1,118 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from django.http import HttpResponse +from rest_framework.exceptions import PermissionDenied from swh.model import model from swh.model.git_objects import ( content_git_object, directory_git_object, release_git_object, revision_git_object, snapshot_git_object, ) from swh.model.hashutil import hash_to_hex from swh.model.swhids import CoreSWHID, ObjectType from swh.storage.algos.directory import directory_get from swh.storage.algos.snapshot import snapshot_get_all_branches from swh.web.api.apidoc import api_doc, format_docstring from swh.web.api.apiurls import api_route +from swh.web.auth.utils import API_RAW_OBJECT_PERMISSION from swh.web.common import archive from swh.web.common.exc import NotFoundExc from swh.web.common.utils import SWHID_RE @api_route( f"/raw/(?P{SWHID_RE})/", "api-1-raw-object", + throttle_scope="swh_raw_object", ) @api_doc("/raw/") @format_docstring() def api_raw_object(request, swhid): """ .. http:get:: /api/1/raw/(swhid)/ Get the object corresponding to the SWHID in raw form. This endpoint exposes the internal representation (see the ``*_git_object`` functions in :mod:`swh.model.git_objects`), and so can be used to fetch a binary blob which hashes to the same identifier. + .. warning:: + That endpoint is not publicly available and requires authentication and + special user permission in order to be able to request it. + :param string swhid: the object's SWHID :resheader Content-Type: application/octet-stream :statuscode 200: no error :statuscode 404: the requested object can not be found in the archive **Example:** .. parsed-literal:: :swh_web_api:`raw/swh:1:snp:6a3a2cf0b2b90ce7ae1cf0a221ed68035b686f5a` """ + if not (request.user.is_staff or request.user.has_perm(API_RAW_OBJECT_PERMISSION)): + raise PermissionDenied() swhid = CoreSWHID.from_string(swhid) object_id = swhid.object_id object_type = swhid.object_type def not_found(): return NotFoundExc(f"Object with id {swhid} not found.") if object_type == ObjectType.CONTENT: results = archive.storage.content_find({"sha1_git": object_id}) if len(results) == 0: raise not_found() cnt = results[0] # `cnt.with_data()` unfortunately doesn't seem to work. if cnt.data is None: d = cnt.to_dict() d["data"] = archive.storage.content_get_data(cnt.sha1) cnt = model.Content.from_dict(d) assert cnt.data, f"Content {hash_to_hex(cnt.sha1)} ceased to exist" result = content_git_object(cnt) elif object_type == ObjectType.DIRECTORY: result = directory_get(archive.storage, object_id) if result is None: raise not_found() result = directory_git_object(result) elif object_type == ObjectType.REVISION: result = archive.storage.revision_get([object_id])[0] if result is None: raise not_found() result = revision_git_object(result) elif object_type == ObjectType.RELEASE: result = archive.storage.release_get([object_id])[0] if result is None: raise not_found() result = release_git_object(result) elif object_type == ObjectType.SNAPSHOT: result = snapshot_get_all_branches(archive.storage, object_id) if result is None: raise not_found() result = snapshot_git_object(result) else: raise ValueError(f"Unexpected object type variant: {object_type}") response = HttpResponse(result, content_type="application/octet-stream") filename = str(swhid).replace(":", "_") + "_raw" response["Content-disposition"] = f"attachment; filename={filename}" return response diff --git a/swh/web/auth/utils.py b/swh/web/auth/utils.py index c93708d0..96e1cae0 100644 --- a/swh/web/auth/utils.py +++ b/swh/web/auth/utils.py @@ -1,115 +1,116 @@ # Copyright (C) 2020-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from base64 import urlsafe_b64encode from typing import List from cryptography.fernet import Fernet from cryptography.hazmat.backends import default_backend from cryptography.hazmat.primitives import hashes from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC from django.contrib.auth.decorators import user_passes_test from django.http.request import HttpRequest from swh.web.common.exc import ForbiddenExc OIDC_SWH_WEB_CLIENT_ID = "swh-web" SWH_AMBASSADOR_PERMISSION = "swh.ambassador" API_SAVE_ORIGIN_PERMISSION = "swh.web.api.save_origin" ADMIN_LIST_DEPOSIT_PERMISSION = "swh.web.admin.list_deposits" MAILMAP_PERMISSION = "swh.web.mailmap" ADD_FORGE_MODERATOR_PERMISSION = "swh.web.add_forge_now.moderator" MAILMAP_ADMIN_PERMISSION = "swh.web.admin.mailmap" +API_RAW_OBJECT_PERMISSION = "swh.web.api.raw_object" def _get_fernet(password: bytes, salt: bytes) -> Fernet: """ Instantiate a Fernet system from a password and a salt value (see https://cryptography.io/en/latest/fernet/). Args: password: user password that will be used to generate a Fernet key derivation function salt: value that will be used to generate a Fernet key derivation function Returns: The Fernet system """ kdf = PBKDF2HMAC( algorithm=hashes.SHA256(), length=32, salt=salt, iterations=100000, backend=default_backend(), ) key = urlsafe_b64encode(kdf.derive(password)) return Fernet(key) def encrypt_data(data: bytes, password: bytes, salt: bytes) -> bytes: """ Encrypt data using Fernet system (symmetric encryption). Args: data: input data to encrypt password: user password that will be used to generate a Fernet key derivation function salt: value that will be used to generate a Fernet key derivation function Returns: The encrypted data """ return _get_fernet(password, salt).encrypt(data) def decrypt_data(data: bytes, password: bytes, salt: bytes) -> bytes: """ Decrypt data using Fernet system (symmetric encryption). Args: data: input data to decrypt password: user password that will be used to generate a Fernet key derivation function salt: value that will be used to generate a Fernet key derivation function Returns: The decrypted data """ return _get_fernet(password, salt).decrypt(data) def privileged_user(request: HttpRequest, permissions: List[str] = []) -> bool: """Determine whether a user is authenticated and is a privileged one (e.g ambassador). This allows such user to have access to some more actions (e.g. bypass save code now review, access to 'archives' type...). A user is considered as privileged if he is a staff member or has any permission from those provided as parameters. Args: request: Input django HTTP request permissions: list of permission names to determine if user is privileged or not Returns: Whether the user is privileged or not. """ user = request.user return user.is_authenticated and ( user.is_staff or any([user.has_perm(perm) for perm in permissions]) ) def any_permission_required(*perms): """View decorator granting access to it if user has at least one permission among those passed as parameters. """ def check_perms(user): if any(user.has_perm(perm) for perm in perms): return True raise ForbiddenExc return user_passes_test(check_perms) diff --git a/swh/web/settings/tests.py b/swh/web/settings/tests.py index bec4b3ae..8e626e7d 100644 --- a/swh/web/settings/tests.py +++ b/swh/web/settings/tests.py @@ -1,130 +1,134 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information """ Django tests settings for swh-web. """ import os import sys from swh.web.config import get_config scope1_limiter_rate = 3 scope1_limiter_rate_post = 1 scope2_limiter_rate = 5 scope2_limiter_rate_post = 2 scope3_limiter_rate = 1 scope3_limiter_rate_post = 1 save_origin_rate_post = 5 +api_raw_object_rate = 5 swh_web_config = get_config() _pytest = "pytest" in sys.argv[0] or "PYTEST_XDIST_WORKER" in os.environ swh_web_config.update( { # enable django debug mode only when running pytest "debug": _pytest, "secret_key": "test", "history_counters_url": "", "throttling": { "cache_uri": None, "scopes": { "swh_api": { "limiter_rate": {"default": "60/min"}, "exempted_networks": ["127.0.0.0/8"], }, "swh_api_origin_search": { "limiter_rate": {"default": "100/min"}, "exempted_networks": ["127.0.0.0/8"], }, "swh_api_origin_visit_latest": { "limiter_rate": {"default": "6000/min"}, "exempted_networks": ["127.0.0.0/8"], }, "swh_vault_cooking": { "limiter_rate": {"default": "120/h", "GET": "60/m"}, "exempted_networks": ["127.0.0.0/8"], }, "swh_save_origin": { "limiter_rate": { "default": "120/h", "POST": "%s/h" % save_origin_rate_post, } }, + "swh_raw_object": { + "limiter_rate": {"default": f"{api_raw_object_rate}/h"}, + }, "scope1": { "limiter_rate": { "default": "%s/min" % scope1_limiter_rate, "POST": "%s/min" % scope1_limiter_rate_post, } }, "scope2": { "limiter_rate": { "default": "%s/min" % scope2_limiter_rate, "POST": "%s/min" % scope2_limiter_rate_post, } }, "scope3": { "limiter_rate": { "default": "%s/min" % scope3_limiter_rate, "POST": "%s/min" % scope3_limiter_rate_post, }, "exempted_networks": ["127.0.0.0/8"], }, }, }, "keycloak": { # disable keycloak use when not running pytest "server_url": "http://localhost:8080/auth/" if _pytest else "", "realm_name": "SoftwareHeritage", }, } ) from .common import * # noqa from .common import LOGGING # noqa, isort: skip ALLOWED_HOSTS = ["*"] DATABASES = { "default": { "ENGINE": "django.db.backends.postgresql", "NAME": swh_web_config["test_db"]["name"], } } # when running cypress tests, make the webapp fetch data from memory storages if not _pytest: swh_web_config.update( { "debug": True, "e2e_tests_mode": True, # ensure scheduler not available to avoid side effects in cypress tests "scheduler": {"cls": "remote", "url": ""}, } ) from django.conf import settings from swh.web.tests.data import get_tests_data, override_storages test_data = get_tests_data() override_storages( test_data["storage"], test_data["idx_storage"], test_data["search"], test_data["counters"], ) # using sqlite3 for frontend tests settings.DATABASES["default"].update( {"ENGINE": "django.db.backends.sqlite3", "NAME": "swh-web-test.sqlite3"} ) else: # Silent DEBUG output when running unit tests LOGGING["handlers"]["console"]["level"] = "INFO" # type: ignore diff --git a/swh/web/tests/api/views/test_raw.py b/swh/web/tests/api/views/test_raw.py index 73369fdb..5e502b65 100644 --- a/swh/web/tests/api/views/test_raw.py +++ b/swh/web/tests/api/views/test_raw.py @@ -1,58 +1,115 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import hashlib +import pytest + from swh.model.hashutil import hash_to_bytes +from swh.web.api.throttling import SwhWebUserRateThrottle +from swh.web.auth.utils import API_RAW_OBJECT_PERMISSION from swh.web.common.utils import reverse +from swh.web.settings.tests import api_raw_object_rate from swh.web.tests.utils import ( check_api_get_responses, check_http_get_response, + create_django_permission, ) -def test_api_raw_not_found(api_client, unknown_core_swhid): +@pytest.fixture +def privileged_user(regular_user): + regular_user.user_permissions.add( + create_django_permission(API_RAW_OBJECT_PERMISSION) + ) + return regular_user + + +@pytest.mark.django_db +def test_api_raw_forbidden_for_anonymous_user(api_client, unknown_core_swhid): + url = reverse("api-1-raw-object", url_args={"swhid": str(unknown_core_swhid)}) + check_api_get_responses(api_client, url, status_code=403) + + +@pytest.mark.django_db +def test_api_raw_forbidden_for_user_without_permission( + api_client, regular_user, unknown_core_swhid +): + api_client.force_login(regular_user) + url = reverse("api-1-raw-object", url_args={"swhid": str(unknown_core_swhid)}) + check_api_get_responses(api_client, url, status_code=403) + + +@pytest.mark.django_db +def test_api_raw_not_found(api_client, unknown_core_swhid, staff_user): + api_client.force_login(staff_user) url = reverse("api-1-raw-object", url_args={"swhid": str(unknown_core_swhid)}) rv = check_api_get_responses(api_client, url, status_code=404) assert rv.data == { "exception": "NotFoundExc", "reason": f"Object with id {unknown_core_swhid} not found.", } -def _test_api_raw_hash(api_client, archive_data, object_id, object_ty): +def _test_api_raw_hash(api_client, privileged_user, archive_data, object_id, object_ty): + api_client.force_login(privileged_user) url = reverse( "api-1-raw-object", url_args={"swhid": f"swh:1:{object_ty}:{object_id}"}, ) rv = check_http_get_response(api_client, url, status_code=200) assert rv["Content-Type"] == "application/octet-stream" assert ( rv["Content-disposition"] == f"attachment; filename=swh_1_{object_ty}_{object_id}_raw" ) sha1_git = hashlib.new("sha1", rv.content).digest() assert sha1_git == hash_to_bytes(object_id) -def test_api_raw_content(api_client, archive_data, content): - _test_api_raw_hash(api_client, archive_data, content["sha1_git"], "cnt") +@pytest.mark.django_db +def test_api_raw_content(api_client, archive_data, content, privileged_user): + _test_api_raw_hash( + api_client, privileged_user, archive_data, content["sha1_git"], "cnt" + ) + + +@pytest.mark.django_db +def test_api_raw_directory(api_client, archive_data, directory, privileged_user): + _test_api_raw_hash(api_client, privileged_user, archive_data, directory, "dir") + + +@pytest.mark.django_db +def test_api_raw_revision(api_client, archive_data, revision, privileged_user): + _test_api_raw_hash(api_client, privileged_user, archive_data, revision, "rev") -def test_api_raw_directory(api_client, archive_data, directory): - _test_api_raw_hash(api_client, archive_data, directory, "dir") +@pytest.mark.django_db +def test_api_raw_release(api_client, archive_data, release, privileged_user): + _test_api_raw_hash(api_client, privileged_user, archive_data, release, "rel") -def test_api_raw_revision(api_client, archive_data, revision): - _test_api_raw_hash(api_client, archive_data, revision, "rev") +@pytest.mark.django_db +def test_api_raw_snapshot(api_client, archive_data, snapshot, privileged_user): + _test_api_raw_hash(api_client, privileged_user, archive_data, snapshot, "snp") -def test_api_raw_release(api_client, archive_data, release): - _test_api_raw_hash(api_client, archive_data, release, "rel") +@pytest.mark.django_db +def test_api_raw_no_rate_limit_for_privileged_user( + api_client, revision, privileged_user +): + + api_client.force_login(privileged_user) + + url = reverse( + "api-1-raw-object", + url_args={"swhid": f"swh:1:rev:{revision}"}, + ) + for _ in range(api_raw_object_rate * SwhWebUserRateThrottle.NUM_REQUESTS_FACTOR): + check_http_get_response(api_client, url, status_code=200) -def test_api_raw_snapshot(api_client, archive_data, snapshot): - _test_api_raw_hash(api_client, archive_data, snapshot, "snp") + check_http_get_response(api_client, url, status_code=200)