diff --git a/swh/web/api/apiurls.py b/swh/web/api/apiurls.py index 206404ea..423299b8 100644 --- a/swh/web/api/apiurls.py +++ b/swh/web/api/apiurls.py @@ -1,127 +1,127 @@ # Copyright (C) 2017-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import functools from typing import Dict, List, Optional -from django.http import HttpResponse +from django.http.response import HttpResponseBase from django.utils.cache import add_never_cache_headers from rest_framework.decorators import api_view from swh.web.api import throttling from swh.web.api.apiresponse import make_api_response from swh.web.common.urlsindex import UrlsIndex class APIUrls(UrlsIndex): """ Class to manage API documentation URLs. - Indexes all routes documented using apidoc's decorators. - Tracks endpoint/request processing method relationships for use in generating related urls in API documentation """ _apidoc_routes = {} # type: Dict[str, Dict[str, str]] scope = "api" @classmethod def get_app_endpoints(cls) -> Dict[str, Dict[str, str]]: return cls._apidoc_routes @classmethod def add_doc_route( cls, route: str, docstring: str, noargs: bool = False, api_version: str = "1", **kwargs, ) -> None: """ Add a route to the self-documenting API reference """ route_name = route[1:-1].replace("/", "-") if not noargs: route_name = "%s-doc" % route_name route_view_name = "api-%s-%s" % (api_version, route_name) if route not in cls._apidoc_routes: d = { "docstring": docstring, "route": "/api/%s%s" % (api_version, route), "route_view_name": route_view_name, } for k, v in kwargs.items(): d[k] = v cls._apidoc_routes[route] = d def api_route( url_pattern: str, view_name: Optional[str] = None, methods: List[str] = ["GET", "HEAD", "OPTIONS"], throttle_scope: str = "swh_api", api_version: str = "1", checksum_args: Optional[List[str]] = None, 
never_cache: bool = False, ): """ Decorator to ease the registration of an API endpoint using the Django REST Framework. Args: url_pattern: the url pattern used by DRF to identify the API route view_name: the name of the API view associated to the route used to reverse the url methods: array of HTTP methods supported by the API route throttle_scope: Named scope for rate limiting api_version: web API version checksum_args: list of view argument names holding checksum values never_cache: define if api response must not be cached """ url_pattern = "^" + api_version + url_pattern + "$" def decorator(f): # create a DRF view from the wrapped function @api_view(methods) @throttling.throttle_scope(throttle_scope) @functools.wraps(f) def api_view_f(request, **kwargs): response = f(request, **kwargs) doc_data = None # check if response has been forwarded by api_doc decorator if isinstance(response, dict) and "doc_data" in response: doc_data = response["doc_data"] response = response["data"] # check if HTTP response needs to be created - if not isinstance(response, HttpResponse): + if not isinstance(response, HttpResponseBase): api_response = make_api_response( request, data=response, doc_data=doc_data ) else: api_response = response if never_cache: add_never_cache_headers(api_response) return api_response # small hacks for correctly generating API endpoints index doc api_view_f.__name__ = f.__name__ api_view_f.http_method_names = methods # register the route and its view in the endpoints index APIUrls.add_url_pattern(url_pattern, api_view_f, view_name) if checksum_args: APIUrls.add_redirect_for_checksum_args( view_name, [url_pattern], checksum_args ) return f return decorator diff --git a/swh/web/api/views/graph.py b/swh/web/api/views/graph.py index 825d5588..3d1b4c85 100644 --- a/swh/web/api/views/graph.py +++ b/swh/web/api/views/graph.py @@ -1,145 +1,158 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this 
distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from distutils.util import strtobool import json -from typing import Dict +from typing import Dict, Iterator, Union import requests +from django.http.response import StreamingHttpResponse from rest_framework.decorators import renderer_classes from rest_framework.request import Request from rest_framework.response import Response from swh.model.identifiers import ORIGIN, parse_swhid from swh.web.api.apidoc import api_doc from swh.web.api.apiurls import api_route from swh.web.api.renderers import PlainTextRenderer from swh.web.common import archive from swh.web.config import SWH_WEB_INTERNAL_SERVER_NAME, get_config API_GRAPH_PERM = "swh.web.api.graph" def _resolve_origin_swhid(swhid: str, origin_urls: Dict[str, str]) -> str: """ Resolve origin url from its swhid sha1 representation. """ parsed_swhid = parse_swhid(swhid) if parsed_swhid.object_type == ORIGIN: if parsed_swhid.object_id in origin_urls: return origin_urls[parsed_swhid.object_id] else: origin_info = list( archive.lookup_origins_by_sha1s([parsed_swhid.object_id]) )[0] assert origin_info is not None origin_urls[parsed_swhid.object_id] = origin_info["url"] return origin_info["url"] else: return swhid -def _resolve_origin_swhids_in_graph_response(response: requests.Response) -> str: +def _resolve_origin_swhids_in_graph_response( + response: requests.Response, +) -> Iterator[bytes]: """ Resolve origin urls from their swhid sha1 representations in graph service responses. 
""" content_type = response.headers["Content-Type"] origin_urls: Dict[str, str] = {} if content_type == "application/x-ndjson": - processed_response = [] - for line in response.text.split("\n")[:-1]: - swhids = json.loads(line) + for line in response.iter_lines(): + swhids = json.loads(line.decode("utf-8")) processed_line = [] for swhid in swhids: processed_line.append(_resolve_origin_swhid(swhid, origin_urls)) - processed_response.append(json.dumps(processed_line)) - return "\n".join(processed_response) + "\n" + yield (json.dumps(processed_line) + "\n").encode() elif content_type == "text/plain": - processed_response = [] - for line in response.text.split("\n")[:-1]: + for line in response.iter_lines(): processed_line = [] - swhids = line.split(" ") + swhids = line.decode("utf-8").split(" ") for swhid in swhids: processed_line.append(_resolve_origin_swhid(swhid, origin_urls)) - processed_response.append(" ".join(processed_line)) - return "\n".join(processed_response) + "\n" - return response.text + yield (" ".join(processed_line) + "\n").encode() + else: + for line in response.iter_lines(): + yield line + b"\n" @api_route(r"/graph/", "api-1-graph-doc") @api_doc("/graph/") def api_graph(request: Request) -> None: """ .. http:get:: /api/1/graph/(graph_query)/ Provide fast access to the graph representation of the Software Heritage archive. That endpoint acts as a proxy for the `Software Heritage Graph service `_. It provides fast access to the `graph representation `_ of the Software Heritage archive. For more details please refer to the `Graph REST API documentation `_. .. warning:: That endpoint is not publicly available and requires authentication and special user permission in order to be able to request it. 
:param string graph_query: query to forward to the Software Heritage Graph archive (see its `documentation `_) :query boolean resolve_origins: extra parameter defined by that proxy enabling to resolve origin urls from their sha1 representations :statuscode 200: no error :statuscode 400: an invalid graph query has been provided :statuscode 404: provided graph node cannot be found **Examples:** .. parsed-literal:: :swh_web_api:`graph/leaves/swh:1:dir:432d1b21c1256f7408a07c577b6974bbdbcc1323/` :swh_web_api:`graph/neighbors/swh:1:rev:f39d7d78b70e0f39facb1e4fab77ad3df5c52a35/` :swh_web_api:`graph/randomwalk/swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2/ori?direction=backward` :swh_web_api:`graph/randomwalk/swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2/ori?direction=backward&limit=-2` :swh_web_api:`graph/visit/nodes/swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc?direction=backward&resolve_origins=true` :swh_web_api:`graph/visit/edges/swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc?direction=backward&resolve_origins=true` :swh_web_api:`graph/visit/paths/swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb?direction=backward&resolve_origins=true` """ return None @api_route(r"/graph/(?P<graph_query>.+)/", "api-1-graph") @renderer_classes([PlainTextRenderer]) -def api_graph_proxy(request: Request, graph_query: str) -> Response: +def api_graph_proxy( + request: Request, graph_query: str +) -> Union[Response, StreamingHttpResponse]: if request.get_host() != SWH_WEB_INTERNAL_SERVER_NAME: if not bool(request.user and request.user.is_authenticated): return Response("Authentication credentials were not provided.", status=401) if not request.user.has_perm(API_GRAPH_PERM): return Response( "You do not have permission to perform this action.", status=403 ) graph_query_url = get_config()["graph"]["server_url"] graph_query_url += graph_query if request.GET: graph_query_url += "?" 
+ request.GET.urlencode(safe="/;:") - response = requests.get(graph_query_url) - response_text = response.text - resolve_origins = strtobool(request.GET.get("resolve_origins", "false")) - if response.status_code == 200 and resolve_origins: - response_text = _resolve_origin_swhids_in_graph_response(response) - return Response( - response_text, - status=response.status_code, - content_type=response.headers["Content-Type"], - ) + response = requests.get(graph_query_url, stream=True) + # graph stats and counter endpoint responses are not streamed + if response.headers.get("Transfer-Encoding") != "chunked": + return Response( + response.text, + status=response.status_code, + content_type=response.headers["Content-Type"], + ) + # other endpoint responses are streamed + else: + resolve_origins = strtobool(request.GET.get("resolve_origins", "false")) + if response.status_code == 200 and resolve_origins: + response_stream = _resolve_origin_swhids_in_graph_response(response) + else: + response_stream = map(lambda line: line + b"\n", response.iter_lines()) + return StreamingHttpResponse( + response_stream, + status=response.status_code, + content_type=response.headers["Content-Type"], + ) diff --git a/swh/web/tests/api/views/test_graph.py b/swh/web/tests/api/views/test_graph.py index a6fba6ea..15f7128b 100644 --- a/swh/web/tests/api/views/test_graph.py +++ b/swh/web/tests/api/views/test_graph.py @@ -1,245 +1,255 @@ # Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import hashlib import json import textwrap from hypothesis import given +from django.http.response import StreamingHttpResponse + from swh.model.identifiers import ORIGIN, SNAPSHOT, swhid from swh.web.api.views.graph import API_GRAPH_PERM from swh.web.common.utils import reverse from swh.web.config import 
SWH_WEB_INTERNAL_SERVER_NAME, get_config from swh.web.tests.auth.keycloak_mock import mock_keycloak from swh.web.tests.auth.sample_data import oidc_profile from swh.web.tests.strategies import origin from swh.web.tests.utils import check_http_get_response def test_graph_endpoint_no_authentication_for_vpn_users(api_client, requests_mock): graph_query = "stats" url = reverse("api-1-graph", url_args={"graph_query": graph_query}) requests_mock.get( get_config()["graph"]["server_url"] + graph_query, json={}, headers={"Content-Type": "application/json"}, ) check_http_get_response( api_client, url, status_code=200, server_name=SWH_WEB_INTERNAL_SERVER_NAME ) def test_graph_endpoint_needs_authentication(api_client): url = reverse("api-1-graph", url_args={"graph_query": "stats"}) check_http_get_response(api_client, url, status_code=401) def _authenticate_graph_user(api_client, mocker): mock_keycloak(mocker, user_permissions=[API_GRAPH_PERM]) api_client.credentials(HTTP_AUTHORIZATION=f"Bearer {oidc_profile['refresh_token']}") def test_graph_endpoint_needs_permission(api_client, mocker, requests_mock): graph_query = "stats" url = reverse("api-1-graph", url_args={"graph_query": graph_query}) api_client.credentials(HTTP_AUTHORIZATION=f"Bearer {oidc_profile['refresh_token']}") mock_keycloak(mocker, user_permissions=[]) check_http_get_response(api_client, url, status_code=403) _authenticate_graph_user(api_client, mocker) requests_mock.get( get_config()["graph"]["server_url"] + graph_query, json={}, headers={"Content-Type": "application/json"}, ) check_http_get_response(api_client, url, status_code=200) def test_graph_text_plain_response(api_client, mocker, requests_mock): _authenticate_graph_user(api_client, mocker) graph_query = "leaves/swh:1:dir:432d1b21c1256f7408a07c577b6974bbdbcc1323" response_text = textwrap.dedent( """\ swh:1:cnt:1d3dace0a825b0535c37c53ed669ef817e9c1b47 swh:1:cnt:6d5b280f4e33589ae967a7912a587dd5cb8dedaa swh:1:cnt:91bef238bf01356a550d416d14bb464c576ac6f4 
swh:1:cnt:58a8b925a463b87d49639fda282b8f836546e396 swh:1:cnt:fd32ee0a87e16ccc853dfbeb7018674f9ce008c0 swh:1:cnt:ab7c39871872589a4fc9e249ebc927fb1042c90d swh:1:cnt:93073c02bf3869845977527de16af4d54765838d swh:1:cnt:4251f795b52c54c447a97c9fe904d8b1f993b1e0 swh:1:cnt:c6e7055424332006d07876ffeba684e7e284b383 swh:1:cnt:8459d8867dc3b15ef7ae9683e21cccc9ab2ec887 swh:1:cnt:5f9981d52202815aa947f85b9dfa191b66f51138 swh:1:cnt:00a685ec51bcdf398c15d588ecdedb611dbbab4b swh:1:cnt:e1cf1ea335106a0197a2f92f7804046425a7d3eb swh:1:cnt:07069b38087f88ec192d2c9aff75a502476fd17d swh:1:cnt:f045ee845c7f14d903a2c035b2691a7c400c01f0 """ ) requests_mock.get( get_config()["graph"]["server_url"] + graph_query, text=response_text, - headers={"Content-Type": "text/plain"}, + headers={"Content-Type": "text/plain", "Transfer-Encoding": "chunked"}, ) url = reverse("api-1-graph", url_args={"graph_query": graph_query}) resp = check_http_get_response( api_client, url, status_code=200, content_type="text/plain" ) - assert resp.content == response_text.encode() + assert isinstance(resp, StreamingHttpResponse) + assert b"".join(resp.streaming_content) == response_text.encode() _response_json = { "counts": {"nodes": 17075708289, "edges": 196236587976}, "ratios": { "compression": 0.16, "bits_per_node": 58.828, "bits_per_edge": 5.119, "avg_locality": 2184278529.729, }, "indegree": {"min": 0, "max": 263180117, "avg": 11.4921492364925}, "outdegree": {"min": 0, "max": 1033207, "avg": 11.4921492364925}, } def test_graph_json_response(api_client, mocker, requests_mock): _authenticate_graph_user(api_client, mocker) graph_query = "stats" requests_mock.get( get_config()["graph"]["server_url"] + graph_query, json=_response_json, headers={"Content-Type": "application/json"}, ) url = reverse("api-1-graph", url_args={"graph_query": graph_query}) resp = check_http_get_response(api_client, url, status_code=200) assert resp.content_type == "application/json" assert resp.content == json.dumps(_response_json).encode() def 
test_graph_ndjson_response(api_client, mocker, requests_mock): _authenticate_graph_user(api_client, mocker) graph_query = "visit/paths/swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb" response_ndjson = textwrap.dedent( """\ ["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb",\ "swh:1:cnt:acfb7cabd63b368a03a9df87670ece1488c8bce0"] ["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb",\ "swh:1:cnt:2a0837708151d76edf28fdbb90dc3eabc676cff3"] ["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb",\ "swh:1:cnt:eaf025ad54b94b2fdda26af75594cfae3491ec75"] """ ) requests_mock.get( get_config()["graph"]["server_url"] + graph_query, text=response_ndjson, - headers={"Content-Type": "application/x-ndjson"}, + headers={ + "Content-Type": "application/x-ndjson", + "Transfer-Encoding": "chunked", + }, ) url = reverse("api-1-graph", url_args={"graph_query": graph_query}) resp = check_http_get_response(api_client, url, status_code=200) - assert resp.content_type == "application/x-ndjson" - assert resp.content == response_ndjson.encode() + assert isinstance(resp, StreamingHttpResponse) + assert resp["Content-Type"] == "application/x-ndjson" + assert b"".join(resp.streaming_content) == response_ndjson.encode() @given(origin()) def test_graph_response_resolve_origins( archive_data, api_client, mocker, requests_mock, origin ): hasher = hashlib.sha1() hasher.update(origin["url"].encode()) origin_sha1 = hasher.hexdigest() origin_swhid = str(swhid(ORIGIN, origin_sha1)) snapshot = archive_data.snapshot_get_latest(origin["url"])["id"] snapshot_swhid = str(swhid(SNAPSHOT, snapshot)) _authenticate_graph_user(api_client, mocker) for graph_query, response_text, content_type in ( ( f"visit/nodes/{snapshot_swhid}", f"{snapshot_swhid}\n{origin_swhid}\n", "text/plain", ), ( f"visit/edges/{snapshot_swhid}", f"{snapshot_swhid} {origin_swhid}\n", "text/plain", ), ( f"visit/paths/{snapshot_swhid}", f'["{snapshot_swhid}", "{origin_swhid}"]\n', "application/x-ndjson", ), ): # set two lines response to 
check resolved origins cache response_text = response_text + response_text requests_mock.get( get_config()["graph"]["server_url"] + graph_query, text=response_text, - headers={"Content-Type": content_type}, + headers={"Content-Type": content_type, "Transfer-Encoding": "chunked"}, ) url = reverse( "api-1-graph", url_args={"graph_query": graph_query}, query_params={"direction": "backward"}, ) resp = check_http_get_response(api_client, url, status_code=200) - assert resp.content_type == content_type - assert resp.content == response_text.encode() + assert isinstance(resp, StreamingHttpResponse) + assert resp["Content-Type"] == content_type + assert b"".join(resp.streaming_content) == response_text.encode() url = reverse( "api-1-graph", url_args={"graph_query": graph_query}, query_params={"direction": "backward", "resolve_origins": "true"}, ) resp = check_http_get_response(api_client, url, status_code=200) - assert resp.content_type == content_type + assert isinstance(resp, StreamingHttpResponse) + assert resp["Content-Type"] == content_type assert ( - resp.content == response_text.replace(origin_swhid, origin["url"]).encode() + b"".join(resp.streaming_content) + == response_text.replace(origin_swhid, origin["url"]).encode() ) def test_graph_response_resolve_origins_nothing_to_do( api_client, mocker, requests_mock ): _authenticate_graph_user(api_client, mocker) graph_query = "stats" requests_mock.get( get_config()["graph"]["server_url"] + graph_query, json=_response_json, headers={"Content-Type": "application/json"}, ) url = reverse( "api-1-graph", url_args={"graph_query": graph_query}, query_params={"resolve_origins": "true"}, ) resp = check_http_get_response(api_client, url, status_code=200) assert resp.content_type == "application/json" assert resp.content == json.dumps(_response_json).encode() diff --git a/swh/web/tests/utils.py b/swh/web/tests/utils.py index d9c7bb1f..d3d63a37 100644 --- a/swh/web/tests/utils.py +++ b/swh/web/tests/utils.py @@ -1,207 +1,209 @@ # 
Copyright (C) 2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Any, Dict, Optional, cast -from django.http import HttpResponse +from django.http import HttpResponse, StreamingHttpResponse from django.test.client import Client from rest_framework.response import Response from rest_framework.test import APIClient from swh.web.tests.django_asserts import assert_template_used def _assert_http_response( response: HttpResponse, status_code: int, content_type: str ) -> HttpResponse: if isinstance(response, Response): drf_response = cast(Response, response) error_context = ( drf_response.data.pop("traceback") if isinstance(drf_response.data, dict) and "traceback" in drf_response.data else drf_response.data ) + elif isinstance(response, StreamingHttpResponse): + error_context = getattr(response, "traceback", response.streaming_content) else: error_context = getattr(response, "traceback", response.content) assert response.status_code == status_code, error_context if content_type != "*/*": assert response["Content-Type"].startswith(content_type) return response def check_http_get_response( client: Client, url: str, status_code: int, content_type: str = "*/*", http_origin: Optional[str] = None, server_name: Optional[str] = None, ) -> HttpResponse: """Helper function to check HTTP response for a GET request. 
Args: client: Django test client url: URL to check response status_code: expected HTTP status code content_type: expected response content type http_origin: optional HTTP_ORIGIN header value Returns: The HTTP response """ return _assert_http_response( response=client.get( url, HTTP_ACCEPT=content_type, HTTP_ORIGIN=http_origin, SERVER_NAME=server_name if server_name else "testserver", ), status_code=status_code, content_type=content_type, ) def check_http_post_response( client: Client, url: str, status_code: int, content_type: str = "*/*", data: Optional[Dict[str, Any]] = None, http_origin: Optional[str] = None, ) -> HttpResponse: """Helper function to check HTTP response for a POST request. Args: client: Django test client url: URL to check response status_code: expected HTTP status code content_type: expected response content type data: optional POST data Returns: The HTTP response """ return _assert_http_response( response=client.post( url, data=data, content_type="application/json", HTTP_ACCEPT=content_type, HTTP_ORIGIN=http_origin, ), status_code=status_code, content_type=content_type, ) def check_api_get_responses( api_client: APIClient, url: str, status_code: int ) -> Response: """Helper function to check Web API responses for GET requests for all accepted content types (JSON, YAML, HTML). 
Args: api_client: DRF test client url: Web API URL to check responses status_code: expected HTTP status code Returns: The Web API JSON response """ # check JSON response response_json = check_http_get_response( api_client, url, status_code, content_type="application/json" ) # check HTML response (API Web UI) check_http_get_response(api_client, url, status_code, content_type="text/html") # check YAML response check_http_get_response( api_client, url, status_code, content_type="application/yaml" ) return cast(Response, response_json) def check_api_post_response( api_client: APIClient, url: str, status_code: int, content_type: str = "*/*", data: Optional[Dict[str, Any]] = None, ) -> HttpResponse: """Helper function to check Web API response for a POST request for all accepted content types. Args: api_client: DRF test client url: Web API URL to check response status_code: expected HTTP status code Returns: The HTTP response """ return _assert_http_response( response=api_client.post( url, data=data, format="json", HTTP_ACCEPT=content_type, ), status_code=status_code, content_type=content_type, ) def check_api_post_responses( api_client: APIClient, url: str, status_code: int, data: Optional[Dict[str, Any]] = None, ) -> Response: """Helper function to check Web API responses for POST requests for all accepted content types (JSON, YAML). Args: api_client: DRF test client url: Web API URL to check responses status_code: expected HTTP status code Returns: The Web API JSON response """ # check JSON response response_json = check_api_post_response( api_client, url, status_code, content_type="application/json", data=data ) # check YAML response check_api_post_response( api_client, url, status_code, content_type="application/yaml", data=data ) return cast(Response, response_json) def check_html_get_response( client: Client, url: str, status_code: int, template_used: Optional[str] = None ) -> HttpResponse: """Helper function to check HTML responses for a GET request. 
Args: client: Django test client url: URL to check responses status_code: expected HTTP status code template_used: optional used Django template to check Returns: The HTML response """ response = check_http_get_response( client, url, status_code, content_type="text/html" ) if template_used is not None: assert_template_used(response, template_used) return response