Page MenuHomeSoftware Heritage

No OneTemporary

diff --git a/swh/web/api/apiresponse.py b/swh/web/api/apiresponse.py
index aaf63ad7..c99c87bb 100644
--- a/swh/web/api/apiresponse.py
+++ b/swh/web/api/apiresponse.py
@@ -1,229 +1,231 @@
-# Copyright (C) 2017-2019 The Software Heritage developers
+# Copyright (C) 2017-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
import logging
import traceback
from typing import Any, Dict, Optional
from django.http import HttpResponse
from django.shortcuts import render
from django.utils.cache import add_never_cache_headers
from django.utils.html import escape
from rest_framework.exceptions import APIException
from rest_framework.request import Request
from rest_framework.response import Response
from rest_framework.utils.encoders import JSONEncoder
from swh.storage.exc import StorageAPIError, StorageDBError
from swh.web.api import utils
from swh.web.common.exc import (
BadInputExc,
ForbiddenExc,
LargePayloadExc,
NotFoundExc,
sentry_capture_exception,
)
from swh.web.common.utils import gen_path_info, shorten_path
from swh.web.config import get_config
logger = logging.getLogger("django")
def compute_link_header(rv: Dict[str, Any], options: Dict[str, Any]) -> Dict[str, Any]:
    """Build the response headers carrying the pagination links, if any.

    Args:
        rv (dict): dictionary with keys:

            - headers: potential headers with 'link-next' and 'link-prev'
              keys
            - results: containing the result to return

        options (dict): response options; its 'headers' entry, when set, is
            used as the base of the returned headers and is left untouched

    Returns:
        dict: a copy of ``options['headers']`` extended with a 'Link' entry
        when ``rv['headers']`` holds 'link-next' and/or 'link-prev', an empty
        dict otherwise
    """
    # membership test (rather than .get) because rv is not guaranteed to be
    # a mapping: the caller forwards whatever payload the view returned
    if "headers" not in rv:
        return {}
    rv_headers = rv["headers"]

    link_headers = [
        '<%s>; rel="%s"' % (rv_headers[key], rel)
        for key, rel in (("link-next", "next"), ("link-prev", "previous"))
        if key in rv_headers
    ]
    if not link_headers:
        return {}

    # work on a copy so the dict owned by the caller is never mutated in place
    headers = dict(options.get("headers") or {})
    headers["Link"] = ",".join(link_headers)
    return headers
def filter_by_fields(request: Request, data: Dict[str, Any]) -> Dict[str, Any]:
    """Restrict ``data`` to the keys listed in the 'fields' query parameter.

    The parameter value is split on commas. When it is absent or empty,
    ``data`` is returned unchanged.
    """
    requested = request.query_params.get("fields")
    if not requested:
        return data
    return utils.filter_field_keys(data, set(requested.split(",")))
def transform(rv: Dict[str, Any]) -> Dict[str, Any]:
    """Strip the transport-level wrapping from a returned value.

    When ``rv`` contains a 'results' key, only the associated value is
    returned. Otherwise ``rv`` itself is returned, after its 'headers' key
    (if any) has been removed in place.
    """
    if "results" not in rv:
        if "headers" in rv:
            rv.pop("headers")
        return rv
    return rv["results"]
def make_api_response(
request: Request,
data: Dict[str, Any],
doc_data: Optional[Dict[str, Any]] = None,
options: Optional[Dict[str, Any]] = None,
) -> HttpResponse:
"""Generates an API response based on the requested mimetype.
Args:
request: a DRF Request object
data: raw data to return in the API response
doc_data: documentation data for HTML response
options: optional data that can be used to generate the response
(its 'headers' and 'status' entries are read)
Returns:
the rendered "api/apidoc.html" page when the accepted media type is
"text/html", a DRF Response object otherwise
"""
options = options or {}
# extract the pagination Link header, unwrap the 'results' entry, then
# apply the optional 'fields' query parameter filtering
if data:
options["headers"] = compute_link_header(data, options)
data = transform(data)
data = filter_by_fields(request, data)
doc_data = doc_data or {}
headers = {}
if "headers" in options:
doc_data["headers_data"] = options["headers"]
headers = options["headers"]
# get request status code
doc_data["status_code"] = options.get("status", 200)
+ accepted_media_type = getattr(request, "accepted_media_type", "application/json")
+
# when requesting HTML, typically when browsing the API through its
# documented views, we need to enrich the input data with documentation
# and render the apidoc HTML template
- if request.accepted_media_type == "text/html":
+ if accepted_media_type == "text/html":
doc_data["response_data"] = data
if data is not None:
doc_data["response_data"] = json.dumps(
data, cls=JSONEncoder, sort_keys=True, indent=4, separators=(",", ": ")
)
doc_data["heading"] = shorten_path(str(request.path))
# generate breadcrumbs data
if "route" in doc_data:
doc_data["endpoint_path"] = gen_path_info(doc_data["route"])
for i in range(len(doc_data["endpoint_path"]) - 1):
doc_data["endpoint_path"][i]["path"] += "/doc/"
if not doc_data["noargs"]:
doc_data["endpoint_path"][-1]["path"] += "/doc/"
response = render(
request, "api/apidoc.html", doc_data, status=doc_data["status_code"]
)
# otherwise simply return the raw data and let DRF pick
# the correct renderer (JSON or YAML)
else:
response = Response(
data,
headers=headers,
- content_type=request.accepted_media_type,
+ content_type=accepted_media_type,
status=doc_data["status_code"],
)
# honour the 'never_cache' flag when it has been set on the request
if getattr(request, "never_cache", False):
add_never_cache_headers(response)
return response
def error_response(
request: Request, exception: Exception, doc_data: Dict[str, Any]
) -> HttpResponse:
"""Private function to create a custom error response.
Args:
request: a DRF Request object
exception: the exception that caused the error
doc_data: documentation data for HTML response
Returns:
the response built by make_api_response, carrying the exception class
name and message, with a status code derived from the exception type
(500 when the type is not one of those handled below)
"""
error_code = 500
if isinstance(exception, BadInputExc):
error_code = 400
elif isinstance(exception, NotFoundExc):
error_code = 404
elif isinstance(exception, ForbiddenExc):
error_code = 403
elif isinstance(exception, LargePayloadExc):
error_code = 413
elif isinstance(exception, StorageDBError):
error_code = 503
elif isinstance(exception, StorageAPIError):
error_code = 503
elif isinstance(exception, APIException):
error_code = exception.status_code
error_opts = {"status": error_code}
error_data = {
"exception": exception.__class__.__name__,
"reason": str(exception),
}
# the reason is embedded in the HTML page, hence the escaping
- if request.accepted_media_type == "text/html":
+ if getattr(request, "accepted_media_type", None) == "text/html":
error_data["reason"] = escape(error_data["reason"])
# tracebacks are only exposed when the 'debug' configuration entry is set
if get_config()["debug"]:
error_data["traceback"] = traceback.format_exc()
logger.debug(error_data["traceback"])
return make_api_response(request, error_data, doc_data, options=error_opts)
def error_response_handler(
    exc: Exception, context: Dict[str, Any]
) -> Optional[HttpResponse]:
    """Custom DRF exception handler used to generate API error responses.

    The exception is first reported through ``sentry_capture_exception``,
    then turned into an error response for the request found in ``context``.
    The exception's ``doc_data`` attribute, when it has one, is forwarded as
    documentation data.
    """
    sentry_capture_exception(exc)
    extra_doc = getattr(exc, "doc_data", None)
    drf_request = context["request"]
    return error_response(drf_request, exc, extra_doc)
diff --git a/swh/web/api/views/graph.py b/swh/web/api/views/graph.py
index 89846537..edd08521 100644
--- a/swh/web/api/views/graph.py
+++ b/swh/web/api/views/graph.py
@@ -1,160 +1,161 @@
-# Copyright (C) 2020 The Software Heritage developers
+# Copyright (C) 2020-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
from distutils.util import strtobool
import json
from typing import Dict, Iterator, Union
import requests
from django.http.response import StreamingHttpResponse
from rest_framework.decorators import renderer_classes
+from rest_framework.renderers import JSONRenderer
from rest_framework.request import Request
from rest_framework.response import Response
from swh.model.hashutil import hash_to_hex
from swh.model.model import Sha1Git
from swh.model.swhids import ExtendedObjectType, ExtendedSWHID
from swh.web.api.apidoc import api_doc
from swh.web.api.apiurls import api_route
from swh.web.api.renderers import PlainTextRenderer
from swh.web.common import archive
from swh.web.config import SWH_WEB_INTERNAL_SERVER_NAME, get_config
API_GRAPH_PERM = "swh.web.api.graph"
def _resolve_origin_swhid(swhid: str, origin_urls: Dict[Sha1Git, str]) -> str:
    """
    Resolve origin url from its swhid sha1 representation.

    SWHIDs that do not designate an origin are returned unchanged.
    ``origin_urls`` acts as a cache keyed by origin sha1: it is looked up
    first and filled in after each archive lookup.
    """
    parsed_swhid = ExtendedSWHID.from_string(swhid)
    if parsed_swhid.object_type != ExtendedObjectType.ORIGIN:
        return swhid

    if parsed_swhid.object_id in origin_urls:
        return origin_urls[parsed_swhid.object_id]

    lookup_result = archive.lookup_origins_by_sha1s(
        [hash_to_hex(parsed_swhid.object_id)]
    )
    origin_info = list(lookup_result)[0]
    assert origin_info is not None
    origin_urls[parsed_swhid.object_id] = origin_info["url"]
    return origin_info["url"]
def _resolve_origin_swhids_in_graph_response(
    response: requests.Response,
) -> Iterator[bytes]:
    """
    Resolve origin urls from their swhid sha1 representations in graph service
    responses.

    The response is consumed line by line. Lines of "application/x-ndjson"
    responses are decoded as JSON lists of SWHIDs, lines of "text/plain"
    responses are split on single spaces. Lines of any other content type
    are yielded as received, with a newline appended.
    """
    content_type = response.headers["Content-Type"]
    # shared across all lines so each origin is looked up only once
    origin_urls: Dict[Sha1Git, str] = {}

    if content_type == "application/x-ndjson":
        for raw_line in response.iter_lines():
            resolved = [
                _resolve_origin_swhid(swhid, origin_urls)
                for swhid in json.loads(raw_line.decode("utf-8"))
            ]
            yield (json.dumps(resolved) + "\n").encode()
    elif content_type == "text/plain":
        for raw_line in response.iter_lines():
            resolved = [
                _resolve_origin_swhid(swhid, origin_urls)
                for swhid in raw_line.decode("utf-8").split(" ")
            ]
            yield (" ".join(resolved) + "\n").encode()
    else:
        for raw_line in response.iter_lines():
            yield raw_line + b"\n"
@api_route(r"/graph/", "api-1-graph-doc")
@api_doc("/graph/")
def api_graph(request: Request) -> None:
"""
.. http:get:: /api/1/graph/(graph_query)/
Provide fast access to the graph representation of the Software Heritage
archive.
That endpoint acts as a proxy for the `Software Heritage Graph service
<https://docs.softwareheritage.org/devel/swh-graph/index.html>`_.
It provides fast access to the `graph representation
<https://docs.softwareheritage.org/devel/swh-model/data-model.html#data-structure>`_
of the Software Heritage archive.
For more details please refer to the `Graph RPC API documentation
<https://docs.softwareheritage.org/devel/swh-graph/api.html>`_.
.. warning::
That endpoint is not publicly available and requires authentication and
special user permission in order to be able to request it.
:param string graph_query: query to forward to the Software Heritage Graph
archive (see its `documentation
<https://docs.softwareheritage.org/devel/swh-graph/api.html>`_)
:query boolean resolve_origins: extra parameter defined by that proxy enabling
to resolve origin urls from their sha1 representations
:statuscode 200: no error
:statuscode 400: an invalid graph query has been provided
:statuscode 404: provided graph node cannot be found
**Examples:**
.. parsed-literal::
:swh_web_api:`graph/leaves/swh:1:dir:432d1b21c1256f7408a07c577b6974bbdbcc1323/`
:swh_web_api:`graph/neighbors/swh:1:rev:f39d7d78b70e0f39facb1e4fab77ad3df5c52a35/`
:swh_web_api:`graph/randomwalk/swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2/ori?direction=backward`
:swh_web_api:`graph/randomwalk/swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2/ori?direction=backward&limit=-2`
:swh_web_api:`graph/visit/nodes/swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc?direction=backward&resolve_origins=true`
:swh_web_api:`graph/visit/edges/swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc?direction=backward&resolve_origins=true`
:swh_web_api:`graph/visit/paths/swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb?direction=backward&resolve_origins=true`
"""
# NOTE(review): this view has no logic of its own; presumably the @api_doc
# decorator turns the docstring above into the endpoint documentation, so
# treat that docstring as runtime data -- confirm before rewording it.
return None
@api_route(r"/graph/(?P<graph_query>.+)/", "api-1-graph")
-@renderer_classes([PlainTextRenderer])
+@renderer_classes([JSONRenderer, PlainTextRenderer])
def api_graph_proxy(
request: Request, graph_query: str
) -> Union[Response, StreamingHttpResponse]:
# requests addressed to the internal server name skip both the
# authentication and the permission checks
if request.get_host() != SWH_WEB_INTERNAL_SERVER_NAME:
if not bool(request.user and request.user.is_authenticated):
return Response("Authentication credentials were not provided.", status=401)
if not request.user.has_perm(API_GRAPH_PERM):
return Response(
"You do not have permission to perform this action.", status=403
)
# forward the query, along with its GET parameters, to the configured
# graph server
graph_query_url = get_config()["graph"]["server_url"]
graph_query_url += graph_query
if request.GET:
graph_query_url += "?" + request.GET.urlencode(safe="/;:")
# NOTE(review): no timeout is passed to requests.get, so this call can wait
# indefinitely on an unresponsive graph server -- consider adding one.
response = requests.get(graph_query_url, stream=True)
# graph stats and counter endpoint responses are not streamed
if response.headers.get("Transfer-Encoding") != "chunked":
return Response(
- response.text,
+ response.json(),
status=response.status_code,
content_type=response.headers["Content-Type"],
)
# other endpoint responses are streamed
else:
resolve_origins = strtobool(request.GET.get("resolve_origins", "false"))
# origins are only resolved for successful responses; otherwise the lines
# are relayed as received, each terminated by a newline
if response.status_code == 200 and resolve_origins:
response_stream = _resolve_origin_swhids_in_graph_response(response)
else:
response_stream = map(lambda line: line + b"\n", response.iter_lines())
return StreamingHttpResponse(
response_stream,
status=response.status_code,
content_type=response.headers["Content-Type"],
)
diff --git a/swh/web/tests/api/views/test_graph.py b/swh/web/tests/api/views/test_graph.py
index 81bcd290..b18c40b6 100644
--- a/swh/web/tests/api/views/test_graph.py
+++ b/swh/web/tests/api/views/test_graph.py
@@ -1,257 +1,270 @@
# Copyright (C) 2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
import hashlib
-import json
import textwrap
from django.http.response import StreamingHttpResponse
from swh.model.hashutil import hash_to_bytes
from swh.model.swhids import ExtendedObjectType, ExtendedSWHID
from swh.web.api.views.graph import API_GRAPH_PERM
from swh.web.common.utils import reverse
from swh.web.config import SWH_WEB_INTERNAL_SERVER_NAME, get_config
from swh.web.tests.utils import check_http_get_response
def test_graph_endpoint_no_authentication_for_vpn_users(api_client, requests_mock):
    """Requests addressed to the internal server name need no credentials."""
    query = "stats"
    endpoint = reverse("api-1-graph", url_args={"graph_query": query})

    requests_mock.get(
        get_config()["graph"]["server_url"] + query,
        json={},
        headers={"Content-Type": "application/json"},
    )

    check_http_get_response(
        api_client,
        endpoint,
        status_code=200,
        server_name=SWH_WEB_INTERNAL_SERVER_NAME,
    )
def test_graph_endpoint_needs_authentication(api_client):
    """An unauthenticated request to the graph endpoint gets a 401."""
    check_http_get_response(
        api_client,
        reverse("api-1-graph", url_args={"graph_query": "stats"}),
        status_code=401,
    )
def _authenticate_graph_user(api_client, keycloak_oidc):
    """Log in with the graph permission granted and attach the resulting
    refresh token to ``api_client`` as a bearer credential."""
    keycloak_oidc.client_permissions = [API_GRAPH_PERM]
    refresh_token = keycloak_oidc.login()["refresh_token"]
    api_client.credentials(HTTP_AUTHORIZATION=f"Bearer {refresh_token}")
def test_graph_endpoint_needs_permission(api_client, keycloak_oidc, requests_mock):
    """A logged-in user is refused (403) until the graph permission is granted."""
    query = "stats"
    endpoint = reverse("api-1-graph", url_args={"graph_query": query})

    # authenticated, but without the graph permission
    refresh_token = keycloak_oidc.login()["refresh_token"]
    api_client.credentials(HTTP_AUTHORIZATION=f"Bearer {refresh_token}")
    check_http_get_response(api_client, endpoint, status_code=403)

    # same request once the permission has been granted
    _authenticate_graph_user(api_client, keycloak_oidc)
    requests_mock.get(
        get_config()["graph"]["server_url"] + query,
        json={},
        headers={"Content-Type": "application/json"},
    )
    check_http_get_response(api_client, endpoint, status_code=200)
def test_graph_text_plain_response(api_client, keycloak_oidc, requests_mock):
# Check that a chunked text/plain graph response is relayed unchanged as a
# streaming response.
_authenticate_graph_user(api_client, keycloak_oidc)
graph_query = "leaves/swh:1:dir:432d1b21c1256f7408a07c577b6974bbdbcc1323"
response_text = textwrap.dedent(
"""\
swh:1:cnt:1d3dace0a825b0535c37c53ed669ef817e9c1b47
swh:1:cnt:6d5b280f4e33589ae967a7912a587dd5cb8dedaa
swh:1:cnt:91bef238bf01356a550d416d14bb464c576ac6f4
swh:1:cnt:58a8b925a463b87d49639fda282b8f836546e396
swh:1:cnt:fd32ee0a87e16ccc853dfbeb7018674f9ce008c0
swh:1:cnt:ab7c39871872589a4fc9e249ebc927fb1042c90d
swh:1:cnt:93073c02bf3869845977527de16af4d54765838d
swh:1:cnt:4251f795b52c54c447a97c9fe904d8b1f993b1e0
swh:1:cnt:c6e7055424332006d07876ffeba684e7e284b383
swh:1:cnt:8459d8867dc3b15ef7ae9683e21cccc9ab2ec887
swh:1:cnt:5f9981d52202815aa947f85b9dfa191b66f51138
swh:1:cnt:00a685ec51bcdf398c15d588ecdedb611dbbab4b
swh:1:cnt:e1cf1ea335106a0197a2f92f7804046425a7d3eb
swh:1:cnt:07069b38087f88ec192d2c9aff75a502476fd17d
swh:1:cnt:f045ee845c7f14d903a2c035b2691a7c400c01f0
"""
)
# the "chunked" transfer encoding makes the proxy take its streaming branch
requests_mock.get(
get_config()["graph"]["server_url"] + graph_query,
text=response_text,
headers={"Content-Type": "text/plain", "Transfer-Encoding": "chunked"},
)
url = reverse("api-1-graph", url_args={"graph_query": graph_query})
resp = check_http_get_response(
api_client, url, status_code=200, content_type="text/plain"
)
assert isinstance(resp, StreamingHttpResponse)
assert b"".join(resp.streaming_content) == response_text.encode()
# Sample JSON payload served by the mocked graph server in the tests of
# this module that query the "stats" endpoint.
_response_json = {
"counts": {"nodes": 17075708289, "edges": 196236587976},
"ratios": {
"compression": 0.16,
"bits_per_node": 58.828,
"bits_per_edge": 5.119,
"avg_locality": 2184278529.729,
},
"indegree": {"min": 0, "max": 263180117, "avg": 11.4921492364925},
"outdegree": {"min": 0, "max": 1033207, "avg": 11.4921492364925},
}
def test_graph_json_response(api_client, keycloak_oidc, requests_mock):
# Check that a non-chunked JSON graph response is returned as
# application/json.
_authenticate_graph_user(api_client, keycloak_oidc)
graph_query = "stats"
# no "Transfer-Encoding: chunked" header here, so the proxy takes its
# non-streamed branch
requests_mock.get(
get_config()["graph"]["server_url"] + graph_query,
json=_response_json,
headers={"Content-Type": "application/json"},
)
url = reverse("api-1-graph", url_args={"graph_query": graph_query})
resp = check_http_get_response(api_client, url, status_code=200)
assert resp.content_type == "application/json"
- assert resp.content == json.dumps(_response_json).encode()
+ assert resp.data == _response_json
def test_graph_ndjson_response(api_client, keycloak_oidc, requests_mock):
# Check that a chunked application/x-ndjson graph response is relayed
# unchanged as a streaming response.
_authenticate_graph_user(api_client, keycloak_oidc)
graph_query = "visit/paths/swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb"
# the trailing backslashes join each pair of physical lines into a single
# ndjson line
response_ndjson = textwrap.dedent(
"""\
["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb",\
"swh:1:cnt:acfb7cabd63b368a03a9df87670ece1488c8bce0"]
["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb",\
"swh:1:cnt:2a0837708151d76edf28fdbb90dc3eabc676cff3"]
["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb",\
"swh:1:cnt:eaf025ad54b94b2fdda26af75594cfae3491ec75"]
"""
)
requests_mock.get(
get_config()["graph"]["server_url"] + graph_query,
text=response_ndjson,
headers={
"Content-Type": "application/x-ndjson",
"Transfer-Encoding": "chunked",
},
)
url = reverse("api-1-graph", url_args={"graph_query": graph_query})
resp = check_http_get_response(api_client, url, status_code=200)
assert isinstance(resp, StreamingHttpResponse)
assert resp["Content-Type"] == "application/x-ndjson"
assert b"".join(resp.streaming_content) == response_ndjson.encode()
def test_graph_response_resolve_origins(
    archive_data, api_client, keycloak_oidc, requests_mock, origin
):
    """Origin SWHIDs in streamed responses are replaced by origin URLs only
    when ``resolve_origins=true`` is part of the query."""
    origin_url = origin["url"]
    origin_swhid = str(
        ExtendedSWHID(
            object_type=ExtendedObjectType.ORIGIN,
            object_id=hashlib.sha1(origin_url.encode()).digest(),
        )
    )
    snapshot_id = archive_data.snapshot_get_latest(origin_url)["id"]
    snapshot_swhid = str(
        ExtendedSWHID(
            object_type=ExtendedObjectType.SNAPSHOT,
            object_id=hash_to_bytes(snapshot_id),
        )
    )

    _authenticate_graph_user(api_client, keycloak_oidc)

    scenarios = (
        (
            f"visit/nodes/{snapshot_swhid}",
            f"{snapshot_swhid}\n{origin_swhid}\n",
            "text/plain",
        ),
        (
            f"visit/edges/{snapshot_swhid}",
            f"{snapshot_swhid} {origin_swhid}\n",
            "text/plain",
        ),
        (
            f"visit/paths/{snapshot_swhid}",
            f'["{snapshot_swhid}", "{origin_swhid}"]\n',
            "application/x-ndjson",
        ),
    )

    for graph_query, single_body, content_type in scenarios:
        # set two lines response to check resolved origins cache
        body = single_body + single_body
        requests_mock.get(
            get_config()["graph"]["server_url"] + graph_query,
            text=body,
            headers={"Content-Type": content_type, "Transfer-Encoding": "chunked"},
        )

        # first without origin resolution, then with it
        variants = (
            ({"direction": "backward"}, body),
            (
                {"direction": "backward", "resolve_origins": "true"},
                body.replace(origin_swhid, origin_url),
            ),
        )
        for query_params, expected_body in variants:
            url = reverse(
                "api-1-graph",
                url_args={"graph_query": graph_query},
                query_params=query_params,
            )
            resp = check_http_get_response(api_client, url, status_code=200)
            assert isinstance(resp, StreamingHttpResponse)
            assert resp["Content-Type"] == content_type
            assert b"".join(resp.streaming_content) == expected_body.encode()
def test_graph_response_resolve_origins_nothing_to_do(
api_client, keycloak_oidc, requests_mock
):
# Check that passing resolve_origins=true leaves a non-chunked JSON
# response untouched.
_authenticate_graph_user(api_client, keycloak_oidc)
graph_query = "stats"
requests_mock.get(
get_config()["graph"]["server_url"] + graph_query,
json=_response_json,
headers={"Content-Type": "application/json"},
)
url = reverse(
"api-1-graph",
url_args={"graph_query": graph_query},
query_params={"resolve_origins": "true"},
)
resp = check_http_get_response(api_client, url, status_code=200)
assert resp.content_type == "application/json"
- assert resp.content == json.dumps(_response_json).encode()
+ assert resp.data == _response_json
+
+
+def test_graph_response_invalid_accept_header(api_client):
+ url = reverse(
+ "api-1-graph",
+ url_args={"graph_query": "stats"},
+ query_params={"resolve_origins": "true"},
+ )
+
+ resp = api_client.get(url, HTTP_ACCEPT="text/html")
+ assert resp.status_code == 406
+ assert resp.content_type == "application/json"
+ assert resp.data["exception"] == "NotAcceptable"
+ assert resp.data["reason"] == "Could not satisfy the request Accept header."

File Metadata

Mime Type
text/x-diff
Expires
Fri, Jul 4, 11:24 AM (3 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3245765

Event Timeline