diff --git a/pytest.ini b/pytest.ini
--- a/pytest.ini
+++ b/pytest.ini
@@ -7,3 +7,4 @@
     ignore:.*Using or importing the ABCs from 'collections'
     ignore:.*uses the 'client' fixture
     ignore:.*uses the 'mocker' fixture
+    ignore:.*uses the 'requests_mock' fixture
diff --git a/swh/web/api/apidoc.py b/swh/web/api/apidoc.py
--- a/swh/web/api/apidoc.py
+++ b/swh/web/api/apidoc.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2019 The Software Heritage developers
+# Copyright (C) 2015-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -214,7 +214,10 @@
             self.data["description"] += ":\n\n%s\n" % textwrap.indent(text, "\t")
         # extract example url
         if ":swh_web_api:" in text:
-            self.data["examples"].append("/api/1/" + re.sub(".*`(.*)`.*", r"\1", text))
+            for line in text.split("\n"):
+                self.data["examples"].append(
+                    "/api/1/" + re.sub(".*`(.+)`.*", r"\1", line)
+                )
 
     def visit_bullet_list(self, node):
         # bullet list in endpoint description
diff --git a/swh/web/api/renderers.py b/swh/web/api/renderers.py
--- a/swh/web/api/renderers.py
+++ b/swh/web/api/renderers.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2018 The Software Heritage developers
+# Copyright (C) 2017-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -35,3 +35,16 @@
             allow_unicode=not self.ensure_ascii,
             default_flow_style=self.default_flow_style,
         )
+
+
+class PlainTextRenderer(renderers.BaseRenderer):
+    """
+    Renderer which does not perform any serialization to raw text data.
+    """
+
+    media_type = "text/plain"
+    format = "text"
+    charset = "utf-8"
+
+    def render(self, data, media_type=None, renderer_context=None):
+        return data
diff --git a/swh/web/api/urls.py b/swh/web/api/urls.py
--- a/swh/web/api/urls.py
+++ b/swh/web/api/urls.py
@@ -10,6 +10,7 @@
 from swh.web.api.apiurls import APIUrls
 import swh.web.api.views.content  # noqa
 import swh.web.api.views.directory  # noqa
+import swh.web.api.views.graph  # noqa
 import swh.web.api.views.identifiers  # noqa
 import swh.web.api.views.origin  # noqa
 import swh.web.api.views.origin_save  # noqa
diff --git a/swh/web/api/views/graph.py b/swh/web/api/views/graph.py
new file mode 100644
--- /dev/null
+++ b/swh/web/api/views/graph.py
@@ -0,0 +1,163 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU Affero General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from distutils.util import strtobool
+import json
+from typing import Dict
+
+import requests
+
+from rest_framework.decorators import renderer_classes
+from rest_framework.request import Request
+from rest_framework.response import Response
+
+from swh.model.identifiers import ORIGIN, parse_swhid
+from swh.web.api.apidoc import api_doc
+from swh.web.api.apiurls import api_route
+from swh.web.api.renderers import PlainTextRenderer
+from swh.web.common import service
+from swh.web.config import get_config
+
+API_GRAPH_PERM = "swh.web.api.graph"
+
+
+def _process_swhid(swhid: str, origin_urls: Dict[str, str]) -> str:
+    """Return the origin URL for an origin SWHID, any other SWHID unchanged.
+
+    Args:
+        swhid: a SWHID extracted from a graph service response
+        origin_urls: cache mapping origin sha1s to origin URLs, filled in
+            place so that each origin is looked up only once
+
+    Returns:
+        The origin URL when ``swhid`` identifies an origin for which the
+        lookup returned an entry, ``swhid`` itself otherwise.
+    """
+    parsed_swhid = parse_swhid(swhid)
+    if parsed_swhid.object_type != ORIGIN:
+        return swhid
+    origin_sha1 = parsed_swhid.object_id
+    if origin_sha1 not in origin_urls:
+        origin_info = next(
+            iter(service.lookup_origins_by_sha1s([origin_sha1])), None
+        )
+        if origin_info is None:
+            # no origin found for that sha1: keep the SWHID rather than
+            # failing the whole request
+            return swhid
+        origin_urls[origin_sha1] = origin_info["url"]
+    return origin_urls[origin_sha1]
+
+
+def _resolve_origins_in_graph_response(response: requests.Response) -> str:
+    """Replace origin SWHIDs by origin URLs in a graph service response.
+
+    Only ``application/x-ndjson`` and ``text/plain`` payloads are rewritten,
+    any other payload is returned unchanged.
+    """
+    # compare the bare media type, ignoring parameters such as "; charset=utf-8"
+    content_type = response.headers.get("Content-Type", "").split(";")[0].strip()
+    origin_urls: Dict[str, str] = {}
+    if content_type == "application/x-ndjson":
+        processed_response = []
+        for line in response.text.split("\n")[:-1]:
+            swhids = json.loads(line)
+            processed_line = [_process_swhid(swhid, origin_urls) for swhid in swhids]
+            processed_response.append(json.dumps(processed_line))
+        return "\n".join(processed_response) + "\n"
+    elif content_type == "text/plain":
+        processed_response = []
+        for line in response.text.split("\n")[:-1]:
+            swhids = line.split(" ")
+            processed_line = [_process_swhid(swhid, origin_urls) for swhid in swhids]
+            processed_response.append(" ".join(processed_line))
+        return "\n".join(processed_response) + "\n"
+    return response.text
+
+
+@api_route(r"/graph/", "api-1-graph-doc")
+@api_doc("/graph/")
+def api_graph(request: Request) -> None:
+    """
+    .. http:get:: /api/1/graph/(graph_query)/
+
+        Provide fast access to the graph representation of the Software Heritage
+        archive.
+
+        That endpoint acts as a proxy for the `Software Heritage Graph service
+        <https://docs.softwareheritage.org/devel/swh-graph/index.html>`_.
+
+        It provides fast access to the `graph representation
+        <https://docs.softwareheritage.org/devel/swh-model/data-model.html#data-structure>`_
+        of the Software Heritage archive.
+
+        The full documentation of the available Graph REST API can be found `here
+        <https://docs.softwareheritage.org/devel/swh-graph/api.html>`_.
+
+        .. warning::
+            That endpoint is not publicly available and requires authentication and
+            special user permission in order to be able to request it.
+
+        :param string graph_query: query to forward to the Software Heritage Graph
+            service (see its `documentation
+            <https://docs.softwareheritage.org/devel/swh-graph/api.html>`_)
+        :query boolean resolve_origins: extra parameter defined by that proxy enabling
+            to resolve origin urls from their sha1 representations
+
+        :statuscode 200: no error
+        :statuscode 400: an invalid graph query has been provided
+        :statuscode 404: provided graph node cannot be found
+
+        **Examples:**
+
+        .. parsed-literal::
+
+            :swh_web_api:`graph/leaves/swh:1:dir:432d1b21c1256f7408a07c577b6974bbdbcc1323/`
+            :swh_web_api:`graph/neighbors/swh:1:rev:f39d7d78b70e0f39facb1e4fab77ad3df5c52a35/`
+            :swh_web_api:`graph/randomwalk/swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2/ori?direction=backward`
+            :swh_web_api:`graph/randomwalk/swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2/ori?direction=backward&limit=-2`
+            :swh_web_api:`graph/visit/nodes/swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc?direction=backward&resolve_origins=true`
+            :swh_web_api:`graph/visit/edges/swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc?direction=backward&resolve_origins=true`
+            :swh_web_api:`graph/visit/paths/swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb?direction=backward&resolve_origins=true`
+
+    """
+    return None
+
+
+@api_route(r"/graph/(?P<graph_query>.+)/", "api-1-graph")
+@renderer_classes([PlainTextRenderer])
+def api_graph_proxy(request: Request, graph_query: str) -> Response:
+    """Forward a query to the graph service and relay its response.
+
+    Access is restricted to authenticated users holding ``API_GRAPH_PERM``.
+    """
+    if not bool(request.user and request.user.is_authenticated):
+        return Response("Authentication credentials were not provided.", status=401)
+    if not request.user.has_perm(API_GRAPH_PERM):
+        return Response(
+            "You do not have permission to perform this action.", status=403
+        )
+    # reject a malformed flag up front instead of failing with a server error
+    # once the graph service has already been queried
+    try:
+        resolve_origins = strtobool(request.GET.get("resolve_origins", "false"))
+    except ValueError:
+        return Response(
+            "Invalid boolean value for query parameter resolve_origins.", status=400
+        )
+    graph_query_url = get_config()["graph"]["server_url"]
+    graph_query_url += graph_query
+    if request.GET:
+        graph_query_url += f"?{request.GET.urlencode(safe='/;:')}"
+    response = requests.get(graph_query_url)
+    response_text = response.text
+    if response.status_code == 200 and resolve_origins:
+        response_text = _resolve_origins_in_graph_response(response)
+    return Response(
+        response_text,
+        status=response.status_code,
+        # the upstream header may be missing: let the renderer decide then
+        content_type=response.headers.get("Content-Type"),
+    )
diff --git a/swh/web/config.py b/swh/web/config.py
--- a/swh/web/config.py
+++ b/swh/web/config.py
@@ -90,6 +90,10 @@
     ),
     "client_config": ("dict", {}),
     "keycloak": ("dict", {"server_url": "", "realm_name": ""}),
+    "graph": (
+        "dict",
+        {"server_url": "http://graph.internal.softwareheritage.org:5009/graph/"},
+    ),
 }
 
 swhweb_config = {}  # type: Dict[str, Any]
diff --git a/swh/web/tests/api/views/test_graph.py b/swh/web/tests/api/views/test_graph.py
new file mode 100644
--- /dev/null
+++ b/swh/web/tests/api/views/test_graph.py
@@ -0,0 +1,224 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU Affero General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import hashlib
+import json
+
+from hypothesis import given
+
+from swh.model.identifiers import ORIGIN, SNAPSHOT, swhid
+from swh.web.api.views.graph import API_GRAPH_PERM
+from swh.web.common.utils import reverse
+from swh.web.config import get_config
+from swh.web.tests.auth.keycloak_mock import mock_keycloak
+from swh.web.tests.auth.sample_data import oidc_profile
+from swh.web.tests.strategies import origin
+
+
+def test_graph_endpoint_needs_authentication(api_client):
+    url = reverse("api-1-graph", url_args={"graph_query": "stats"})
+    resp = api_client.get(url)
+    assert resp.status_code == 401
+
+
+def _authenticate_graph_user(api_client, mocker):
+    mock_keycloak(mocker, user_permissions=[API_GRAPH_PERM])
+    api_client.credentials(HTTP_AUTHORIZATION=f"Bearer {oidc_profile['refresh_token']}")
+
+
+def test_graph_endpoint_needs_permission(api_client, mocker, requests_mock):
+    graph_query = "stats"
+    url = reverse("api-1-graph", url_args={"graph_query": graph_query})
+    api_client.credentials(HTTP_AUTHORIZATION=f"Bearer {oidc_profile['refresh_token']}")
+
+    mock_keycloak(mocker, user_permissions=[])
+    resp = api_client.get(url)
+    assert resp.status_code == 403
+
+    _authenticate_graph_user(api_client, mocker)
+    requests_mock.get(
+        get_config()["graph"]["server_url"] + graph_query,
+        json={},
+        headers={"Content-Type": "application/json"},
+    )
+    resp = api_client.get(url)
+    assert resp.status_code == 200
+
+
+def test_graph_text_plain_response(api_client, mocker, requests_mock):
+    _authenticate_graph_user(api_client, mocker)
+
+    graph_query = "leaves/swh:1:dir:432d1b21c1256f7408a07c577b6974bbdbcc1323"
+
+    response_text = (
+        "swh:1:cnt:1d3dace0a825b0535c37c53ed669ef817e9c1b47\n"
+        "swh:1:cnt:6d5b280f4e33589ae967a7912a587dd5cb8dedaa\n"
+        "swh:1:cnt:91bef238bf01356a550d416d14bb464c576ac6f4\n"
+        "swh:1:cnt:58a8b925a463b87d49639fda282b8f836546e396\n"
+        "swh:1:cnt:fd32ee0a87e16ccc853dfbeb7018674f9ce008c0\n"
+        "swh:1:cnt:ab7c39871872589a4fc9e249ebc927fb1042c90d\n"
+        "swh:1:cnt:93073c02bf3869845977527de16af4d54765838d\n"
+        "swh:1:cnt:4251f795b52c54c447a97c9fe904d8b1f993b1e0\n"
+        "swh:1:cnt:c6e7055424332006d07876ffeba684e7e284b383\n"
+        "swh:1:cnt:8459d8867dc3b15ef7ae9683e21cccc9ab2ec887\n"
+        "swh:1:cnt:5f9981d52202815aa947f85b9dfa191b66f51138\n"
+        "swh:1:cnt:00a685ec51bcdf398c15d588ecdedb611dbbab4b\n"
+        "swh:1:cnt:e1cf1ea335106a0197a2f92f7804046425a7d3eb\n"
+        "swh:1:cnt:07069b38087f88ec192d2c9aff75a502476fd17d\n"
+        "swh:1:cnt:f045ee845c7f14d903a2c035b2691a7c400c01f0\n"
+    )
+
+    requests_mock.get(
+        get_config()["graph"]["server_url"] + graph_query,
+        text=response_text,
+        headers={"Content-Type": "text/plain"},
+    )
+
+    url = reverse("api-1-graph", url_args={"graph_query": graph_query})
+
+    resp = api_client.get(url)
+
+    assert resp.status_code == 200
+    assert resp.content_type == "text/plain"
+    assert resp.content == response_text.encode()
+
+
+def test_graph_json_response(api_client, mocker, requests_mock):
+    _authenticate_graph_user(api_client, mocker)
+
+    graph_query = "stats"
+
+    response_json = {
+        "counts": {"nodes": 17075708289, "edges": 196236587976},
+        "ratios": {
+            "compression": 0.16,
+            "bits_per_node": 58.828,
+            "bits_per_edge": 5.119,
+            "avg_locality": 2184278529.729,
+        },
+        "indegree": {"min": 0, "max": 263180117, "avg": 11.4921492364925},
+        "outdegree": {"min": 0, "max": 1033207, "avg": 11.4921492364925},
+    }
+
+    requests_mock.get(
+        get_config()["graph"]["server_url"] + graph_query,
+        json=response_json,
+        headers={"Content-Type": "application/json"},
+    )
+
+    url = reverse("api-1-graph", url_args={"graph_query": graph_query})
+
+    resp = api_client.get(url)
+
+    assert resp.status_code == 200
+    assert resp.content_type == "application/json"
+    assert resp.content == json.dumps(response_json).encode()
+
+
+def test_graph_ndjson_response(api_client, mocker, requests_mock):
+    _authenticate_graph_user(api_client, mocker)
+
+    graph_query = "visit/paths/swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb"
+
+    response_ndjson = (
+        '["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb",'
+        ' "swh:1:cnt:acfb7cabd63b368a03a9df87670ece1488c8bce0"]\n'
+        '["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb",'
+        ' "swh:1:cnt:2a0837708151d76edf28fdbb90dc3eabc676cff3"]\n'
+        '["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb",'
+        ' "swh:1:cnt:eaf025ad54b94b2fdda26af75594cfae3491ec75"]\n'
+    )
+
+    requests_mock.get(
+        get_config()["graph"]["server_url"] + graph_query,
+        text=response_ndjson,
+        headers={"Content-Type": "application/x-ndjson"},
+    )
+
+    url = reverse("api-1-graph", url_args={"graph_query": graph_query})
+
+    resp = api_client.get(url)
+
+    assert resp.status_code == 200
+    assert resp.content_type == "application/x-ndjson"
+    assert resp.content == response_ndjson.encode()
+
+
+@given(origin())
+def test_graph_response_resolve_origins(
+    archive_data, api_client, mocker, requests_mock, origin
+):
+    hasher = hashlib.sha1()
+    hasher.update(origin["url"].encode())
+    origin_sha1 = hasher.hexdigest()
+    origin_swhid = str(swhid(ORIGIN, origin_sha1))
+    snapshot = archive_data.snapshot_get_latest(origin["url"])["id"]
+    snapshot_swhid = str(swhid(SNAPSHOT, snapshot))
+
+    _authenticate_graph_user(api_client, mocker)
+
+    for graph_query, response_text, content_type in (
+        (
+            f"visit/nodes/{snapshot_swhid}",
+            f"{snapshot_swhid}\n{origin_swhid}\n",
+            "text/plain",
+        ),
+        (
+            f"visit/edges/{snapshot_swhid}",
+            f"{snapshot_swhid} {origin_swhid}\n",
+            "text/plain",
+        ),
+        (
+            f"visit/paths/{snapshot_swhid}",
+            f'["{snapshot_swhid}", "{origin_swhid}"]\n',
+            "application/x-ndjson",
+        ),
+    ):
+
+        requests_mock.get(
+            get_config()["graph"]["server_url"] + graph_query,
+            text=response_text,
+            headers={"Content-Type": content_type},
+        )
+
+        url = reverse(
+            "api-1-graph",
+            url_args={"graph_query": graph_query},
+            query_params={"direction": "backward"},
+        )
+
+        resp = api_client.get(url)
+
+        assert resp.status_code == 200
+        assert resp.content_type == content_type
+        assert resp.content == response_text.encode()
+
+        url = reverse(
+            "api-1-graph",
+            url_args={"graph_query": graph_query},
+            query_params={"direction": "backward", "resolve_origins": 1},
+        )
+
+        resp = api_client.get(url)
+
+        assert resp.status_code == 200
+        assert resp.content_type == content_type
+        assert (
+            resp.content == response_text.replace(origin_swhid, origin["url"]).encode()
+        )
+
+
+def test_graph_invalid_resolve_origins_value(api_client, mocker):
+    _authenticate_graph_user(api_client, mocker)
+
+    url = reverse(
+        "api-1-graph",
+        url_args={"graph_query": "stats"},
+        query_params={"resolve_origins": "maybe"},
+    )
+
+    resp = api_client.get(url)
+
+    assert resp.status_code == 400