diff --git a/docs/uri-scheme-api-graph.rst b/docs/uri-scheme-api-graph.rst new file mode 100644 index 00000000..75eebb65 --- /dev/null +++ b/docs/uri-scheme-api-graph.rst @@ -0,0 +1,4 @@ +Graph +----- + +.. autosimple:: swh.web.api.views.graph.api_graph diff --git a/docs/uri-scheme-api.rst b/docs/uri-scheme-api.rst index f435743d..2a7aa3d8 100644 --- a/docs/uri-scheme-api.rst +++ b/docs/uri-scheme-api.rst @@ -1,22 +1,24 @@ .. _swh-web-api-urls: swh-web API URLs ================ .. include:: uri-scheme-api-content.rst .. include:: uri-scheme-api-directory.rst +.. include:: uri-scheme-api-graph.rst + .. include:: uri-scheme-api-identifiers.rst .. include:: uri-scheme-api-origin.rst .. include:: uri-scheme-api-release.rst .. include:: uri-scheme-api-revision.rst .. include:: uri-scheme-api-snapshot.rst .. include:: uri-scheme-api-stat.rst .. include:: uri-scheme-api-vault.rst diff --git a/pytest.ini b/pytest.ini index 4d203919..3dd2c766 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,9 +1,10 @@ [pytest] addopts = -p no:flask -p no:pytest_swh_storage norecursedirs = docs node_modules .tox DJANGO_SETTINGS_MODULE = swh.web.settings.tests filterwarnings = ignore:.*Plural value must be an integer, got float ignore:.*Using or importing the ABCs from 'collections' ignore:.*uses the 'client' fixture ignore:.*uses the 'mocker' fixture + ignore:.*uses the 'requests_mock' fixture diff --git a/swh/web/api/apidoc.py b/swh/web/api/apidoc.py index 11921cd2..89914cc9 100644 --- a/swh/web/api/apidoc.py +++ b/swh/web/api/apidoc.py @@ -1,453 +1,454 @@ -# Copyright (C) 2015-2019 The Software Heritage developers +# Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict import functools from functools import wraps import os import re import textwrap from 
typing import List import docutils.nodes import docutils.parsers.rst import docutils.utils import sentry_sdk from rest_framework.decorators import api_view from swh.web.api.apiresponse import error_response, make_api_response from swh.web.api.apiurls import APIUrls from swh.web.common.utils import parse_rst class _HTTPDomainDocVisitor(docutils.nodes.NodeVisitor): """ docutils visitor for walking on a parsed rst document containing sphinx httpdomain roles. Its purpose is to extract relevant info regarding swh api endpoints (for instance url arguments) from their docstring written using sphinx httpdomain. """ # httpdomain roles we want to parse (based on sphinxcontrib.httpdomain 1.6) parameter_roles = ("param", "parameter", "arg", "argument") request_json_object_roles = ("reqjsonobj", "reqjson", "<jsonobj", "<json") request_json_array_roles = ("reqjsonarr", "<jsonarr") response_json_object_roles = ("resjsonobj", "resjson", ">jsonobj", ">json") response_json_array_roles = ("resjsonarr", ">jsonarr") query_parameter_roles = ("queryparameter", "queryparam", "qparam", "query") request_header_roles = ("<header", "reqheader", "requestheader") response_header_roles = (">header", "resheader", "responseheader") status_code_roles = ("statuscode", "status", "code") def __init__(self, document, data): super().__init__(document) self.data = data self.args_set = set() self.params_set = set() self.inputs_set = set() self.returns_set = set() self.status_codes_set = set() self.reqheaders_set = set() self.resheaders_set = set() self.field_list_visited = False self.current_json_obj = None def process_paragraph(self, par): """ Process extracted paragraph text before display. Cleanup document model markups and transform the paragraph into a valid raw rst string (as the apidoc documentation transform rst to html when rendering). 
""" par = par.replace("\n", " ") # keep emphasized, strong and literal text par = par.replace("", "*") par = par.replace("", "*") par = par.replace("", "**") par = par.replace("", "**") par = par.replace("", "``") par = par.replace("", "``") # keep links to web pages if "', r"`\1 <\2>`_", par, ) # remove parsed document markups but keep rst links par = re.sub(r"<[^<]+?>(?!`_)", "", par) # api urls cleanup to generate valid links afterwards subs_made = 1 while subs_made: (par, subs_made) = re.subn(r"(:http:.*)(\(\w+\))", r"\1", par) subs_made = 1 while subs_made: (par, subs_made) = re.subn(r"(:http:.*)(\[.*\])", r"\1", par) par = par.replace("//", "/") # transform references to api endpoints doc into valid rst links par = re.sub(":http:get:`([^,`]*)`", r"`\1 <\1doc/>`_", par) # transform references to some elements into bold text par = re.sub(":http:header:`(.*)`", r"**\1**", par) par = re.sub(":func:`(.*)`", r"**\1**", par) return par def visit_field_list(self, node): """ Visit parsed rst field lists to extract relevant info regarding api endpoint. 
""" self.field_list_visited = True for child in node.traverse(): # get the parsed field name if isinstance(child, docutils.nodes.field_name): field_name = child.astext() # parse field text elif isinstance(child, docutils.nodes.paragraph): text = self.process_paragraph(str(child)) field_data = field_name.split(" ") # Parameters if field_data[0] in self.parameter_roles: if field_data[2] not in self.args_set: self.data["args"].append( {"name": field_data[2], "type": field_data[1], "doc": text} ) self.args_set.add(field_data[2]) # Query Parameters if field_data[0] in self.query_parameter_roles: if field_data[2] not in self.params_set: self.data["params"].append( {"name": field_data[2], "type": field_data[1], "doc": text} ) self.params_set.add(field_data[2]) # Request data type if ( field_data[0] in self.request_json_array_roles or field_data[0] in self.request_json_object_roles ): # array if field_data[0] in self.request_json_array_roles: self.data["input_type"] = "array" # object else: self.data["input_type"] = "object" # input object field if field_data[2] not in self.inputs_set: self.data["inputs"].append( {"name": field_data[2], "type": field_data[1], "doc": text} ) self.inputs_set.add(field_data[2]) self.current_json_obj = self.data["inputs"][-1] # Response type if ( field_data[0] in self.response_json_array_roles or field_data[0] in self.response_json_object_roles ): # array if field_data[0] in self.response_json_array_roles: self.data["return_type"] = "array" # object else: self.data["return_type"] = "object" # returned object field if field_data[2] not in self.returns_set: self.data["returns"].append( {"name": field_data[2], "type": field_data[1], "doc": text} ) self.returns_set.add(field_data[2]) self.current_json_obj = self.data["returns"][-1] # Status Codes if field_data[0] in self.status_code_roles: if field_data[1] not in self.status_codes_set: self.data["status_codes"].append( {"code": field_data[1], "doc": text} ) self.status_codes_set.add(field_data[1]) 
# Request Headers if field_data[0] in self.request_header_roles: if field_data[1] not in self.reqheaders_set: self.data["reqheaders"].append( {"name": field_data[1], "doc": text} ) self.reqheaders_set.add(field_data[1]) # Response Headers if field_data[0] in self.response_header_roles: if field_data[1] not in self.resheaders_set: resheader = {"name": field_data[1], "doc": text} self.data["resheaders"].append(resheader) self.resheaders_set.add(field_data[1]) if ( resheader["name"] == "Content-Type" and resheader["doc"] == "application/octet-stream" ): self.data["return_type"] = "octet stream" def visit_paragraph(self, node): """ Visit relevant paragraphs to parse """ # only parsed top level paragraphs if isinstance(node.parent, docutils.nodes.block_quote): text = self.process_paragraph(str(node)) # endpoint description if not text.startswith("**") and text not in self.data["description"]: self.data["description"] += "\n\n" if self.data["description"] else "" self.data["description"] += text def visit_literal_block(self, node): """ Visit literal blocks """ text = node.astext() # literal block in endpoint description if not self.field_list_visited: self.data["description"] += ":\n\n%s\n" % textwrap.indent(text, "\t") - # extract example url + # extract example urls if ":swh_web_api:" in text: - self.data["examples"].append("/api/1/" + re.sub(".*`(.*)`.*", r"\1", text)) + examples_str = re.sub(".*`(.+)`.*", r"/api/1/\1", text) + self.data["examples"] += examples_str.split("\n") def visit_bullet_list(self, node): # bullet list in endpoint description if not self.field_list_visited: self.data["description"] += "\n\n" for child in node.traverse(): # process list item if isinstance(child, docutils.nodes.paragraph): line_text = self.process_paragraph(str(child)) self.data["description"] += "\t* %s\n" % line_text elif self.current_json_obj: self.current_json_obj["doc"] += "\n\n" for child in node.traverse(): # process list item if isinstance(child, docutils.nodes.paragraph): 
line_text = self.process_paragraph(str(child)) self.current_json_obj["doc"] += "\t\t* %s\n" % line_text self.current_json_obj = None def visit_warning(self, node): text = self.process_paragraph(str(node)) rst_warning = "\n\n.. warning::\n%s\n" % textwrap.indent(text, "\t") if rst_warning not in self.data["description"]: self.data["description"] += rst_warning def unknown_visit(self, node): pass def unknown_departure(self, node): pass def _parse_httpdomain_doc(doc, data): doc_lines = doc.split("\n") doc_lines_filtered = [] urls = defaultdict(list) default_http_methods = ["HEAD", "OPTIONS"] # httpdomain is a sphinx extension that is unknown to docutils but # fortunately we can still parse its directives' content, # so remove lines with httpdomain directives before executing the # rst parser from docutils for doc_line in doc_lines: if ".. http" not in doc_line: doc_lines_filtered.append(doc_line) else: url = doc_line[doc_line.find("/") :] # emphasize url arguments for html rendering url = re.sub(r"\((\w+)\)", r" **\(\1\)** ", url) method = re.search(r"http:(\w+)::", doc_line).group(1) urls[url].append(method.upper()) for url, methods in urls.items(): data["urls"].append({"rule": url, "methods": methods + default_http_methods}) # parse the rst docstring and do not print system messages about # unknown httpdomain roles document = parse_rst("\n".join(doc_lines_filtered), report_level=5) # remove the system_message nodes from the parsed document for node in document.traverse(docutils.nodes.system_message): node.parent.remove(node) # visit the document nodes to extract relevant endpoint info visitor = _HTTPDomainDocVisitor(document, data) document.walkabout(visitor) class APIDocException(Exception): """ Custom exception to signal errors in the use of the APIDoc decorators """ def api_doc( route: str, noargs: bool = False, tags: List[str] = [], handle_response: bool = False, api_version: str = "1", ): """ Decorator for an API endpoint implementation used to generate a 
dedicated view displaying its HTML documentation. The documentation will be generated from the endpoint docstring based on sphinxcontrib-httpdomain format. Args: route: documentation page's route noargs: set to True if the route has no arguments, and its result should be displayed anytime its documentation is requested. Default to False tags: Further information on api endpoints. Two values are possibly expected: * hidden: remove the entry points from the listing * upcoming: display the entry point but it is not followable handle_response: indicate if the decorated function takes care of creating the HTTP response or delegates that task to the apiresponse module api_version: api version string """ tags_set = set(tags) # @api_doc() Decorator call def decorator(f): # if the route is not hidden, add it to the index if "hidden" not in tags_set: doc_data = get_doc_data(f, route, noargs) doc_desc = doc_data["description"] first_dot_pos = doc_desc.find(".") APIUrls.add_doc_route( route, doc_desc[: first_dot_pos + 1], noargs=noargs, api_version=api_version, tags=tags_set, ) # create a dedicated view to display endpoint HTML doc @api_view(["GET", "HEAD"]) @wraps(f) def doc_view(request): doc_data = get_doc_data(f, route, noargs) return make_api_response(request, None, doc_data) route_name = "%s-doc" % route[1:-1].replace("/", "-") urlpattern = f"^{api_version}{route}doc/$" view_name = "api-%s-%s" % (api_version, route_name) APIUrls.add_url_pattern(urlpattern, doc_view, view_name) @wraps(f) def documented_view(request, **kwargs): doc_data = get_doc_data(f, route, noargs) try: response = f(request, **kwargs) except Exception as exc: sentry_sdk.capture_exception(exc) return error_response(request, exc, doc_data) if handle_response: return response else: return make_api_response(request, response, doc_data) return documented_view return decorator @functools.lru_cache(maxsize=32) def get_doc_data(f, route, noargs): """ Build documentation data for the decorated api endpoint 
function """ data = { "description": "", "response_data": None, "urls": [], "args": [], "params": [], "input_type": "", "inputs": [], "resheaders": [], "reqheaders": [], "return_type": "", "returns": [], "status_codes": [], "examples": [], "route": route, "noargs": noargs, } if not f.__doc__: raise APIDocException( "apidoc: expected a docstring" " for function %s" % (f.__name__,) ) # use raw docstring as endpoint documentation if sphinx # httpdomain is not used if ".. http" not in f.__doc__: data["description"] = f.__doc__ # else parse the sphinx httpdomain docstring with docutils # (except when building the swh-web documentation through autodoc # sphinx extension, not needed and raise errors with sphinx >= 1.7) elif "SWH_WEB_DOC_BUILD" not in os.environ: _parse_httpdomain_doc(f.__doc__, data) # process input/returned object info for nicer html display inputs_list = "" returns_list = "" for inp in data["inputs"]: # special case for array of non object type, for instance # :<jsonarr string -: an array of string if inp["name"] != "-": inputs_list += "\t* **%s (%s)**: %s\n" % ( inp["name"], inp["type"], inp["doc"], ) for ret in data["returns"]: # special case for array of non object type, for instance # :>jsonarr string -: an array of string if ret["name"] != "-": returns_list += "\t* **%s (%s)**: %s\n" % ( ret["name"], ret["type"], ret["doc"], ) data["inputs_list"] = inputs_list data["returns_list"] = returns_list return data DOC_COMMON_HEADERS = """ :reqheader Accept: the requested response content type, either ``application/json`` (default) or ``application/yaml`` :resheader Content-Type: this depends on :http:header:`Accept` header of request""" DOC_RESHEADER_LINK = """ :resheader Link: indicates that a subsequent result page is available and contains the url pointing to it """ DEFAULT_SUBSTITUTIONS = { "common_headers": DOC_COMMON_HEADERS, "resheader_link": DOC_RESHEADER_LINK, } def format_docstring(**substitutions): def decorator(f): f.__doc__ = f.__doc__.format(**{**DEFAULT_SUBSTITUTIONS, **substitutions}) return f return decorator diff --git a/swh/web/api/renderers.py b/swh/web/api/renderers.py index bd1f5af9..725f1e58 100644 --- a/swh/web/api/renderers.py +++ b/swh/web/api/renderers.py @@ 
-1,37 +1,50 @@ -# Copyright (C) 2017-2018 The Software Heritage developers +# Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import yaml from rest_framework import renderers class YAMLRenderer(renderers.BaseRenderer): """ Renderer which serializes to YAML. """ media_type = "application/yaml" format = "yaml" charset = "utf-8" ensure_ascii = False default_flow_style = False def render(self, data, accepted_media_type=None, renderer_context=None): """ Renders `data` into serialized YAML. """ assert yaml, "YAMLRenderer requires pyyaml to be installed" if data is None: return "" return yaml.dump( data, stream=None, encoding=self.charset, allow_unicode=not self.ensure_ascii, default_flow_style=self.default_flow_style, ) + + +class PlainTextRenderer(renderers.BaseRenderer): + """ + Renderer which does not perform any serialization to raw text data. 
+ """ + + media_type = "text/plain" + format = "text" + charset = "utf-8" + + def render(self, data, media_type=None, renderer_context=None): + return data diff --git a/swh/web/api/urls.py b/swh/web/api/urls.py index 9b1bb6b7..dea648c9 100644 --- a/swh/web/api/urls.py +++ b/swh/web/api/urls.py @@ -1,30 +1,31 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from django.conf.urls import url from django.contrib.auth.decorators import login_required from django.shortcuts import render from swh.web.api.apiurls import APIUrls import swh.web.api.views.content # noqa import swh.web.api.views.directory # noqa +import swh.web.api.views.graph # noqa import swh.web.api.views.identifiers # noqa import swh.web.api.views.origin # noqa import swh.web.api.views.origin_save # noqa import swh.web.api.views.ping # noqa import swh.web.api.views.release # noqa import swh.web.api.views.revision # noqa import swh.web.api.views.snapshot # noqa import swh.web.api.views.stat # noqa import swh.web.api.views.vault # noqa @login_required(login_url="/oidc/login/", redirect_field_name="next_path") def _tokens_view(request): return render(request, "api/tokens.html") urlpatterns = APIUrls.get_url_patterns() urlpatterns.append(url(r"^tokens/$", _tokens_view, name="api-tokens")) diff --git a/swh/web/api/views/graph.py b/swh/web/api/views/graph.py new file mode 100644 index 00000000..7a18676c --- /dev/null +++ b/swh/web/api/views/graph.py @@ -0,0 +1,144 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU Affero General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from distutils.util import strtobool +import json +from typing import Dict + +import 
requests + +from rest_framework.decorators import renderer_classes +from rest_framework.request import Request +from rest_framework.response import Response + +from swh.model.identifiers import ORIGIN, parse_swhid +from swh.web.api.apidoc import api_doc +from swh.web.api.apiurls import api_route +from swh.web.api.renderers import PlainTextRenderer +from swh.web.common import service +from swh.web.config import get_config + +API_GRAPH_PERM = "swh.web.api.graph" + + +def _resolve_origin_swhid(swhid: str, origin_urls: Dict[str, str]) -> str: + """ + Resolve origin url from its swhid sha1 representation. + """ + parsed_swhid = parse_swhid(swhid) + if parsed_swhid.object_type == ORIGIN: + if parsed_swhid.object_id in origin_urls: + return origin_urls[parsed_swhid.object_id] + else: + origin_info = list( + service.lookup_origins_by_sha1s([parsed_swhid.object_id]) + )[0] + assert origin_info is not None + origin_urls[parsed_swhid.object_id] = origin_info["url"] + return origin_info["url"] + else: + return swhid + + +def _resolve_origin_swhids_in_graph_response(response: requests.Response) -> str: + """ + Resolve origin urls from their swhid sha1 representations in graph service + responses. 
+ """ + content_type = response.headers["Content-Type"] + origin_urls: Dict[str, str] = {} + if content_type == "application/x-ndjson": + processed_response = [] + for line in response.text.split("\n")[:-1]: + swhids = json.loads(line) + processed_line = [] + for swhid in swhids: + processed_line.append(_resolve_origin_swhid(swhid, origin_urls)) + processed_response.append(json.dumps(processed_line)) + return "\n".join(processed_response) + "\n" + elif content_type == "text/plain": + processed_response = [] + for line in response.text.split("\n")[:-1]: + processed_line = [] + swhids = line.split(" ") + for swhid in swhids: + processed_line.append(_resolve_origin_swhid(swhid, origin_urls)) + processed_response.append(" ".join(processed_line)) + return "\n".join(processed_response) + "\n" + return response.text + + +@api_route(r"/graph/", "api-1-graph-doc") +@api_doc("/graph/") +def api_graph(request: Request) -> None: + """ + .. http:get:: /api/1/graph/(graph_query)/ + + Provide fast access to the graph representation of the Software Heritage + archive. + + That endpoint acts as a proxy for the `Software Heritage Graph service + `_. + + It provides fast access to the `graph representation + `_ + of the Software Heritage archive. + + The full documentation of the available Graph REST API can be found `here + `_. + + .. warning:: + That endpoint is not publicly available and requires authentication and + special user permission in order to be able to request it. + + :param string graph_query: query to forward to the Software Heritage Graph + service (see its `documentation + `_) + :query boolean resolve_origins: extra parameter defined by that proxy enabling + to resolve origin urls from their sha1 representations + + :statuscode 200: no error + :statuscode 400: an invalid graph query has been provided + :statuscode 404: provided graph node cannot be found + + **Examples:** + + .. 
parsed-literal:: + + :swh_web_api:`graph/leaves/swh:1:dir:432d1b21c1256f7408a07c577b6974bbdbcc1323/` + :swh_web_api:`graph/neighbors/swh:1:rev:f39d7d78b70e0f39facb1e4fab77ad3df5c52a35/` + :swh_web_api:`graph/randomwalk/swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2/ori?direction=backward` + :swh_web_api:`graph/randomwalk/swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2/ori?direction=backward&limit=-2` + :swh_web_api:`graph/visit/nodes/swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc?direction=backward&resolve_origins=true` + :swh_web_api:`graph/visit/edges/swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc?direction=backward&resolve_origins=true` + :swh_web_api:`graph/visit/paths/swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb?direction=backward&resolve_origins=true` + + """ + return None + + +@api_route(r"/graph/(?P<graph_query>.+)/", "api-1-graph") +@renderer_classes([PlainTextRenderer]) +def api_graph_proxy(request: Request, graph_query: str) -> Response: + if not bool(request.user and request.user.is_authenticated): + return Response("Authentication credentials were not provided.", status=401) + if not request.user.has_perm(API_GRAPH_PERM): + return Response( + "You do not have permission to perform this action.", status=403 + ) + graph_query_url = get_config()["graph"]["server_url"] + graph_query_url += graph_query + if request.GET: + graph_query_url += "?" 
+ request.GET.urlencode(safe="/;:") + response = requests.get(graph_query_url) + response_text = response.text + resolve_origins = strtobool(request.GET.get("resolve_origins", "false")) + if response.status_code == 200 and resolve_origins: + response_text = _resolve_origin_swhids_in_graph_response(response) + return Response( + response_text, + status=response.status_code, + content_type=response.headers["Content-Type"], + ) diff --git a/swh/web/config.py b/swh/web/config.py index 2b21fc0a..11faa8fa 100644 --- a/swh/web/config.py +++ b/swh/web/config.py @@ -1,164 +1,168 @@ # Copyright (C) 2017-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import os from typing import Any, Dict from swh.core import config from swh.indexer.storage import get_indexer_storage from swh.scheduler import get_scheduler from swh.search import get_search from swh.storage import get_storage from swh.vault import get_vault from swh.web import settings SETTINGS_DIR = os.path.dirname(settings.__file__) DEFAULT_CONFIG = { "allowed_hosts": ("list", []), "search": ( "dict", {"cls": "remote", "url": "http://127.0.0.1:5010/", "timeout": 10,}, ), "storage": ( "dict", {"cls": "remote", "url": "http://127.0.0.1:5002/", "timeout": 10,}, ), "indexer_storage": ( "dict", {"cls": "remote", "args": {"url": "http://127.0.0.1:5007/", "timeout": 1,}}, ), "log_dir": ("string", "/tmp/swh/log"), "debug": ("bool", False), "serve_assets": ("bool", False), "host": ("string", "127.0.0.1"), "port": ("int", 5004), "secret_key": ("string", "development key"), # do not display code highlighting for content > 1MB "content_display_max_size": ("int", 5 * 1024 * 1024), "snapshot_content_max_size": ("int", 1000), "throttling": ( "dict", { "cache_uri": None, # production: memcached as cache (127.0.0.1:11211) # development: in-memory cache so 
None "scopes": { "swh_api": { "limiter_rate": {"default": "120/h"}, "exempted_networks": ["127.0.0.0/8"], }, "swh_api_origin_search": { "limiter_rate": {"default": "10/m"}, "exempted_networks": ["127.0.0.0/8"], }, "swh_vault_cooking": { "limiter_rate": {"default": "120/h", "GET": "60/m"}, "exempted_networks": ["127.0.0.0/8"], }, "swh_save_origin": { "limiter_rate": {"default": "120/h", "POST": "10/h"}, "exempted_networks": ["127.0.0.0/8"], }, "swh_api_origin_visit_latest": { "limiter_rate": {"default": "700/m"}, "exempted_networks": ["127.0.0.0/8"], }, }, }, ), "vault": ("dict", {"cls": "remote", "args": {"url": "http://127.0.0.1:5005/",}}), "scheduler": ("dict", {"cls": "remote", "args": {"url": "http://127.0.0.1:5008/"}}), "development_db": ("string", os.path.join(SETTINGS_DIR, "db.sqlite3")), "test_db": ("string", os.path.join(SETTINGS_DIR, "testdb.sqlite3")), "production_db": ("string", "/var/lib/swh/web.sqlite3"), "deposit": ( "dict", { "private_api_url": "https://deposit.softwareheritage.org/1/private/", "private_api_user": "swhworker", "private_api_password": "", }, ), "coverage_count_origins": ("bool", False), "e2e_tests_mode": ("bool", False), "es_workers_index_url": ("string", ""), "history_counters_url": ( "string", "https://stats.export.softwareheritage.org/history_counters.json", ), "client_config": ("dict", {}), "keycloak": ("dict", {"server_url": "", "realm_name": ""}), + "graph": ( + "dict", + {"server_url": "http://graph.internal.softwareheritage.org:5009/graph/"}, + ), } swhweb_config = {} # type: Dict[str, Any] def get_config(config_file="web/web"): """Read the configuration file `config_file`. If an environment variable SWH_CONFIG_FILENAME is defined, this takes precedence over the config_file parameter. In any case, update the app with parameters (secret_key, conf) and return the parsed configuration as a dict. If no configuration file is provided, return a default configuration. 
""" if not swhweb_config: config_filename = os.environ.get("SWH_CONFIG_FILENAME") if config_filename: config_file = config_filename cfg = config.load_named_config(config_file, DEFAULT_CONFIG) swhweb_config.update(cfg) config.prepare_folders(swhweb_config, "log_dir") if swhweb_config.get("search"): swhweb_config["search"] = get_search(**swhweb_config["search"]) else: swhweb_config["search"] = None swhweb_config["storage"] = get_storage(**swhweb_config["storage"]) swhweb_config["vault"] = get_vault(**swhweb_config["vault"]) swhweb_config["indexer_storage"] = get_indexer_storage( **swhweb_config["indexer_storage"] ) swhweb_config["scheduler"] = get_scheduler(**swhweb_config["scheduler"]) return swhweb_config def search(): """Return the current application's search. """ return get_config()["search"] def storage(): """Return the current application's storage. """ return get_config()["storage"] def vault(): """Return the current application's vault. """ return get_config()["vault"] def indexer_storage(): """Return the current application's indexer storage. """ return get_config()["indexer_storage"] def scheduler(): """Return the current application's scheduler. 
""" return get_config()["scheduler"] diff --git a/swh/web/tests/api/views/test_graph.py b/swh/web/tests/api/views/test_graph.py new file mode 100644 index 00000000..29b7e4b7 --- /dev/null +++ b/swh/web/tests/api/views/test_graph.py @@ -0,0 +1,245 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU Affero General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import hashlib +import json +import textwrap + +from hypothesis import given + +from swh.model.identifiers import ORIGIN, SNAPSHOT, swhid +from swh.web.api.views.graph import API_GRAPH_PERM +from swh.web.common.utils import reverse +from swh.web.config import get_config +from swh.web.tests.auth.keycloak_mock import mock_keycloak +from swh.web.tests.auth.sample_data import oidc_profile +from swh.web.tests.strategies import origin + + +def test_graph_endpoint_needs_authentication(api_client): + url = reverse("api-1-graph", url_args={"graph_query": "stats"}) + resp = api_client.get(url) + assert resp.status_code == 401 + + +def _authenticate_graph_user(api_client, mocker): + mock_keycloak(mocker, user_permissions=[API_GRAPH_PERM]) + api_client.credentials(HTTP_AUTHORIZATION=f"Bearer {oidc_profile['refresh_token']}") + + +def test_graph_endpoint_needs_permission(api_client, mocker, requests_mock): + graph_query = "stats" + url = reverse("api-1-graph", url_args={"graph_query": graph_query}) + api_client.credentials(HTTP_AUTHORIZATION=f"Bearer {oidc_profile['refresh_token']}") + + mock_keycloak(mocker, user_permissions=[]) + resp = api_client.get(url) + assert resp.status_code == 403 + + _authenticate_graph_user(api_client, mocker) + requests_mock.get( + get_config()["graph"]["server_url"] + graph_query, + json={}, + headers={"Content-Type": "application/json"}, + ) + resp = api_client.get(url) + assert resp.status_code == 200 + + +def 
test_graph_text_plain_response(api_client, mocker, requests_mock): + _authenticate_graph_user(api_client, mocker) + + graph_query = "leaves/swh:1:dir:432d1b21c1256f7408a07c577b6974bbdbcc1323" + + response_text = textwrap.dedent( + """\ + swh:1:cnt:1d3dace0a825b0535c37c53ed669ef817e9c1b47 + swh:1:cnt:6d5b280f4e33589ae967a7912a587dd5cb8dedaa + swh:1:cnt:91bef238bf01356a550d416d14bb464c576ac6f4 + swh:1:cnt:58a8b925a463b87d49639fda282b8f836546e396 + swh:1:cnt:fd32ee0a87e16ccc853dfbeb7018674f9ce008c0 + swh:1:cnt:ab7c39871872589a4fc9e249ebc927fb1042c90d + swh:1:cnt:93073c02bf3869845977527de16af4d54765838d + swh:1:cnt:4251f795b52c54c447a97c9fe904d8b1f993b1e0 + swh:1:cnt:c6e7055424332006d07876ffeba684e7e284b383 + swh:1:cnt:8459d8867dc3b15ef7ae9683e21cccc9ab2ec887 + swh:1:cnt:5f9981d52202815aa947f85b9dfa191b66f51138 + swh:1:cnt:00a685ec51bcdf398c15d588ecdedb611dbbab4b + swh:1:cnt:e1cf1ea335106a0197a2f92f7804046425a7d3eb + swh:1:cnt:07069b38087f88ec192d2c9aff75a502476fd17d + swh:1:cnt:f045ee845c7f14d903a2c035b2691a7c400c01f0 + """ + ) + + requests_mock.get( + get_config()["graph"]["server_url"] + graph_query, + text=response_text, + headers={"Content-Type": "text/plain"}, + ) + + url = reverse("api-1-graph", url_args={"graph_query": graph_query}) + + resp = api_client.get(url) + + assert resp.status_code == 200 + assert resp.content_type == "text/plain" + assert resp.content == response_text.encode() + + +_response_json = { + "counts": {"nodes": 17075708289, "edges": 196236587976}, + "ratios": { + "compression": 0.16, + "bits_per_node": 58.828, + "bits_per_edge": 5.119, + "avg_locality": 2184278529.729, + }, + "indegree": {"min": 0, "max": 263180117, "avg": 11.4921492364925}, + "outdegree": {"min": 0, "max": 1033207, "avg": 11.4921492364925}, +} + + +def test_graph_json_response(api_client, mocker, requests_mock): + _authenticate_graph_user(api_client, mocker) + + graph_query = "stats" + + requests_mock.get( + get_config()["graph"]["server_url"] + graph_query, + 
json=_response_json, + headers={"Content-Type": "application/json"}, + ) + + url = reverse("api-1-graph", url_args={"graph_query": graph_query}) + + resp = api_client.get(url) + + assert resp.status_code == 200 + assert resp.content_type == "application/json" + assert resp.content == json.dumps(_response_json).encode() + + +def test_graph_ndjson_response(api_client, mocker, requests_mock): + _authenticate_graph_user(api_client, mocker) + + graph_query = "visit/paths/swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb" + + response_ndjson = textwrap.dedent( + """\ + ["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb",\ + "swh:1:cnt:acfb7cabd63b368a03a9df87670ece1488c8bce0"] + ["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb",\ + "swh:1:cnt:2a0837708151d76edf28fdbb90dc3eabc676cff3"] + ["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb",\ + "swh:1:cnt:eaf025ad54b94b2fdda26af75594cfae3491ec75"] + """ + ) + + requests_mock.get( + get_config()["graph"]["server_url"] + graph_query, + text=response_ndjson, + headers={"Content-Type": "application/x-ndjson"}, + ) + + url = reverse("api-1-graph", url_args={"graph_query": graph_query}) + + resp = api_client.get(url) + + assert resp.status_code == 200 + assert resp.content_type == "application/x-ndjson" + assert resp.content == response_ndjson.encode() + + +@given(origin()) +def test_graph_response_resolve_origins( + archive_data, api_client, mocker, requests_mock, origin +): + hasher = hashlib.sha1() + hasher.update(origin["url"].encode()) + origin_sha1 = hasher.hexdigest() + origin_swhid = str(swhid(ORIGIN, origin_sha1)) + snapshot = archive_data.snapshot_get_latest(origin["url"])["id"] + snapshot_swhid = str(swhid(SNAPSHOT, snapshot)) + + _authenticate_graph_user(api_client, mocker) + + for graph_query, response_text, content_type in ( + ( + f"visit/nodes/{snapshot_swhid}", + f"{snapshot_swhid}\n{origin_swhid}\n", + "text/plain", + ), + ( + f"visit/edges/{snapshot_swhid}", + f"{snapshot_swhid} {origin_swhid}\n", + 
"text/plain", + ), + ( + f"visit/paths/{snapshot_swhid}", + f'["{snapshot_swhid}", "{origin_swhid}"]\n', + "application/x-ndjson", + ), + ): + + # set two lines response to check resolved origins cache + response_text = response_text + response_text + + requests_mock.get( + get_config()["graph"]["server_url"] + graph_query, + text=response_text, + headers={"Content-Type": content_type}, + ) + + url = reverse( + "api-1-graph", + url_args={"graph_query": graph_query}, + query_params={"direction": "backward"}, + ) + + resp = api_client.get(url) + + assert resp.status_code == 200 + assert resp.content_type == content_type + assert resp.content == response_text.encode() + + url = reverse( + "api-1-graph", + url_args={"graph_query": graph_query}, + query_params={"direction": "backward", "resolve_origins": "true"}, + ) + + resp = api_client.get(url) + + assert resp.status_code == 200 + assert resp.content_type == content_type + assert ( + resp.content == response_text.replace(origin_swhid, origin["url"]).encode() + ) + + +def test_graph_response_resolve_origins_nothing_to_do( + api_client, mocker, requests_mock +): + _authenticate_graph_user(api_client, mocker) + + graph_query = "stats" + + requests_mock.get( + get_config()["graph"]["server_url"] + graph_query, + json=_response_json, + headers={"Content-Type": "application/json"}, + ) + + url = reverse( + "api-1-graph", + url_args={"graph_query": graph_query}, + query_params={"resolve_origins": "true"}, + ) + + resp = api_client.get(url) + + assert resp.status_code == 200 + assert resp.content_type == "application/json" + assert resp.content == json.dumps(_response_json).encode()