diff --git a/docs/api.rst b/docs/api.rst index f6a96d2..50b673a 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -1,276 +1,276 @@ Graph REST API ============== Terminology ----------- This API uses the following notions: - **Node**: a node in the `Software Heritage graph `_, represented by a `persistent identifier `_ (abbreviated as *SWH PID*, or simply *PID*). - **Node type**: the 3-letter specifier from the node PID (``cnt``, ``dir``, ``rel``, ``rev``, ``snp``, ``ori``), or ``*`` for all node types. - **Edge type**: a pair ``src:dst`` where ``src`` and ``dst`` are either node types, or ``*`` to denote all node types. - **Edge restrictions**: a textual specification of which edges can be followed during graph traversal. Either ``*`` to denote that all edges can be followed or a comma separated list of edge types to allow following only those edges. Note that when traversing the *backward* (i.e., transposed) graph, edge types are reversed too. So, for instance, ``ori:snp`` makes sense when traversing the forward graph, but useless (due to lack of matching edges in the graph) when traversing the backward graph; conversely ``snp:ori`` is useful when traversing the backward graph, but not in the forward one. For the same reason ``dir:dir` allows following edges from parent directories to sub-directories when traversing the forward graph, but the same restriction allows following edges from sub-directories to parent directories. Examples ~~~~~~~~ - ``swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2`` the PID of a node of type content containing the full text of the GPL3 license. - ``swh:1:rev:f39d7d78b70e0f39facb1e4fab77ad3df5c52a35`` the PID of a node of type revision corresponding to the commit in Linux that merged the 'x86/urgent' branch on 31 December 2017. - ``"dir:dir,dir:cnt"`` node types allowing edges from directories to directories nodes, or directories to contents nodes. - ``"rev:rev,dir:*"`` node types allowing edges from revisions to revisions nodes, or from directories nodes. - ``"*:rel"`` node types allowing all edges to releases. Leaves ------ .. http:get:: /graph/leaves/:src Performs a graph traversal and returns the leaves of the subgraph rooted at the specified source node. :param string src: source node specified as a SWH PID :query string edges: edges types the traversal can follow; default to ``"*"`` :query string direction: direction in which graph edges will be followed; can be either ``forward`` or ``backward``, default to ``forward`` :statuscode 200: success :statuscode 400: invalid query string provided :statuscode 404: starting node cannot be found .. sourcecode:: http HTTP/1.1 200 OK Content-Type: text/plain Transfer-Encoding: chunked swh:1:cnt:669ac7c32292798644b21dbb5a0dc657125f444d swh:1:cnt:da4cb28febe66172a9fdf1a235525ae6c00cde1d", ... Neighbors --------- .. http:get:: /graph/neighbors/:src Returns node direct neighbors (linked with exactly one edge) in the graph. :param string src: source node specified as a SWH PID :query string edges: edges types allowed to be listed as neighbors; default to ``"*"`` :query string direction: direction in which graph edges will be followed; can be either ``forward`` or ``backward``, default to ``forward`` :statuscode 200: success :statuscode 400: invalid query string provided :statuscode 404: starting node cannot be found .. sourcecode:: http HTTP/1.1 200 OK Content-Type: text/plain Transfer-Encoding: chunked swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2 swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505 ... Walk ---- -.. http:get:: /graph/walk/:src/:dst - - Performs a graph traversal and returns the first found path from source to - destination (final destination node included). - - :param string src: starting node specified as a SWH PID - :param string dst: destination node, either as a node PID or a node type. - The traversal will stop at the first node encountered matching the - desired destination. - - :query string edges: edges types the traversal can follow; default to - ``"*"`` - :query string traversal: traversal algorithm; can be either ``dfs`` or - ``bfs``, default to ``dfs`` - :query string direction: direction in which graph edges will be followed; - can be either ``forward`` or ``backward``, default to ``forward`` - - :statuscode 200: success - :statuscode 400: invalid query string provided - :statuscode 404: starting node cannot be found - - .. sourcecode:: http - - HTTP/1.1 200 OK - Content-Type: text/plain - Transfer-Encoding: chunked - - swh:1:rev:f39d7d78b70e0f39facb1e4fab77ad3df5c52a35 - swh:1:rev:52c90f2d32bfa7d6eccd66a56c44ace1f78fbadd - swh:1:rev:cea92e843e40452c08ba313abc39f59efbb4c29c - swh:1:rev:8d517bdfb57154b8a11d7f1682ecc0f79abf8e02 - ... - +.. + .. http:get:: /graph/walk/:src/:dst + + Performs a graph traversal and returns the first found path from source to + destination (final destination node included). + + :param string src: starting node specified as a SWH PID + :param string dst: destination node, either as a node PID or a node type. + The traversal will stop at the first node encountered matching the + desired destination. + + :query string edges: edges types the traversal can follow; default to + ``"*"`` + :query string traversal: traversal algorithm; can be either ``dfs`` or + ``bfs``, default to ``dfs`` + :query string direction: direction in which graph edges will be followed; + can be either ``forward`` or ``backward``, default to ``forward`` + + :statuscode 200: success + :statuscode 400: invalid query string provided + :statuscode 404: starting node cannot be found + + .. sourcecode:: http + + HTTP/1.1 200 OK + Content-Type: text/plain + Transfer-Encoding: chunked + + swh:1:rev:f39d7d78b70e0f39facb1e4fab77ad3df5c52a35 + swh:1:rev:52c90f2d32bfa7d6eccd66a56c44ace1f78fbadd + swh:1:rev:cea92e843e40452c08ba313abc39f59efbb4c29c + swh:1:rev:8d517bdfb57154b8a11d7f1682ecc0f79abf8e02 + ... .. http:get:: /graph/randomwalk/:src/:dst Performs a graph *random* traversal, i.e., picking one random successor node at each hop, from source to destination (final destination node included). :param string src: starting node specified as a SWH PID :param string dst: destination node, either as a node PID or a node type. The traversal will stop at the first node encountered matching the desired destination. :query string edges: edges types the traversal can follow; default to ``"*"`` :query string direction: direction in which graph edges will be followed; can be either ``forward`` or ``backward``, default to ``forward`` :statuscode 200: success :statuscode 400: invalid query string provided :statuscode 404: starting node cannot be found .. sourcecode:: http HTTP/1.1 200 OK Content-Type: text/plain Transfer-Encoding: chunked swh:1:rev:f39d7d78b70e0f39facb1e4fab77ad3df5c52a35 swh:1:rev:52c90f2d32bfa7d6eccd66a56c44ace1f78fbadd swh:1:rev:cea92e843e40452c08ba313abc39f59efbb4c29c swh:1:rev:8d517bdfb57154b8a11d7f1682ecc0f79abf8e02 ... Visit ----- .. http:get:: /graph/visit/nodes/:src .. http:get:: /graph/visit/paths/:src Performs a graph traversal and returns explored nodes or paths (in the order of the traversal). :param string src: starting node specified as a SWH PID :query string edges: edges types the traversal can follow; default to ``"*"`` :query string direction: direction in which graph edges will be followed; can be either ``forward`` or ``backward``, default to ``forward`` :statuscode 200: success :statuscode 400: invalid query string provided :statuscode 404: starting node cannot be found .. sourcecode:: http GET /graph/visit/nodes/... HTTP/1.1 200 OK Content-Type: text/plain Transfer-Encoding: chunked swh:1:rev:f39d7d78b70e0f39facb1e4fab77ad3df5c52a35 swh:1:rev:52c90f2d32bfa7d6eccd66a56c44ace1f78fbadd ... swh:1:rev:a31e58e129f73ab5b04016330b13ed51fde7a961 ... .. sourcecode:: http GET /graph/visit/paths/... HTTP/1.1 200 OK Content-Type: application/x-ndjson Transfer-Encoding: chunked ["swh:1:rev:f39d7d78b70e0f39facb1e4fab77ad3df5c52a35", "swh:1:rev:52c90f2d32bfa7d6eccd66a56c44ace1f78fbadd", ...] ["swh:1:rev:f39d7d78b70e0f39facb1e4fab77ad3df5c52a35", "swh:1:rev:a31e58e129f73ab5b04016330b13ed51fde7a961", ...] ... Counting results ---------------- The following method variants, with trailing ``/count`` added, behave like the already discussed methods but, instead of returning results, return the *amount* of results that would have been returned: .. http:get:: /graph/leaves/count/:src .. http:get:: /graph/neighbors/count/:src .. http:get:: /graph/visit/nodes/count/:src Stats ----- .. http:get:: /graph/stats Returns statistics on the compressed graph. :statuscode 200: success .. sourcecode:: http HTTP/1.1 200 OK Content-Type: application/json { "counts": { "nodes": 16222788, "edges": 9907464 }, "ratios": { "compression": 0.367, "bits_per_node": 5.846, "bits_per_edge": 9.573, "avg_locality": 270.369 }, "indegree": { "min": 0, "max": 12382, "avg": 0.6107127825377487 }, "outdegree": { "min": 0, "max": 1, "avg": 0.6107127825377487 } } diff --git a/swh/graph/server/app.py b/swh/graph/server/app.py index 44a33e0..797f3ab 100644 --- a/swh/graph/server/app.py +++ b/swh/graph/server/app.py @@ -1,216 +1,219 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """ A proxy HTTP server for swh-graph, talking to the Java code via py4j, and using FIFO as a transport to stream integers between the two languages. """ import asyncio import json import aiohttp.web from swh.core.api.asynchronous import RPCServerApp from swh.model.identifiers import PID_TYPES from swh.model.exceptions import ValidationError try: from contextlib import asynccontextmanager except ImportError: # Compatibility with 3.6 backport from async_generator import asynccontextmanager # type: ignore # maximum number of retries for random walks RANDOM_RETRIES = 5 # TODO make this configurable via rpc-serve configuration @asynccontextmanager async def stream_response(request, content_type='text/plain', *args, **kwargs): response = aiohttp.web.StreamResponse(*args, **kwargs) response.content_type = content_type await response.prepare(request) yield response await response.write_eof() async def index(request): return aiohttp.web.Response( content_type='text/html', body=""" Software Heritage storage server

You have reached the Software Heritage graph API server.

See its API documentation for more information.

""") async def stats(request): stats = request.app['backend'].stats() return aiohttp.web.Response(body=stats, content_type='application/json') def get_direction(request): """validate HTTP query parameter `direction`""" s = request.query.get('direction', 'forward') if s not in ('forward', 'backward'): raise aiohttp.web.HTTPBadRequest(body=f'invalid direction: {s}') return s def get_edges(request): """validate HTTP query parameter `edges`, i.e., edge restrictions""" s = request.query.get('edges', '*') if any([node_type != '*' and node_type not in PID_TYPES for edge in s.split(':') for node_type in edge.split(',', maxsplit=1)]): raise aiohttp.web.HTTPBadRequest(body=f'invalid edge restriction: {s}') return s def get_traversal(request): """validate HTTP query parameter `traversal`, i.e., visit order""" s = request.query.get('traversal', 'dfs') if s not in ('bfs', 'dfs'): raise aiohttp.web.HTTPBadRequest(body=f'invalid traversal order: {s}') return s def node_of_pid(pid, backend): """lookup a PID in a pid2node map, failing in an HTTP-nice way if needed""" try: return backend.pid2node[pid] except KeyError: raise aiohttp.web.HTTPNotFound(body=f'PID not found: {pid}') except ValidationError: raise aiohttp.web.HTTPBadRequest(body=f'malformed PID: {pid}') def pid_of_node(node, backend): """lookup a node in a node2pid map, failing in an HTTP-nice way if needed """ try: return backend.node2pid[node] except KeyError: raise aiohttp.web.HTTPInternalServerError( body=f'reverse lookup failed for node id: {node}') def get_simple_traversal_handler(ttype): async def simple_traversal(request): backend = request.app['backend'] src = request.match_info['src'] edges = get_edges(request) direction = get_direction(request) src_node = node_of_pid(src, backend) async with stream_response(request) as response: async for res_node in backend.simple_traversal( ttype, direction, edges, src_node ): res_pid = pid_of_node(res_node, backend) await response.write('{}\n'.format(res_pid).encode()) return response return simple_traversal def get_walk_handler(random=False): async def walk(request): backend = request.app['backend'] src = request.match_info['src'] dst = request.match_info['dst'] edges = get_edges(request) direction = get_direction(request) algo = get_traversal(request) src_node = node_of_pid(src, backend) if dst not in PID_TYPES: dst = node_of_pid(dst, backend) async with stream_response(request) as response: if random: it = backend.random_walk(direction, edges, RANDOM_RETRIES, src_node, dst) else: it = backend.walk(direction, edges, algo, src_node, dst) async for res_node in it: res_pid = pid_of_node(res_node, backend) await response.write('{}\n'.format(res_pid).encode()) return response return walk async def visit_paths(request): backend = request.app['backend'] src = request.match_info['src'] edges = get_edges(request) direction = get_direction(request) src_node = node_of_pid(src, backend) it = backend.visit_paths(direction, edges, src_node) async with stream_response(request, content_type='application/x-ndjson') \ as response: async for res_path in it: res_path_pid = [pid_of_node(n, backend) for n in res_path] line = json.dumps(res_path_pid) await response.write('{}\n'.format(line).encode()) return response def get_count_handler(ttype): async def count(request): loop = asyncio.get_event_loop() backend = request.app['backend'] src = request.match_info['src'] edges = get_edges(request) direction = get_direction(request) src_node = node_of_pid(src, backend) cnt = await loop.run_in_executor( None, backend.count, ttype, direction, edges, src_node) return aiohttp.web.Response(body=str(cnt), content_type='application/json') return count def make_app(backend, **kwargs): app = RPCServerApp(**kwargs) app.router.add_get('/', index) app.router.add_get('/graph/stats', stats) app.router.add_get('/graph/leaves/{src}', get_simple_traversal_handler('leaves')) app.router.add_get('/graph/neighbors/{src}', get_simple_traversal_handler('neighbors')) app.router.add_get('/graph/visit/nodes/{src}', get_simple_traversal_handler('visit_nodes')) app.router.add_get('/graph/visit/paths/{src}', visit_paths) - app.router.add_get('/graph/walk/{src}/{dst}', - get_walk_handler(random=False)) + + # temporarily disabled in wait of a proper fix for T1969 + # app.router.add_get('/graph/walk/{src}/{dst}', + # get_walk_handler(random=False)) + app.router.add_get('/graph/randomwalk/{src}/{dst}', get_walk_handler(random=True)) app.router.add_get('/graph/neighbors/count/{src}', get_count_handler('neighbors')) app.router.add_get('/graph/leaves/count/{src}', get_count_handler('leaves')) app.router.add_get('/graph/visit/nodes/count/{src}', get_count_handler('visit_nodes')) app['backend'] = backend return app