# Post-patch content of swh/graph/cli.py, reconstructed from:
#   diff --git a/swh/graph/cli.py b/swh/graph/cli.py
#   index bb0442d..1c83b93 100644

# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import aiohttp
import click
import sys

from pathlib import Path
from typing import Any, Dict, Tuple

from swh.core import config
from swh.core.cli import CONTEXT_SETTINGS, AliasedGroup
from swh.graph import client, webgraph
from swh.graph.pid import PidToIntMap, IntToPidMap
from swh.graph.server.app import make_app
from swh.graph.backend import Backend


class PathlibPath(click.Path):
    """A Click path argument that returns a pathlib Path, not a string"""

    def convert(self, value, param, ctx):
        # Let click.Path do its own validation/conversion first, then wrap
        # the result in a pathlib.Path.
        return Path(super().convert(value, param, ctx))


# Default configuration handed to config.read() by the `cli` group; the
# 'graph' stanza defaults to an empty dict.
DEFAULT_CONFIG = {
    'graph': ('dict', {})
}  # type: Dict[str, Tuple[str, Any]]


# NOTE: docstrings of click commands/groups are shown to users as help text,
# so they are kept short; implementation notes live in comments instead.
@click.group(name='graph', context_settings=CONTEXT_SETTINGS,
             cls=AliasedGroup)
@click.option('--config-file', '-C', default=None,
              type=click.Path(exists=True, dir_okay=False,),
              help='YAML configuration file')
@click.pass_context
def cli(ctx, config_file):
    """Software Heritage graph tools."""
    ctx.ensure_object(dict)

    # Read the configuration and stash it in the click context so that
    # sub-commands can retrieve it from ctx.obj['config'].
    conf = config.read(config_file, DEFAULT_CONFIG)
    if 'graph' not in conf:
        raise ValueError('no "graph" stanza found in configuration file %s'
                         % config_file)
    ctx.obj['config'] = conf


@cli.command('api-client')
@click.option('--host', default='localhost', help='Graph server host')
@click.option('--port', default='5009', help='Graph server port')
@click.pass_context
def api_client(ctx, host, port):
    """client for the graph REST service"""
    url = 'http://{}:{}'.format(host, port)
    app = client.RemoteGraphClient(url)

    # TODO: run web app
    print(app.stats())


@cli.group('map')
@click.pass_context
def map(ctx):
    """Manage swh-graph on-disk maps"""
    pass


def dump_pid2int(filename):
    """Print the PID->int map stored in filename, one tab-separated
    "PID<TAB>int" record per line.

    """
    # `node_id` rather than `int`: do not shadow the builtin.
    for (pid, node_id) in PidToIntMap(filename):
        print('{}\t{}'.format(pid, node_id))


def dump_int2pid(filename):
    """Print the int->PID map stored in filename, one tab-separated
    "int<TAB>PID" record per line.

    """
    # `node_id` rather than `int`: do not shadow the builtin.
    for (node_id, pid) in IntToPidMap(filename):
        print('{}\t{}'.format(node_id, pid))


def restore_pid2int(filename):
    """read a textual PID->int map from stdin and write its binary version to
    filename

    """
    with open(filename, 'wb') as dst:
        for line in sys.stdin:
            # each line holds two whitespace-separated fields: PID, int
            (str_pid, str_int) = line.split()
            PidToIntMap.write_record(dst, str_pid, int(str_int))


def restore_int2pid(filename, length):
    """read a textual int->PID map from stdin and write its binary version to
    filename

    Args:
        filename: path of the binary map to write
        length: map size, passed through to IntToPidMap

    """
    int2pid = IntToPidMap(filename, mode='wb', length=length)
    try:
        for line in sys.stdin:
            # each line holds two whitespace-separated fields: int, PID
            (str_int, str_pid) = line.split()
            int2pid[int(str_int)] = str_pid
    finally:
        # close the map even when a malformed input line raises
        int2pid.close()


@map.command('dump')
@click.option('--type', '-t', 'map_type', required=True,
              type=click.Choice(['pid2int', 'int2pid']),
              help='type of map to dump')
@click.argument('filename', required=True, type=click.Path(exists=True))
@click.pass_context
def dump_map(ctx, map_type, filename):
    """dump a binary PID<->int map to textual format"""
    if map_type == 'pid2int':
        dump_pid2int(filename)
    elif map_type == 'int2pid':
        dump_int2pid(filename)
    else:
        # not reachable through click (click.Choice validates map_type);
        # kept as a guard for direct callers
        raise ValueError('invalid map type: ' + map_type)


@map.command('restore')
@click.option('--type', '-t', 'map_type', required=True,
              type=click.Choice(['pid2int', 'int2pid']),
              help='type of map to restore')
@click.option('--length', '-l', type=int,
              help='''map size in number of logical records
              (required for int2pid maps)''')
@click.argument('filename', required=True, type=click.Path())
@click.pass_context
def restore_map(ctx, map_type, length, filename):
    """restore a binary PID<->int map from textual format"""
    if map_type == 'pid2int':
        restore_pid2int(filename)
    elif map_type == 'int2pid':
        # --length is only mandatory for int2pid maps, so it cannot be
        # declared required=True at the option level
        if length is None:
            raise click.UsageError(
                'map length is required when restoring {} maps'.format(
                    map_type), ctx)
        restore_int2pid(filename, length)
    else:
        # not reachable through click (click.Choice validates map_type);
        # kept as a guard for direct callers
        raise ValueError('invalid map type: ' + map_type)
# NOTE: docstrings of click commands are shown to users as help text, so
# they are kept as-is; implementation notes live in comments instead.
@cli.command(name='rpc-serve')
@click.option('--host', '-h', default='0.0.0.0',
              metavar='IP', show_default=True,
              help='host IP address to bind the server on')
@click.option('--port', '-p', default=5009, type=click.INT,
              metavar='PORT', show_default=True,
              help='port to bind the server on')
@click.option('--graph', '-g', required=True, metavar='GRAPH',
              help='compressed graph basename')
@click.pass_context
def serve(ctx, host, port, graph):
    """run the graph REST service"""
    backend = Backend(graph_path=graph)
    app = make_app(backend=backend)

    # the backend is used as a context manager around the whole lifetime of
    # the web application
    with backend:
        aiohttp.web.run_app(app, host=host, port=port)


@cli.command()
@click.option('--graph', '-g', required=True, metavar='GRAPH',
              type=PathlibPath(),
              help='input graph basename')
@click.option('--outdir', '-o', 'out_dir', required=True, metavar='DIR',
              type=PathlibPath(),
              help='directory where to store compressed graph')
@click.option('--steps', '-s', metavar='STEPS', type=webgraph.StepOption(),
              help='run only these compression steps (default: all steps)')
@click.pass_context
def compress(ctx, graph, out_dir, steps):
    """Compress a graph using WebGraph

    Input: a pair of files g.nodes.csv.gz, g.edges.csv.gz

    Output: a directory containing a WebGraph compressed graph

    Compression steps are: (1) mph, (2) bv, (3) bv_obl, (4) bfs, (5) permute,
    (6) permute_obl, (7) stats, (8) transpose, (9) transpose_obl. Compression
    steps can be selected by name or number using --steps, separating them with
    commas; step ranges (e.g., 3-9, 6-, etc.) are also supported.

    """
    # `graph` is a basename: split it into input directory and graph name
    graph_name = graph.name
    in_dir = graph.parent
    try:
        conf = ctx.obj['config']['graph']['compress']
    except KeyError:
        conf = {}  # use defaults

    webgraph.compress(conf, graph_name, in_dir, out_dir, steps)


def main():
    """Entry point: run the `cli` group, reading option defaults from
    SWH_GRAPH-prefixed environment variables.

    """
    return cli(auto_envvar_prefix='SWH_GRAPH')


if __name__ == '__main__':
    main()


# ---------------------------------------------------------------------------
# Post-patch content of swh/graph/tests/test_cli.py (a separate file),
# reconstructed from:
#   diff --git a/swh/graph/tests/test_cli.py b/swh/graph/tests/test_cli.py
#   index 522a060..93d52d0 100644
# ---------------------------------------------------------------------------

# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import unittest

from pathlib import Path
from tempfile import TemporaryDirectory, NamedTemporaryFile
from typing import Dict

from click.testing import CliRunner

from swh.core import config
from swh.graph import cli


def read_properties(properties_fname) -> Dict[str, str]:
    """read a Java .properties file

    Lines starting with '#' and blank lines are skipped; every other line is
    split on its first '=' into a (key, value) pair.

    """
    properties = {}
    with open(properties_fname) as f:
        for line in f:
            # skip comments, and blank lines (which cannot be unpacked into
            # a key/value pair)
            if line.startswith('#') or not line.strip():
                continue
            (key, value) = line.rstrip().split('=', maxsplit=1)
            properties[key] = value

    return properties


class TestCompress(unittest.TestCase):
    """End-to-end test of the `compress` CLI command on the example dataset"""

    DATA_DIR = Path(__file__).parents[0] / 'dataset'

    def setUp(self):
        self.runner = CliRunner()

        # delete=False: the file must outlive the `with` block, as it is read
        # back below; it is removed in tearDown
        with NamedTemporaryFile(mode='w', delete=False,
                                prefix='swh-graph-test',
                                suffix='.yml') as tmpconf:
            # bare bone configuration, to allow testing the compression
            # pipeline with minimum RAM requirements on trivial graphs
            # NOTE(review): {logback} is written verbatim here; presumably it
            # is substituted later by swh.graph.webgraph -- confirm
            tmpconf.write("""
graph:
  compress:
    batch_size: 1000
    java_tool_options: -Dlogback.configurationFile={logback}
""")
        self.conffile = Path(tmpconf.name)
        self.config = config.read(self.conffile, cli.DEFAULT_CONFIG)

    def tearDown(self):
        if self.conffile.exists():
            self.conffile.unlink()

    def test_pipeline(self):
        """run full compression pipeline"""
        with TemporaryDirectory(suffix='.swh-graph-test') as tmpdir:
            # `compress` is invoked directly (not through the `cli` group),
            # so the configuration must be injected via `obj`
            result = self.runner.invoke(
                cli.compress,
                ['--graph', str(self.DATA_DIR / 'example'),
                 '--outdir', tmpdir],
                obj={'config': self.config})

            self.assertEqual(result.exit_code, 0)
            properties = read_properties(Path(tmpdir) / 'example.properties')
            self.assertEqual(int(properties['nodes']), 21)
            self.assertEqual(int(properties['arcs']), 23)