diff --git a/swh/graph/backend.py b/swh/graph/backend.py
--- a/swh/graph/backend.py
+++ b/swh/graph/backend.py
@@ -6,9 +6,7 @@
import asyncio
import contextlib
import io
-import logging
import os
-import pathlib
import struct
import subprocess
import sys
@@ -16,6 +14,7 @@
from py4j.java_gateway import JavaGateway
+from swh.graph.config import check_config
from swh.graph.pid import NodeToPidMap, PidToNodeMap
from swh.model.identifiers import PID_TYPES
@@ -26,30 +25,6 @@
PID2NODE_EXT = 'pid2node.bin'
-def find_graph_jar():
- """find swh-graph.jar, containing the Java part of swh-graph
-
- look both in development directories and installed data (for in-production
- deployments who fecthed the JAR from pypi)
-
- """
- swh_graph_root = pathlib.Path(__file__).parents[2]
- try_paths = [
- swh_graph_root / 'java/target/',
- pathlib.Path(sys.prefix) / 'share/swh-graph/',
- pathlib.Path(sys.prefix) / 'local/share/swh-graph/',
- ]
- for path in try_paths:
- glob = list(path.glob('swh-graph-*.jar'))
- if glob:
- if len(glob) > 1:
- logging.warn('found multiple swh-graph JARs, '
- 'arbitrarily picking one')
- logging.info('using swh-graph JAR: {0}'.format(glob[0]))
- return str(glob[0])
- raise RuntimeError('swh-graph JAR not found. Have you run `make java`?')
-
-
def _get_pipe_stderr():
# Get stderr if possible, or pipe to stdout if running with Jupyter.
try:
@@ -61,28 +36,17 @@
class Backend:
- def __init__(self, graph_path):
+ def __init__(self, graph_path, config=None):
self.gateway = None
self.entry = None
self.graph_path = graph_path
+ self.config = check_config(config or {})
def __enter__(self):
- # TODO: make all of that configurable with sane defaults
- java_opts = [
- '-Xmx200G',
- '-server',
- '-XX:PretenureSizeThreshold=512M',
- '-XX:MaxNewSize=4G',
- '-XX:+UseLargePages',
- '-XX:+UseTransparentHugePages',
- '-XX:+UseNUMA',
- '-XX:+UseTLAB',
- '-XX:+ResizeTLAB',
- ]
self.gateway = JavaGateway.launch_gateway(
java_path=None,
- javaopts=java_opts,
- classpath=find_graph_jar(),
+ javaopts=self.config['java_tool_options'].split(),
+ classpath=self.config['classpath'],
die_on_exit=True,
redirect_stdout=sys.stdout,
redirect_stderr=_get_pipe_stderr(),
diff --git a/swh/graph/cli.py b/swh/graph/cli.py
--- a/swh/graph/cli.py
+++ b/swh/graph/cli.py
@@ -215,7 +215,7 @@
@click.pass_context
def serve(ctx, host, port, graph):
"""run the graph REST service"""
- backend = Backend(graph_path=graph)
+ backend = Backend(graph_path=graph, config=ctx.obj['config'])
app = make_app(backend=backend)
with backend:
diff --git a/swh/graph/config.py b/swh/graph/config.py
new file mode 100644
--- /dev/null
+++ b/swh/graph/config.py
@@ -0,0 +1,104 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import logging
+import psutil
+import sys
+from pathlib import Path
+
+
+def find_graph_jar():
+    """Find swh-graph.jar, containing the Java part of swh-graph.
+
+    Look in the development tree first, then in installed data directories
+    (for production deployments that fetched the JAR from PyPI). Return the
+    path as a string, or raise RuntimeError when no JAR is found.
+    """
+    swh_graph_root = Path(__file__).parents[2]
+    try_paths = [
+        swh_graph_root / 'java/target/',
+        Path(sys.prefix) / 'share/swh-graph/',
+        Path(sys.prefix) / 'local/share/swh-graph/',
+    ]
+    for path in try_paths:
+        jars = sorted(path.glob('swh-graph-*.jar'))  # sorted: stable pick
+        if jars:
+            if len(jars) > 1:
+                logging.warning('found multiple swh-graph JARs, '
+                                'arbitrarily picking one')
+            logging.info('using swh-graph JAR: %s', jars[0])
+            return str(jars[0])
+    raise RuntimeError('swh-graph JAR not found. Have you run `make java`?')
+
+
+def check_config(conf):
+    """Return a copy of ``conf`` with defaults filled in for missing keys.
+
+    ``{max_ram}`` in ``java_tool_options`` is replaced by ``max_ram``.
+    """
+    settings = conf.copy()
+    settings.setdefault('batch_size', '1000000000')  # 1 billion
+    if 'max_ram' not in settings:
+        settings['max_ram'] = str(psutil.virtual_memory().total)
+    default_jvm_flags = [
+        '-Xmx{max_ram}',
+        '-XX:PretenureSizeThreshold=512M',
+        '-XX:MaxNewSize=4G',
+        '-XX:+UseLargePages',
+        '-XX:+UseTransparentHugePages',
+        '-XX:+UseNUMA',
+        '-XX:+UseTLAB',
+        '-XX:+ResizeTLAB',
+    ]
+    jvm_flags = settings.get('java_tool_options', ' '.join(default_jvm_flags))
+    settings['java_tool_options'] = jvm_flags.format(
+        max_ram=settings['max_ram'])
+    settings.setdefault('java', 'java')
+    if 'classpath' not in settings:
+        # only search for the JAR when no classpath was configured
+        settings['classpath'] = find_graph_jar()
+    return settings
+
+
+def check_config_compress(config, graph_name, in_dir, out_dir):
+    """Check compression-specific configuration and set up its environment.
+
+    Creates ``out_dir`` and the temporary directory, writes a default logback
+    file unless ``logback`` is set, and returns the completed configuration.
+    """
+    conf = check_config(config)
+    conf['graph_name'] = graph_name
+    conf['in_dir'] = str(in_dir)
+    conf['out_dir'] = str(out_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+    if 'tmp_dir' not in conf:
+        tmp_dir = out_dir / 'tmp'
+        conf['tmp_dir'] = str(tmp_dir)
+    else:
+        tmp_dir = Path(conf['tmp_dir'])
+    tmp_dir.mkdir(parents=True, exist_ok=True)
+    if 'logback' not in conf:
+        logback_confpath = tmp_dir / 'logback.xml'
+        with open(logback_confpath, 'w') as conffile:
+            conffile.write("""



+ %d %r %p [%t] %logger{1} - %m%n






+""")
+        conf['logback'] = str(logback_confpath)
+
+    # Append the already-resolved path instead of re-running str.format() on
+    # the whole option string, which check_config() has expanded once already.
+    conf['java_tool_options'] += (
+        ' -Dlogback.configurationFile={}'.format(conf['logback']))
+    logging.debug('compression configuration: %s', conf)
+    return conf
diff --git a/swh/graph/tests/test_cli.py b/swh/graph/tests/test_cli.py
--- a/swh/graph/tests/test_cli.py
+++ b/swh/graph/tests/test_cli.py
@@ -43,7 +43,6 @@
graph:
compress:
batch_size: 1000
- java_tool_options: -Dlogback.configurationFile={logback}
""")
tmpconf.close()
self.conffile = Path(tmpconf.name)
diff --git a/swh/graph/webgraph.py b/swh/graph/webgraph.py
--- a/swh/graph/webgraph.py
+++ b/swh/graph/webgraph.py
@@ -15,12 +15,9 @@
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Set
-
-import psutil
-
from click import ParamType
-from swh.graph.backend import find_graph_jar
+from swh.graph.config import check_config_compress
class CompressionStep(Enum):
@@ -162,60 +159,6 @@
return rc
-def check_config(conf, graph_name, in_dir, out_dir):
- """check compression configuration, propagate defaults, and initialize
- execution environment
-
- """
- conf = conf.copy()
- conf['graph_name'] = graph_name
- conf['in_dir'] = str(in_dir)
- conf['out_dir'] = str(out_dir)
- out_dir.mkdir(parents=True, exist_ok=True)
- if 'tmp_dir' not in conf:
- tmp_dir = out_dir / 'tmp'
- conf['tmp_dir'] = str(tmp_dir)
- else:
- tmp_dir = Path(conf['tmp_dir'])
- tmp_dir.mkdir(parents=True, exist_ok=True)
- if 'batch_size' not in conf:
- conf['batch_size'] = '1000000000' # 1 billion
- if 'logback' not in conf:
- logback_confpath = tmp_dir / 'logback.xml'
- with open(logback_confpath, 'w') as conffile:
- conffile.write("""
-
-
-
- %d %r %p [%t] %logger{1} - %m%n
-
-
-
-
-
-
-""")
- conf['logback'] = str(logback_confpath)
- if 'max_ram' not in conf:
- conf['max_ram'] = str(psutil.virtual_memory().total)
- if 'java_tool_options' not in conf:
- assert 'logback' in conf
- conf['java_tool_options'] = ' '.join([
- '-Xmx{max_ram}', '-XX:PretenureSizeThreshold=512M',
- '-XX:MaxNewSize=4G', '-XX:+UseLargePages',
- '-XX:+UseTransparentHugePages', '-XX:+UseNUMA', '-XX:+UseTLAB',
- '-XX:+ResizeTLAB', '-Dlogback.configurationFile={logback}'
- ])
- conf['java_tool_options'] = conf['java_tool_options'].format(
- max_ram=conf['max_ram'], logback=conf['logback'])
- if 'java' not in conf:
- conf['java'] = 'java'
- if 'classpath' not in conf:
- conf['classpath'] = find_graph_jar()
-
- return conf
-
-
def compress(graph_name: str, in_dir: Path, out_dir: Path,
steps: Set[CompressionStep] = set(COMP_SEQ),
conf: Dict[str, str] = {}):
@@ -248,7 +191,7 @@
if not steps:
steps = set(COMP_SEQ)
- conf = check_config(conf, graph_name, in_dir, out_dir)
+ conf = check_config_compress(conf, graph_name, in_dir, out_dir)
compression_start_time = datetime.now()
logging.info(f'starting compression at {compression_start_time}')