diff --git a/dockerfiles/Dockerfile b/dockerfiles/Dockerfile index 43aca7a..7287785 100644 --- a/dockerfiles/Dockerfile +++ b/dockerfiles/Dockerfile @@ -1,33 +1,32 @@ FROM maven:3.6.0-jdk-11 # Java global config ARG MAX_RAM=2800G ENV JAVA_TOOL_OPTIONS \ -Xmx${MAX_RAM} -XX:PretenureSizeThreshold=512M -XX:MaxNewSize=4G \ -XX:+UseLargePages -XX:+UseTransparentHugePages -XX:+UseNUMA \ -XX:+UseTLAB -XX:+ResizeTLAB \ -Dlogback.configurationFile=configuration/logback.xml # Monitoring RUN apt-get update RUN apt-get install -y time # Download third party binaries and dependencies WORKDIR /swh/graph-lib RUN curl -O http://webgraph.di.unimi.it/webgraph-big-3.5.1-bin.tar.gz RUN tar xvfz webgraph-big-3.5.1-bin.tar.gz RUN cp webgraph-big-3.5.1/webgraph-big-3.5.1.jar . RUN curl -O http://webgraph.di.unimi.it/webgraph-big-deps.tar.gz RUN tar xvfz webgraph-big-deps.tar.gz # temporary, SWH-specific version of law, waiting for changes to be integrated # upstream RUN curl -o law-2.5.1.jar https://forge.softwareheritage.org/file/download/jizfo55fndw54iwr645c/PHID-FILE-kjtrkzrwotlkh6mtkazj/law-2.5.1_swh1.jar # Add user files WORKDIR /swh/app COPY configuration configuration/ COPY scripts scripts/ - diff --git a/dockerfiles/scripts/compress_graph.sh b/dockerfiles/scripts/compress_graph.sh index 67cd811..8f15612 100755 --- a/dockerfiles/scripts/compress_graph.sh +++ b/dockerfiles/scripts/compress_graph.sh @@ -1,93 +1,95 @@ #!/bin/bash usage() { echo "Usage: --input --output --lib " echo " options:" echo " -t, --tmp (default to /tmp/)" echo " --stdout (default to ./stdout)" echo " --stderr (default to ./stderr)" + echo " --batch-size (default to 10^6): WebGraph internals" exit 1 } graph_path="" out_dir="" lib_dir="" tmp_dir="/tmp/" stdout_file="stdout" stderr_file="stderr" +batch_size=1000000 while (( "$#" )); do case "$1" in -i|--input) shift; graph_path=$1;; -o|--output) shift; out_dir=$1;; -l|--lib) shift; lib_dir=$1;; -t|--tmp) shift; tmp_dir=$1;; --stdout) shift; stdout_file=$1;; 
--stderr) shift; stderr_file=$1;; + --batch-size) shift; batch_size=$1;; *) usage;; esac shift done if [[ -z $graph_path || -z $out_dir || -z $lib_dir ]]; then usage fi if [[ -f "$stdout_file" || -f "$stderr_file" ]]; then echo "Cannot overwrite previous compression stdout/stderr files" exit 1 fi dataset=$(basename $graph_path) compr_graph_path="$out_dir/$dataset" mkdir -p $out_dir mkdir -p $tmp_dir java_cmd () { /usr/bin/time -v java -cp $lib_dir/'*' $* } { # Build a function (MPH) that maps node names to node numbers in # lexicographic order (output: .mph) java_cmd it.unimi.dsi.sux4j.mph.GOVMinimalPerfectHashFunction \ --zipped $compr_graph_path.mph --temp-dir $tmp_dir \ $graph_path.nodes.csv.gz && # Build the graph in BVGraph format (output: .{graph,offsets,properties}) java_cmd it.unimi.dsi.big.webgraph.ScatteredArcsASCIIGraph \ --function $compr_graph_path.mph --temp-dir $tmp_dir \ --zipped $compr_graph_path-bv < $graph_path.edges.csv.gz && # Build the offset big-list file to load the graph faster (output: .obl) java_cmd it.unimi.dsi.big.webgraph.BVGraph \ --list $compr_graph_path-bv && # Find a better permutation using a BFS traversal order (output: .order) java_cmd it.unimi.dsi.law.graph.BFSBig \ $compr_graph_path-bv $compr_graph_path.order && # Permute the graph accordingly - batch_size=1000000000 && java_cmd it.unimi.dsi.big.webgraph.Transform mapOffline \ $compr_graph_path-bv $compr_graph_path \ $compr_graph_path.order $batch_size $tmp_dir && java_cmd it.unimi.dsi.big.webgraph.BVGraph \ --list $compr_graph_path && # Compute graph statistics (output: .{indegree,outdegree,stats}) java_cmd it.unimi.dsi.big.webgraph.Stats $compr_graph_path && # Create transposed graph (to allow backward traversal) java_cmd it.unimi.dsi.big.webgraph.Transform transposeOffline \ $compr_graph_path $compr_graph_path-transposed \ $batch_size $tmp_dir && java_cmd it.unimi.dsi.big.webgraph.BVGraph \ --list $compr_graph_path-transposed } >> $stdout_file 2>> $stderr_file if [[ $? 
-eq 0 ]]; then echo "Graph compression done." else echo "Graph compression failed: see $stderr_file for more info." exit 1 fi diff --git a/docs/docker.rst b/docs/docker.rst index 30278ea..a36b481 100644 --- a/docs/docker.rst +++ b/docs/docker.rst @@ -1,75 +1,79 @@ Graph Docker environment ======================== Build ----- .. code:: bash $ git clone https://forge.softwareheritage.org/source/swh-graph.git $ cd swh-graph $ docker build --tag swh-graph dockerfiles Run --- Given a graph specified by: - ``g.edges.csv.gz``: gzip-compressed csv file with one edge per line, as a "SRC_ID SPACE DST_ID" string, where identifiers are the `persistent identifier <https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html>`_ of each node. - ``g.nodes.csv.gz``: sorted list of unique node identifiers appearing in the corresponding ``g.edges.csv.gz`` file. The format is a gzip-compressed csv file with one persistent identifier per line. .. code:: bash $ docker run \ --volume /path/to/graph/:/graph \ --volume /path/to/output/:/graph/compressed \ --name swh-graph --tty --interactive \ swh-graph:latest bash Where ``/path/to/graph`` is a directory containing the ``g.edges.csv.gz`` and ``g.nodes.csv.gz`` files. Graph compression ~~~~~~~~~~~~~~~~~ To start graph compression: .. code:: bash $ ./scripts/compress_graph.sh \ --input /graph/g \ --output /graph/compressed \ --lib /swh/graph-lib \ --tmp /graph/compressed/tmp \ --stdout /graph/compressed/stdout \ --stderr /graph/compressed/stderr +Warning: very large graphs may need a bigger batch size parameter for WebGraph +internals (you can specify a value when running the compression script using: +``--batch-size 1000000000``). + Node ids mapping ~~~~~~~~~~~~~~~~ To dump the mapping files: .. code:: bash $ java -cp /swh/app/swh-graph.jar \ org.softwareheritage.graph.backend.Setup /graph/compressed/g This command outputs: - ``g.nodeToSwhMap.csv``: long node id to string persistent identifier. - ``g.swhToNodeMap.csv``: string persistent identifier to long node id.
REST API ~~~~~~~~ To start the REST API web-service: .. code:: bash $ java -cp /swh/app/swh-graph.jar \ org.softwareheritage.graph.App /graph/compressed/g