# Patch overview (text before the first "diff --git" header is ignored by
# git-apply and patch):
#
#  * api/server/src/test/dataset/generate_graph.sh
#      The test container is now started with "--output /data" and
#      "--lib /swh/graph-lib".  The bind-mount source "$(pwd)" is quoted so
#      a working directory containing whitespace stays a single argument.
#
#  * dockerfiles/Dockerfile
#      JVM flags are set once through ENV JAVA_TOOL_OPTIONS; the heap limit
#      comes from the build argument MAX_RAM (default 2800G).  Third-party
#      jars (webgraph-big 3.5.1, webgraph-big-deps, law 2.5.1) are fetched
#      into /swh/graph-lib, and configuration/ and scripts/ are copied into
#      /swh/app.
#
#  * dockerfiles/scripts/compress_graph.sh
#      java_cmd no longer carries JVM flags of its own; it only wraps
#      "java -cp <lib>/*" in /usr/bin/time -v.  Its arguments are forwarded
#      with "$@" so each one reaches java as the same separate word the
#      caller passed, without a second round of splitting or globbing.
#
# NOTE(review): line breaks and indentation inside the hunks were re-flowed
# to one source line per diff line; regenerate the patch with "git diff"
# if it has to apply without whitespace tolerance.
diff --git a/api/server/src/test/dataset/generate_graph.sh b/api/server/src/test/dataset/generate_graph.sh
index bb5fdb4..ffc7f07 100755
--- a/api/server/src/test/dataset/generate_graph.sh
+++ b/api/server/src/test/dataset/generate_graph.sh
@@ -1,21 +1,21 @@
 #!/bin/bash
 
 # Build Docker work environment
 toplevel_dir=`git rev-parse --show-toplevel`
 mkdir -p dockerfiles
 cp -r $toplevel_dir/dockerfiles/ .
 docker build --tag swh-graph-test dockerfiles
 
 # Setup input for compression script
 tr ' ' '\n' < graph.edges.csv | sort -u > graph.nodes.csv
 gzip --force --keep graph.edges.csv
 gzip --force --keep graph.nodes.csv
 
 # Setup output
 rm -f stderr stdout
 
-docker run \
-    --name swh-graph-test --rm --tty --interactive \
-    --volume $(pwd):/data swh-graph-test:latest \
-    ./scripts/compress_graph.sh \
-    --input /data/graph --output /data/ --lib /graph-lib/
+docker run \
+    --name swh-graph-test --rm --tty --interactive \
+    --volume "$(pwd)":/data swh-graph-test:latest \
+    ./scripts/compress_graph.sh \
+    --input /data/graph --output /data --lib /swh/graph-lib
diff --git a/dockerfiles/Dockerfile b/dockerfiles/Dockerfile
index c455fd9..9d82911 100644
--- a/dockerfiles/Dockerfile
+++ b/dockerfiles/Dockerfile
@@ -1,19 +1,33 @@
 FROM maven:3.6.0-jdk-11
 
-WORKDIR /graph-lib
-# Download webgraph binary
-RUN curl -O http://webgraph.di.unimi.it/webgraph-big-3.5.0-bin.tar.gz
-RUN tar xvfz webgraph-big-3.5.0-bin.tar.gz
-RUN cp webgraph-big-3.5.0/webgraph-big-3.5.0.jar .
-
-# Download webgraph dependencies
-RUN curl -O http://webgraph.di.unimi.it/webgraph-big-deps.tar.gz
-RUN tar xvfz webgraph-big-deps.tar.gz
+# Java global config
+ARG MAX_RAM=2800G
+ENV JAVA_TOOL_OPTIONS \
+    -Xmx${MAX_RAM} -XX:PretenureSizeThreshold=512M -XX:MaxNewSize=4G \
+    -XX:+UseLargePages -XX:+UseTransparentHugePages -XX:+UseNUMA \
+    -XX:+UseTLAB -XX:+ResizeTLAB \
+    -Dlogback.configurationFile=configuration/logback.xml
 
 # Monitoring
 RUN apt-get update
 RUN apt-get install -y time
 
-WORKDIR /graph
+# Download third party binaries and dependencies
+WORKDIR /swh/graph-lib
+
+RUN curl -O http://webgraph.di.unimi.it/webgraph-big-3.5.1-bin.tar.gz
+RUN tar xvfz webgraph-big-3.5.1-bin.tar.gz
+RUN cp webgraph-big-3.5.1/webgraph-big-3.5.1.jar .
+
+RUN curl -O http://webgraph.di.unimi.it/webgraph-big-deps.tar.gz
+RUN tar xvfz webgraph-big-deps.tar.gz
+
+RUN curl -O http://law.di.unimi.it/software/download/law-2.5.1-bin.tar.gz
+RUN tar xvfz law-2.5.1-bin.tar.gz
+RUN cp law-2.5.1/law-2.5.1.jar .
+
+# Add user files
+WORKDIR /swh/app
 COPY configuration configuration/
 COPY scripts scripts/
+
diff --git a/dockerfiles/scripts/compress_graph.sh b/dockerfiles/scripts/compress_graph.sh
index 5b79cb3..7ad2168 100755
--- a/dockerfiles/scripts/compress_graph.sh
+++ b/dockerfiles/scripts/compress_graph.sh
@@ -1,87 +1,82 @@
 #!/bin/bash
 
 usage() {
     echo "Usage: --input --output --lib "
     echo " options:"
     echo " -t, --tmp (default to /tmp/)"
     exit 1
 }
 
 graph_path=""
 out_dir=""
 lib_path=""
 tmp_dir="/tmp/"
 while (( "$#" )); do
     case "$1" in
         -i|--input) shift; graph_path=$1;;
         -o|--output) shift; out_dir=$1;;
         -l|--lib) shift; lib_path=$1;;
         -t|--tmp) shift; tmp_dir=$1;;
         *) usage;;
     esac
     shift
 done
 
 if [[ -z $graph_path || -z $out_dir || -z $lib_path ]]; then
     usage
 fi
 
 dataset=$(basename $graph_path)
 compr_graph_path="$out_dir/$dataset"
 stdout_file="$out_dir/stdout"
 stderr_file="$out_dir/stderr"
 
 mkdir -p $out_dir
 mkdir -p $tmp_dir
 
 if [[ -f "$stdout_file" || -f "$stderr_file" ]]; then
     echo "Cannot overwrite compression stdout/stderr files"
     exit 1
 fi
 
 java_cmd () {
-    /usr/bin/time -v java \
-        -server -Xmx1024G -XX:PretenureSizeThreshold=512M \
-        -XX:MaxNewSize=4G -XX:+UseLargePages -XX:+UseNUMA \
-        -XX:+UseTransparentHugePages -XX:+UseTLAB -XX:+ResizeTLAB \
-        -Dlogback.configurationFile=configuration/logback.xml \
-        -cp $lib_path/'*' $*
+    /usr/bin/time -v java -cp "$lib_path"/'*' "$@"
 }
 
 {
     # Build a function (MPH) that maps node names to node numbers in
     # lexicographic order (output: .mph)
     java_cmd it.unimi.dsi.sux4j.mph.GOVMinimalPerfectHashFunction \
         --zipped $compr_graph_path.mph --temp-dir $tmp_dir \
         $graph_path.nodes.csv.gz ;
 
     # Build the graph in BVGraph format (output: .{graph,offsets,properties})
     java_cmd it.unimi.dsi.big.webgraph.ScatteredArcsASCIIGraph \
         --function $compr_graph_path.mph --temp-dir $tmp_dir \
         --zipped $compr_graph_path-bv < $graph_path.edges.csv.gz ;
     # Build the offset big-list file to load the graph faster (output: .obl)
     java_cmd it.unimi.dsi.big.webgraph.BVGraph \
         --list $compr_graph_path-bv ;
 
     # Find a better permutation using a BFS traversal order (output: .order)
     java_cmd it.unimi.dsi.law.graph.BFSBig \
         $compr_graph_path-bv $compr_graph_path.order ;
 
     # Permute the graph accordingly
     batch_size=1000000000
     java_cmd it.unimi.dsi.big.webgraph.Transform mapOffline \
         $compr_graph_path-bv $compr_graph_path \
         $compr_graph_path.order $batch_size ;
     java_cmd it.unimi.dsi.big.webgraph.BVGraph \
         --list $compr_graph_path ;
 
     # Compute graph statistics (output: .{indegree,outdegree,stats})
     java_cmd it.unimi.dsi.big.webgraph.Stats $compr_graph_path ;
 
     # Create transposed graph (to allow backward traversal)
     java_cmd it.unimi.dsi.big.webgraph.Transform transposeOffline \
         $compr_graph_path $compr_graph_path-transposed $batch_size ;
     java_cmd it.unimi.dsi.big.webgraph.BVGraph \
         --list $compr_graph_path-transposed ;
 } >> $stdout_file 2>> $stderr_file
 echo "Graph compression done."