diff --git a/dockerfiles/Dockerfile b/dockerfiles/Dockerfile
index 698ad34..01e5055 100644
--- a/dockerfiles/Dockerfile
+++ b/dockerfiles/Dockerfile
@@ -1,31 +1,34 @@
 FROM openjdk:12
 
 # Java global config
 ARG MAX_RAM=2800G
 ENV JAVA_TOOL_OPTIONS \
     -Xmx${MAX_RAM} -XX:PretenureSizeThreshold=512M -XX:MaxNewSize=4G \
     -XX:+UseLargePages -XX:+UseTransparentHugePages -XX:+UseNUMA \
     -XX:+UseTLAB -XX:+ResizeTLAB \
     -Dlogback.configurationFile=configuration/logback.xml
 
+# Monitoring: the "time" package supplies /usr/bin/time, which scripts/compress_graph.sh wraps around each java step
+RUN yum install -y time && yum clean all
+
 # Download third party binaries and dependencies
 WORKDIR /srv/softwareheritage/graph/lib
 
 RUN curl -O http://webgraph.di.unimi.it/webgraph-big-3.5.1-bin.tar.gz
 RUN tar xvfz webgraph-big-3.5.1-bin.tar.gz
 RUN cp webgraph-big-3.5.1/webgraph-big-3.5.1.jar .
 
 RUN curl -O http://webgraph.di.unimi.it/webgraph-big-deps.tar.gz
 RUN tar xvfz webgraph-big-deps.tar.gz
 
 RUN curl -O http://law.di.unimi.it/software/download/law-2.5.2-bin.tar.gz
 RUN tar xvfz law-2.5.2-bin.tar.gz
 RUN cp law-2.5.2/law-2.5.2.jar .
 
 # Add user files
 WORKDIR /srv/softwareheritage/graph/app
 COPY configuration configuration/
 COPY scripts scripts/
 
 # Default dir
 WORKDIR /srv/softwareheritage/graph
diff --git a/dockerfiles/scripts/compress_graph.sh b/dockerfiles/scripts/compress_graph.sh
index edafadb..4db32c6 100755
--- a/dockerfiles/scripts/compress_graph.sh
+++ b/dockerfiles/scripts/compress_graph.sh
@@ -1,102 +1,103 @@
 #!/bin/bash
 
 usage() {
     echo "Usage: compress_graph.sh --lib --input "
     echo "Options:"
     echo " -o, --outdir (Default: GRAPH_DIR/compressed)"
     echo " -t, --tmp (Default: OUT_DIR/tmp)"
     echo " --stdout (Default: OUT_DIR/stdout)"
     echo " --stderr (Default: OUT_DIR/stderr)"
     echo " --batch-size (Default: 10^6): WebGraph internals"
     exit 1
 }
 
 graph_path=""
 out_dir=""
 lib_dir=""
 stdout_file=""
 stderr_file=""
 batch_size=1000000
 while (( "$#" )); do
     case "$1" in
         -i|--input) shift; graph_path=$1 ;;
         -o|--outdir) shift; out_dir=$1 ;;
         -l|--lib) shift; lib_dir=$1 ;;
         -t|--tmp) shift; tmp_dir=$1 ;;
         --stdout) shift; stdout_file=$1 ;;
         --stderr) shift; stderr_file=$1 ;;
         --batch-size) shift; batch_size=$1 ;;
         *) usage ;;
     esac
     shift
 done
 
 if [[ -z "$graph_path" || ! -d "$lib_dir" ]]; then
     usage
 fi
 if [ -z "$out_dir" ] ; then
     out_dir="$(dirname $graph_path)/compressed"
 fi
 if [ -z "$tmp_dir" ] ; then
     tmp_dir="${out_dir}/tmp"
 fi
 if [ -z "$stdout_file" ] ; then
     stdout_file="${out_dir}/stdout"
 fi
 if [ -z "$stderr_file" ] ; then
     stderr_file="${out_dir}/stderr"
 fi
 
 dataset=$(basename $graph_path)
 compr_graph_path="${out_dir}/${dataset}"
 
 test -d "$out_dir" || mkdir -p "$out_dir"
 test -d "$tmp_dir" || mkdir -p "$tmp_dir"
 
 java_cmd () {
-    java -cp $lib_dir/'*' $*
+    # Run each step under /usr/bin/time -v; "$@" forwards the arguments without a second word-splitting/globbing pass
+    /usr/bin/time -v java -cp "$lib_dir"/'*' "$@"
 }
 
 {
     # Build a function (MPH) that maps node names to node numbers in
     # lexicographic order (output: .mph)
     java_cmd it.unimi.dsi.sux4j.mph.GOVMinimalPerfectHashFunction \
         --zipped $compr_graph_path.mph --temp-dir $tmp_dir \
         $graph_path.nodes.csv.gz &&
 
     # Build the graph in BVGraph format (output: .{graph,offsets,properties})
     java_cmd it.unimi.dsi.big.webgraph.ScatteredArcsASCIIGraph \
         --function $compr_graph_path.mph --temp-dir $tmp_dir \
         --zipped $compr_graph_path-bv < $graph_path.edges.csv.gz &&
     # Build the offset big-list file to load the graph faster (output: .obl)
     java_cmd it.unimi.dsi.big.webgraph.BVGraph \
         --list $compr_graph_path-bv &&
 
     # Find a better permutation using a BFS traversal order (output: .order)
     java_cmd it.unimi.dsi.law.big.graph.BFS \
         $compr_graph_path-bv $compr_graph_path.order &&
 
     # Permute the graph accordingly
     java_cmd it.unimi.dsi.big.webgraph.Transform mapOffline \
         $compr_graph_path-bv $compr_graph_path \
         $compr_graph_path.order $batch_size $tmp_dir &&
     java_cmd it.unimi.dsi.big.webgraph.BVGraph \
         --list $compr_graph_path &&
 
     # Compute graph statistics (output: .{indegree,outdegree,stats})
     java_cmd it.unimi.dsi.big.webgraph.Stats $compr_graph_path &&
 
     # Create transposed graph (to allow backward traversal)
     java_cmd it.unimi.dsi.big.webgraph.Transform transposeOffline \
         $compr_graph_path $compr_graph_path-transposed \
         $batch_size $tmp_dir &&
     java_cmd it.unimi.dsi.big.webgraph.BVGraph \
         --list $compr_graph_path-transposed
 } > $stdout_file 2> $stderr_file
 
 if [[ $? -eq 0 ]]; then
     echo "Graph compression done."
 else
     echo "Graph compression failed: see $stderr_file for more info."
     exit 1
 fi