diff --git a/compression/Dockerfile b/compression/Dockerfile index 0c26403..8637d9a 100644 --- a/compression/Dockerfile +++ b/compression/Dockerfile @@ -1,23 +1,18 @@ FROM maven:3.6.0-jdk-11 WORKDIR /app # Download webgraph binary RUN curl -O http://webgraph.di.unimi.it/webgraph-big-3.5.0-bin.tar.gz RUN tar xvfz webgraph-big-3.5.0-bin.tar.gz RUN cp webgraph-big-3.5.0/webgraph-big-3.5.0.jar . # Download webgraph dependencies RUN curl -O http://webgraph.di.unimi.it/webgraph-big-deps.tar.gz RUN tar xvfz webgraph-big-deps.tar.gz -# Download LAW (for LLP ordering) -RUN curl -O http://law.di.unimi.it/software/download/law-2.5.1-bin.tar.gz -RUN tar xvfz law-2.5.1-bin.tar.gz -RUN cp law-2.5.1/law-2.5.1.jar . - # Monitoring RUN apt-get update RUN apt-get install -y time WORKDIR /graph COPY compress_graph.sh . diff --git a/compression/compress_graph.sh b/compression/compress_graph.sh index 6a88d81..16326b3 100755 --- a/compression/compress_graph.sh +++ b/compression/compress_graph.sh @@ -1,89 +1,72 @@ #!/bin/bash usage() { echo "Usage: --input --output --lib " echo " options:" echo " -t, --tmp (default to /tmp/)" exit 1 } graph_path="" out_dir="" lib_path="" tmp_dir="/tmp/" while (( "$#" )); do case "$1" in -i|--input) shift; graph_path=$1;; -o|--output) shift; out_dir=$1;; -l|--lib) shift; lib_path=$1;; -t|--tmp) shift; tmp_dir=$1;; *) usage;; esac shift done if [[ -z $graph_path || -z $out_dir || -z $lib_path ]]; then usage fi dataset=$(basename $graph_path) compr_graph_path="$out_dir/$dataset" mkdir -p $out_dir mkdir -p $tmp_dir java_cmd () { /usr/bin/time -v java \ -Xmx1024G -server -XX:PretenureSizeThreshold=512M -XX:MaxNewSize=4G \ -XX:+UseLargePages -XX:+UseTransparentHugePages -XX:+UseNUMA \ -XX:+UseTLAB -XX:+ResizeTLAB \ -cp /app/'*' $* } -llp_ordering () { - # Create a symmetrized version of the graph - # (output: .{graph,offsets,properties}) - java_cmd it.unimi.dsi.big.webgraph.Transform symmetrizeOffline \ - $compr_graph_path-bv $compr_graph_path-bv-sym - java_cmd it.unimi.dsi.big.webgraph.BVGraph --list $compr_graph_path-bv-sym - - # Find a better permutation through Layered LPA - # WARNING: no 64-bit version of LLP - java_cmd it.unimi.dsi.law.graph.LayeredLabelPropagation \ - --longs $compr_graph_path-bv-sym $compr_graph_path.order -} - -bfs_ordering () { - java_cmd it.unimi.dsi.law.graph.BFSBig \ - $compr_graph_path-bv $compr_graph_path.order -} - # Build a function (MPH) that maps node names to node numbers in lexicographic # order (output: .mph) java_cmd it.unimi.dsi.sux4j.mph.GOVMinimalPerfectHashFunction \ --zipped $compr_graph_path.mph --temp-dir $tmp_dir \ $graph_path.nodes.csv.gz # Build the graph in BVGraph format (output: .{graph,offsets,properties}) java_cmd it.unimi.dsi.big.webgraph.ScatteredArcsASCIIGraph \ --function $compr_graph_path.mph --temp-dir $tmp_dir \ --zipped $compr_graph_path-bv < $graph_path.edges.csv.gz # Build the offset big-list file to load the graph faster (output: .obl) java_cmd it.unimi.dsi.big.webgraph.BVGraph --list $compr_graph_path-bv -# Find a better permutation -bfs_ordering +# Find a better permutation using a BFS traversal order (output: .order) +java_cmd it.unimi.dsi.law.graph.BFSBig \ + $compr_graph_path-bv $compr_graph_path.order # Permute the graph accordingly batch_size=1000000000 java_cmd it.unimi.dsi.big.webgraph.Transform mapOffline \ $compr_graph_path-bv $compr_graph_path $compr_graph_path.order $batch_size java_cmd it.unimi.dsi.big.webgraph.BVGraph --list $compr_graph_path # Compute graph statistics (output: .{indegree,outdegree,stats}) java_cmd it.unimi.dsi.big.webgraph.Stats $compr_graph_path # Create transposed graph (to allow backward traversal) java_cmd it.unimi.dsi.big.webgraph.Transform transposeOffline \ $compr_graph_path $compr_graph_path-transposed $batch_size java_cmd it.unimi.dsi.big.webgraph.BVGraph --list $compr_graph_path-transposed