diff --git a/compression/Dockerfile b/compression/Dockerfile new file mode 100644 index 0000000..8a9040f --- /dev/null +++ b/compression/Dockerfile @@ -0,0 +1,19 @@ +FROM maven:3.6.0-jdk-11 +WORKDIR /app + +# Download webgraph binary +RUN curl -O http://webgraph.di.unimi.it/webgraph-3.6.1-bin.tar.gz +RUN tar xvfz webgraph-3.6.1-bin.tar.gz +RUN cp webgraph-3.6.1/webgraph-3.6.1.jar . + +# Download webgraph dependencies +RUN curl -O http://webgraph.di.unimi.it/webgraph-deps.tar.gz +RUN tar xvfz webgraph-deps.tar.gz + +# Download LAW (for LLP ordering) +RUN curl -O http://law.di.unimi.it/software/download/law-2.5-bin.tar.gz +RUN tar xvfz law-2.5-bin.tar.gz +RUN cp law-2.5/law-2.5.jar . + +WORKDIR /graph +COPY compress_graph.sh . diff --git a/compression/compress_graph.sh b/compression/compress_graph.sh new file mode 100755 index 0000000..ac1e42b --- /dev/null +++ b/compression/compress_graph.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +if [ "$#" -ne 2 ]; then + echo "Expected two arguments: " + exit -1 +fi + +INPUT_GRAPH=$1 +OUTPUT_DIR=$2 +DATASET=$(basename $INPUT_GRAPH) +COMPR_GRAPH="$OUTPUT_DIR/$DATASET" + +java_cmd () { + /usr/bin/time -v java -Xmx1024G -cp /app/'*' $* +} + +llp_ordering () { + # Create a symmetrized version of the graph + # (output: .{graph,offsets,properties}) + java_cmd it.unimi.dsi.webgraph.Transform symmetrizeOffline \ + $COMPR_GRAPH-bv $COMPR_GRAPH-bv-sym + java_cmd it.unimi.dsi.webgraph.BVGraph --list $COMPR_GRAPH-bv-sym + + # Find a better permutation through Layered LPA + java_cmd it.unimi.dsi.law.graph.LayeredLabelPropagation \ + $COMPR_GRAPH-bv-sym $COMPR_GRAPH.order +} + +bfs_ordering () { + java_cmd it.unimi.dsi.law.graph.BFS $COMPR_GRAPH-bv $COMPR_GRAPH.order +} + +mkdir -p $OUTPUT_DIR + +# Build a function (MPH) that maps node names to node numbers in lexicographic +# order (output: .mph) +java_cmd it.unimi.dsi.sux4j.mph.GOVMinimalPerfectHashFunction \ + --zipped $COMPR_GRAPH.mph $INPUT_GRAPH.nodes.csv.gz + +# Build the graph in BVGraph format (output: .{graph,offsets,properties}) +java_cmd it.unimi.dsi.webgraph.ScatteredArcsASCIIGraph \ + --function $COMPR_GRAPH.mph \ + --zipped $COMPR_GRAPH-bv < $INPUT_GRAPH.edges.csv.gz +# Build the offset big-list file to load the graph faster (output: .obl) +java_cmd it.unimi.dsi.webgraph.BVGraph --list $COMPR_GRAPH-bv + +# Find a better permutation +bfs_ordering + +# Permute the graph accordingly +java_cmd it.unimi.dsi.webgraph.Transform mapOffline \ + $COMPR_GRAPH-bv $COMPR_GRAPH $COMPR_GRAPH.order +java_cmd it.unimi.dsi.webgraph.BVGraph --list $COMPR_GRAPH + +# Compute graph statistics (output: .{indegree,outdegree,stats}) +java_cmd it.unimi.dsi.webgraph.Stats $COMPR_GRAPH