diff --git a/api/server/src/test/dataset/.gitignore b/api/server/src/test/dataset/.gitignore
index 1b9b4a7..71ef93f 100644
--- a/api/server/src/test/dataset/.gitignore
+++ b/api/server/src/test/dataset/.gitignore
@@ -1,20 +1,24 @@
 dockerfiles/
 
 # Generated input files
 *.csv.gz
 *.nodes.csv
 
 # Generated WebGraph files
 *.graph
 *.indegree
 *.mph
 *.obl
 *.offsets
 *.order
 *.outdegree
 *.properties
 *.stats
 
 # Generated node ids mapping
 *.nodeToSwhMap.csv
 *.swhToNodeMap.csv
+
+# Logs
+stdout
+stderr
diff --git a/api/server/src/test/dataset/generate_graph.sh b/api/server/src/test/dataset/generate_graph.sh
index cb2f15a..55e85e3 100755
--- a/api/server/src/test/dataset/generate_graph.sh
+++ b/api/server/src/test/dataset/generate_graph.sh
@@ -1,17 +1,20 @@
 #!/bin/bash
 
 # Build Docker work environment
 toplevel_dir=`git rev-parse --show-toplevel`
 mkdir -p dockerfiles
 cp $toplevel_dir/compression/{compress_graph.sh,Dockerfile} dockerfiles/
 docker build --tag swh-graph-test dockerfiles
 
 # Setup input for compression script
 tr ' ' '\n' < graph.edges.csv | sort -u > graph.nodes.csv
 gzip --force --keep graph.edges.csv
 gzip --force --keep graph.nodes.csv
 
+# Setup output
+rm -f stderr stdout
+
 docker run \
     --name swh-graph-test --rm --tty --interactive \
     --volume $(pwd):/data swh-graph-test:latest \
     ./compress_graph.sh --input /data/graph --output /data/ --lib /graph-lib/
diff --git a/compression/compress_graph.sh b/compression/compress_graph.sh
index ca0d6a0..8ac2756 100755
--- a/compression/compress_graph.sh
+++ b/compression/compress_graph.sh
@@ -1,72 +1,86 @@
 #!/bin/bash
 
 usage() {
     echo "Usage: --input <graph path> --output <out dir> --lib <lib dir>"
     echo "  options:"
     echo "    -t, --tmp <tmp dir> (default to /tmp/)"
     exit 1
 }
 
 graph_path=""
 out_dir=""
 lib_path=""
 tmp_dir="/tmp/"
 while (( "$#" )); do
     case "$1" in
         -i|--input) shift; graph_path=$1;;
         -o|--output) shift; out_dir=$1;;
         -l|--lib) shift; lib_path=$1;;
         -t|--tmp) shift; tmp_dir=$1;;
         *) usage;;
     esac
     shift
 done
 
 if [[ -z $graph_path || -z $out_dir || -z $lib_path ]]; then
     usage
 fi
 
 dataset=$(basename $graph_path)
 compr_graph_path="$out_dir/$dataset"
+stdout_file="$out_dir/stdout"
+stderr_file="$out_dir/stderr"
 
 mkdir -p $out_dir
 mkdir -p $tmp_dir
+if [[ -f "$stdout_file" || -f "$stderr_file" ]]; then
+    echo "Cannot overwrite compression stdout/stderr files"
+    exit 1
+fi
 
 java_cmd () {
-    /usr/bin/time -v java \
-        -Xmx1024G -server -XX:PretenureSizeThreshold=512M -XX:MaxNewSize=4G \
-        -XX:+UseLargePages -XX:+UseTransparentHugePages -XX:+UseNUMA \
-        -XX:+UseTLAB -XX:+ResizeTLAB \
+    /usr/bin/time -v java \
+        -server -Xmx1024G -XX:PretenureSizeThreshold=512M \
+        -XX:MaxNewSize=4G -XX:+UseLargePages -XX:+UseNUMA \
+        -XX:+UseTransparentHugePages -XX:+UseTLAB -XX:+ResizeTLAB \
         -cp $lib_path/'*' $*
 }
 
-# Build a function (MPH) that maps node names to node numbers in lexicographic
-# order (output: .mph)
-java_cmd it.unimi.dsi.sux4j.mph.GOVMinimalPerfectHashFunction \
-    --zipped $compr_graph_path.mph --temp-dir $tmp_dir \
-    $graph_path.nodes.csv.gz
+{
+    # Build a function (MPH) that maps node names to node numbers in
+    # lexicographic order (output: .mph)
+    java_cmd it.unimi.dsi.sux4j.mph.GOVMinimalPerfectHashFunction \
+        --zipped $compr_graph_path.mph --temp-dir $tmp_dir \
+        $graph_path.nodes.csv.gz ;
+
+    # Build the graph in BVGraph format (output: .{graph,offsets,properties})
+    java_cmd it.unimi.dsi.big.webgraph.ScatteredArcsASCIIGraph \
+        --function $compr_graph_path.mph --temp-dir $tmp_dir \
+        --zipped $compr_graph_path-bv < $graph_path.edges.csv.gz ;
+    # Build the offset big-list file to load the graph faster (output: .obl)
+    java_cmd it.unimi.dsi.big.webgraph.BVGraph \
+        --list $compr_graph_path-bv ;
 
-# Build the graph in BVGraph format (output: .{graph,offsets,properties})
-java_cmd it.unimi.dsi.big.webgraph.ScatteredArcsASCIIGraph \
-    --function $compr_graph_path.mph --temp-dir $tmp_dir \
-    --zipped $compr_graph_path-bv < $graph_path.edges.csv.gz
-# Build the offset big-list file to load the graph faster (output: .obl)
-java_cmd it.unimi.dsi.big.webgraph.BVGraph --list $compr_graph_path-bv
+    # Find a better permutation using a BFS traversal order (output: .order)
+    java_cmd it.unimi.dsi.law.graph.BFSBig \
+        $compr_graph_path-bv $compr_graph_path.order ;
 
-# Find a better permutation using a BFS traversal order (output: .order)
-java_cmd it.unimi.dsi.law.graph.BFSBig \
-    $compr_graph_path-bv $compr_graph_path.order
+    # Permute the graph accordingly
+    batch_size=1000000000
+    java_cmd it.unimi.dsi.big.webgraph.Transform mapOffline \
+        $compr_graph_path-bv $compr_graph_path \
+        $compr_graph_path.order $batch_size ;
+    java_cmd it.unimi.dsi.big.webgraph.BVGraph \
+        --list $compr_graph_path ;
 
-# Permute the graph accordingly
-batch_size=1000000000
-java_cmd it.unimi.dsi.big.webgraph.Transform mapOffline \
-    $compr_graph_path-bv $compr_graph_path $compr_graph_path.order $batch_size
-java_cmd it.unimi.dsi.big.webgraph.BVGraph --list $compr_graph_path
+    # Compute graph statistics (output: .{indegree,outdegree,stats})
+    java_cmd it.unimi.dsi.big.webgraph.Stats $compr_graph_path ;
 
-# Compute graph statistics (output: .{indegree,outdegree,stats})
-java_cmd it.unimi.dsi.big.webgraph.Stats $compr_graph_path
+    # Create transposed graph (to allow backward traversal)
+    java_cmd it.unimi.dsi.big.webgraph.Transform transposeOffline \
+        $compr_graph_path $compr_graph_path-transposed $batch_size ;
+    java_cmd it.unimi.dsi.big.webgraph.BVGraph \
+        --list $compr_graph_path-transposed ;
+} >> $stdout_file 2>> $stderr_file
 
-# Create transposed graph (to allow backward traversal)
-java_cmd it.unimi.dsi.big.webgraph.Transform transposeOffline \
-    $compr_graph_path $compr_graph_path-transposed $batch_size
-java_cmd it.unimi.dsi.big.webgraph.BVGraph --list $compr_graph_path-transposed
+echo "Graph compression done."
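
Note: the core change in compress_graph.sh above is the brace-grouped command block whose
combined output is appended to the per-run log files. A minimal standalone sketch of that
pattern, assuming placeholder log names and commands that are not part of the patch:

    #!/bin/bash
    stdout_file="compress.stdout"   # placeholder log paths, not the patched script's
    stderr_file="compress.stderr"

    # Refuse to clobber logs from a previous run, as the patched script does.
    if [[ -f "$stdout_file" || -f "$stderr_file" ]]; then
        echo "Cannot overwrite stdout/stderr files"
        exit 1
    fi

    {
        # Every command inside the braces shares the same redirections,
        # so no per-step '>> file' clauses are needed.
        echo "step 1" ;
        echo "step 2" ;
    } >> "$stdout_file" 2>> "$stderr_file"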