diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000..09a6f12 --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,13 @@ +FROM openjdk:11-slim-buster + +# Install swh.graph (both Python and Java parts) +RUN apt-get update && \ + apt-get install --no-install-recommends --yes \ + curl time \ + gcc pkg-config libsystemd-dev python3-dev \ + python3-pip python3-setuptools && \ + rm -rf /var/lib/apt/lists/* && \ + pip3 install swh.graph + +# Default dir +WORKDIR /srv/softwareheritage/graph diff --git a/dockerfiles/Dockerfile b/dockerfiles/Dockerfile deleted file mode 100644 index a71d493..0000000 --- a/dockerfiles/Dockerfile +++ /dev/null @@ -1,31 +0,0 @@ -FROM openjdk:13-slim-buster - -# Java global config -ARG MAX_RAM=2800G -ENV JAVA_TOOL_OPTIONS \ - -Xmx${MAX_RAM} -XX:PretenureSizeThreshold=512M -XX:MaxNewSize=4G \ - -XX:+UseLargePages -XX:+UseTransparentHugePages -XX:+UseNUMA \ - -XX:+UseTLAB -XX:+ResizeTLAB \ - -Dlogback.configurationFile=app/configuration/logback.xml - -# Install swh.graph (both Python and Java parts) -RUN apt-get update && \ - apt-get install --no-install-recommends --yes \ - curl time \ - gcc pkg-config libsystemd-dev python3-dev \ - python3-pip python3-setuptools && \ - rm -rf /var/lib/apt/lists/* && \ - pip3 install swh.graph - -# Install 3rd party dependencies (not shipped with swh.graph) -WORKDIR /srv/softwareheritage/graph/lib - -RUN ln /usr/local/share/swh-graph/*.jar . - -# Add user files -WORKDIR /srv/softwareheritage/graph/app -COPY configuration configuration/ -COPY scripts scripts/ - -# Default dir -WORKDIR /srv/softwareheritage/graph diff --git a/dockerfiles/configuration/logback.xml b/dockerfiles/configuration/logback.xml deleted file mode 100644 index 76a4c7a..0000000 --- a/dockerfiles/configuration/logback.xml +++ /dev/null @@ -1,11 +0,0 @@ - - - - %d %r %p [%t] %logger{1} - %m%n - - - - - - - diff --git a/dockerfiles/scripts/compress_graph.sh b/dockerfiles/scripts/compress_graph.sh deleted file mode 100755 index 1194689..0000000 --- a/dockerfiles/scripts/compress_graph.sh +++ /dev/null @@ -1,112 +0,0 @@ -#!/bin/bash - -usage() { - echo "Usage: compress_graph.sh --lib --input " - echo "Options:" - echo " -o, --outdir (Default: GRAPH_DIR/compressed)" - echo " -t, --tmp (Default: OUT_DIR/tmp)" - echo " --stdout (Default: OUT_DIR/stdout)" - echo " --stderr (Default: OUT_DIR/stderr)" - echo " --batch-size (Default: 10^6): WebGraph internals" - exit 1 -} - -graph_path="" -out_dir="" -lib_dir="" -stdout_file="" -stderr_file="" -batch_size=1000000 -while (( "$#" )); do - case "$1" in - -i|--input) shift; graph_path=$1 ;; - -o|--outdir) shift; out_dir=$1 ;; - -l|--lib) shift; lib_dir=$1 ;; - -t|--tmp) shift; tmp_dir=$1 ;; - --stdout) shift; stdout_file=$1 ;; - --stderr) shift; stderr_file=$1 ;; - --batch-size) shift; batch_size=$1 ;; - *) usage ;; - esac - shift -done - -if [[ -z "$graph_path" || ! -d "$lib_dir" ]]; then - usage -fi -if [ -z "$out_dir" ] ; then - out_dir="$(dirname $graph_path)/compressed" -fi -if [ -z "$tmp_dir" ] ; then - tmp_dir="${out_dir}/tmp" -fi -if [ -z "$stdout_file" ] ; then - stdout_file="${out_dir}/stdout" -fi -if [ -z "$stderr_file" ] ; then - stderr_file="${out_dir}/stderr" -fi - -dataset=$(basename $graph_path) -compr_graph_path="${out_dir}/${dataset}" - -test -d "$out_dir" || mkdir -p "$out_dir" -test -d "$tmp_dir" || mkdir -p "$tmp_dir" - -step_info() { - echo -e "\n* swh-graph: $1 step... 
($2)\n" -} - -java_cmd () { - /usr/bin/time -v java -cp $lib_dir/'*' $* -} - -{ - # Build a function (MPH) that maps node names to node numbers in - # lexicographic order (output: .mph) - step_info "MPH" "1/6" && - java_cmd it.unimi.dsi.sux4j.mph.GOVMinimalPerfectHashFunction \ - --zipped $compr_graph_path.mph --temp-dir $tmp_dir \ - $graph_path.nodes.csv.gz && - - # Build the graph in BVGraph format (output: .{graph,offsets,properties}) - step_info "BV compress" "2/6" && - java_cmd it.unimi.dsi.big.webgraph.ScatteredArcsASCIIGraph \ - --function $compr_graph_path.mph --temp-dir $tmp_dir \ - --zipped $compr_graph_path-bv < $graph_path.edges.csv.gz && - # Build the offset big-list file to load the graph faster (output: .obl) - java_cmd it.unimi.dsi.big.webgraph.BVGraph \ - --list $compr_graph_path-bv && - - # Find a better permutation using a BFS traversal order (output: .order) - step_info "BFS" "3/6" && - java_cmd it.unimi.dsi.law.big.graph.BFS \ - $compr_graph_path-bv $compr_graph_path.order && - - # Permute the graph accordingly - step_info "Permute" "4/6" && - java_cmd it.unimi.dsi.big.webgraph.Transform mapOffline \ - $compr_graph_path-bv $compr_graph_path \ - $compr_graph_path.order $batch_size $tmp_dir && - java_cmd it.unimi.dsi.big.webgraph.BVGraph \ - --list $compr_graph_path && - - # Compute graph statistics (output: .{indegree,outdegree,stats}) - step_info "Stats" "5/6" && - java_cmd it.unimi.dsi.big.webgraph.Stats $compr_graph_path && - - # Create transposed graph (to allow backward traversal) - step_info "Transpose" "6/6" && - java_cmd it.unimi.dsi.big.webgraph.Transform transposeOffline \ - $compr_graph_path $compr_graph_path-transposed \ - $batch_size $tmp_dir && - java_cmd it.unimi.dsi.big.webgraph.BVGraph \ - --list $compr_graph_path-transposed -} > $stdout_file 2> $stderr_file - -if [[ $? -eq 0 ]]; then - echo "Graph compression done." -else - echo "Graph compression failed: see $stderr_file for more info." - exit 1 -fi diff --git a/docs/docker.rst b/docs/docker.rst index c0d6ba5..62434c2 100644 --- a/docs/docker.rst +++ b/docs/docker.rst @@ -1,79 +1,58 @@ Docker environment ================== Build ----- .. code:: bash $ git clone https://forge.softwareheritage.org/source/swh-graph.git $ cd swh-graph - $ docker build --tag swh-graph dockerfiles + $ docker build --tag swh-graph docker/ Run --- Given a graph ``g`` specified by: -- ``g.edges.csv.gz``: gzip-compressed csv file with one edge per line, as a +- ``g.edges.csv.zst``: zstd-compressed CSV file with one edge per line, as a "SRC_ID SPACE DST_ID" string, where identifiers are the :ref:`persistent-identifiers` of each node. -- ``g.nodes.csv.gz``: sorted list of unique node identifiers appearing in the - corresponding ``g.edges.csv.gz`` file. The format is a gzip-compressed csv - file with one persistent identifier per line. +- ``g.nodes.csv.zst``: sorted list of unique node identifiers appearing in the + corresponding ``g.edges.csv.zst`` file. The format is a zst-compressed CSV + file (single column) with one persistent identifier per line. .. code:: bash $ docker run -ti \ --volume /PATH/TO/GRAPH/:/srv/softwareheritage/graph/data \ --publish 127.0.0.1:5009:5009 \ swh-graph:latest \ bash -Where ``/PATH/TO/GRAPH`` is a directory containing the ``g.edges.csv.gz`` and -``g.nodes.csv.gz`` files. By default, when entering the container the current +Where ``/PATH/TO/GRAPH`` is a directory containing the ``g.edges.csv.zst`` and +``g.nodes.csv.zst`` files. 
By default, when entering the container the current working directory will be ``/srv/softwareheritage/graph``; all relative paths found below are intended to be relative to that dir. Graph compression ~~~~~~~~~~~~~~~~~ To compress the graph: .. code:: bash - $ app/scripts/compress_graph.sh --lib lib/ --input data/g - -Warning: very large graphs may need a bigger batch size parameter for WebGraph -internals (you can specify a value when running the compression script using: -``--batch-size 1000000000``). - - -Node identifier mappings -~~~~~~~~~~~~~~~~~~~~~~~~ - -To dump the mapping files (i.e., various node id <-> other info mapping files, -in either ``.csv.gz`` or ad-hoc ``.map`` format): - -.. code:: bash - - $ java -cp lib/swh-graph-*.jar \ - org.softwareheritage.graph.backend.MapBuilder \ - data/g.nodes.csv.gz data/compressed/g + $ swh graph compress --graph data/g --outdir data/compressed Graph server ~~~~~~~~~~~~ To start the swh-graph server: .. code:: bash - $ java -cp lib/swh-graph-*.jar \ - org.softwareheritage.graph.App data/compressed/g - -To specify the port on which the server will run, use the `--port` or `-p` flag -(default is 5009). + $ swh graph rpc-serve --graph data/compressed/g diff --git a/reports/experiments/experiments.tex b/reports/experiments/experiments.tex index fcf1131..fce997e 100644 --- a/reports/experiments/experiments.tex +++ b/reports/experiments/experiments.tex @@ -1,233 +1,233 @@ \documentclass[11pt,a4paper]{article} \usepackage[english]{babel} \usepackage{a4wide} \usepackage{booktabs} \usepackage{minted} \usepackage{siunitx} \usepackage[colorlinks,urlcolor=blue,linkcolor=magenta,citecolor=red,linktocpage=true]{hyperref} \title{Google Summer of Code 2019} \author{Thibault Allançon} \date{8 April 2019} \begin{document} \maketitle Early experiments running WebGraph framework on the Software Heritage datasets. \section{Environment} Docker environment and compression script can be found here: -\url{https://forge.softwareheritage.org/source/swh-graph/browse/master/dockerfiles/}. +\url{https://forge.softwareheritage.org/source/swh-graph/browse/master/docker/}. \section{Datasets analysis} \begin{center} \begin{tabular}{@{} l *4r @{}} \toprule \multicolumn{1}{c}{} & \textbf{\mintinline{text}{.nodes.csv.gz}} & \textbf{\mintinline{text}{.edges.csv.gz}} & \textbf{\# of nodes} & \textbf{\# of edges} \\ \midrule \texttt{rel\_to\_obj} & 344M & 382M & \num{16222788} & \num{9907464} \\ \texttt{ori\_to\_snp} & 1.3G & 3.7G & \num{112564374} & \num{194970670} \\ \texttt{dir\_to\_rev} & 745M & 12G & \num{35399184} & \num{481829426} \\ \texttt{snp\_to\_obj} & 3.5G & 21G & \num{170999796} & \num{831089515} \\ \texttt{rev\_to\_rev} & 22G & 33G & \num{1117498391} & \num{1165813689} \\ \texttt{rev\_to\_dir} & 41G & 48G & \num{2047888941} & \num{1125083793} \\ \texttt{dir\_to\_dir} & 95G & 1.3T & \num{4805057515} & \num{48341950415} \\ \texttt{dir\_to\_cnt} & 180G & 3T & \num{9231457233} & \num{112363058067} \\ \midrule Entire graph (\texttt{all}) & 340G & 4.5T & \num{11595403407} & \num{164513703039} \\ \bottomrule \end{tabular} \end{center} \section{Individual datasets compression} The first experiments were done on individual datasets. 
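As a quick cross-check of the datasets table above: the per-dataset edge counts sum
exactly to the full-graph total, since every edge belongs to exactly one dataset,
\[
  \num{9907464} + \num{194970670} + \dots + \num{112363058067}
  = \num{164513703039},
\]
whereas the node counts overlap across datasets (e.g.\ the same directory node is
counted in \texttt{dir\_to\_rev}, \texttt{dir\_to\_dir} and \texttt{dir\_to\_cnt}),
which is why they add up to more than the \num{11595403407} nodes of the entire
graph.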
\subsection{Results}

Datasets were compressed on different VMs (depending on availability):

\begin{itemize}
    \item \textit{(sexus)} 1TB of RAM and 40vCPU: \mintinline{text}{dir_to_dir}
    \item \textit{(monster)} 700GB of RAM and 72vCPU: \mintinline{text}{dir_to_cnt}
    \item \textit{(chub)} 2TB of RAM and 128vCPU: all the other datasets
\end{itemize}

Note: the results may vary because random permutations are used in the graph
compression process.

\begin{center}
\begin{tabular}{@{} l *3r @{}}
    \toprule
    \multicolumn{1}{c}{} &
    \textbf{compr ratio} &
    \textbf{bit/edge} &
    \textbf{compr size\footnotemark} \\
    \midrule
    \texttt{rel\_to\_obj} & 0.367 & 9.573 & 23M \\
    \texttt{ori\_to\_snp} & 0.291 & 8.384 & 140M \\
    \texttt{dir\_to\_rev} & 0.07 & 1.595 & 120M \\
    \texttt{snp\_to\_obj} & 0.067 & 1.798 & 253M \\
    \texttt{rev\_to\_rev} & 0.288 & 9.063 & 2.2G \\
    \texttt{rev\_to\_dir} & 0.291 & 9.668 & 2.6G \\
    \texttt{dir\_to\_dir} & 0.336 & 10.178 & 61G \\
    \texttt{dir\_to\_cnt} & 0.228 & 7.054 & 97G \\
    \midrule
    Entire graph (estimated) & & & 163G \\
    \bottomrule
\end{tabular}
\end{center}

\footnotetext{calculated as: size of \mintinline{bash}{*.graph} + size of
  \mintinline{bash}{*.offsets}}

\subsection{Timings}

\begin{center}
\begin{tabular}{@{} l *6r @{}}
    \toprule
    \multicolumn{1}{c}{} &
    \textbf{MPH} &
    \textbf{BV Compress} &
    \textbf{Symmetrized} &
    \textbf{LLP} &
    \textbf{Permute} &
    \textbf{Total} \\
    \midrule
    \texttt{rel\_to\_obj} & 14s & 25s & 18s & 8min & 10s & \textbf{9min} \\
    \texttt{ori\_to\_snp} & 1min & 5min & 3min & 1h30 & 1min & \textbf{1h40} \\
    \texttt{dir\_to\_rev} & 56s & 22min & 6min & 41min & 2min & \textbf{1h13} \\
    \texttt{snp\_to\_obj} & 3min & 22min & 8min & 2h50 & 5min & \textbf{3h30} \\
    \texttt{rev\_to\_rev} & 11min & 56min & 24min & 31h52 & 20min & \textbf{33h42} \\
    \texttt{rev\_to\_dir} & 20min & 1h & 30min & 52h45 & 23min & \textbf{55h} \\
    \bottomrule
\end{tabular}
\end{center}

\vspace{0.5cm}

For the \mintinline{text}{dir_to_*} datasets we decided not to use the LLP
algorithm because it would take too long, and instead used a BFS traversal
order for the node re-ordering. This allows \textbf{much} faster computation
and yields similar results (thanks to our graph topology).

\vspace{0.5cm}

\begin{center}
\begin{tabular}{@{} l *5r @{}}
    \toprule
    \multicolumn{1}{c}{} &
    \textbf{MPH} &
    \textbf{BV Compress} &
    \textbf{BFS} &
    \textbf{Permute} &
    \textbf{Total} \\
    \midrule
    \texttt{dir\_to\_dir} & 4h36 & 50h & 4h44 & 12h38 & \textbf{72h} \\
    \texttt{dir\_to\_cnt} & 3h07 & 101h & 17h18 & 20h38 & \textbf{142h} \\
    \bottomrule
\end{tabular}
\end{center}

\subsection{Memory usage}

Memory usage monitoring during the compression process:

\begin{center}
\begin{tabular}{@{} l c @{}}
    \toprule
    \multicolumn{1}{c}{} &
    \textbf{Maximum resident set size} \\
    \midrule
    \texttt{rel\_to\_obj} & 11G \\
    \texttt{ori\_to\_snp} & 15G \\
    \texttt{dir\_to\_rev} & 22G \\
    \texttt{snp\_to\_obj} & 23G \\
    \texttt{rev\_to\_rev} & 86G \\
    \texttt{rev\_to\_dir} & 154G \\
    \texttt{dir\_to\_dir} & 345G \\
    \texttt{dir\_to\_cnt} & 764G \\
    \midrule
    Entire graph (estimated) & 1.4T \\
    \bottomrule
\end{tabular}
\end{center}

\section{Entire graph compression}

After studying feasibility on the individual datasets and estimating the final
results, we assembled the entire graph into a single dataset and launched the
compression process on it.

\subsection{Results}

Two different VMs were used depending on the compression step:

\begin{itemize}
    \item \textit{(monster)} 700GB of RAM and 72vCPU: for the BV compress step.
    \item \textit{(rioc)} 3TB of RAM and 48vCPU: all the other steps.
\end{itemize}

The reason to use monster instead of rioc for the BV compress step was that
the I/O on rioc was too slow for the job to complete within the time limit
allowed on the cluster.

\begin{center}
\begin{tabular}{@{} l *3r @{}}
    \toprule
    \multicolumn{1}{c}{} &
    \textbf{compr ratio} &
    \textbf{bit/edge} &
    \textbf{compr size} \\
    \midrule
    \texttt{all} & 0.158 & 4.913 & 101G \\
    \texttt{all-transposed} & 0.144 & 4.481 & 94G \\
    \bottomrule
\end{tabular}
\end{center}

\subsection{Timings and max memory usage}

\begin{center}
\begin{tabular}{@{} r *2r @{}}
    \toprule
    \multicolumn{1}{c}{} &
    \textbf{Timings} &
    \textbf{Max mem usage} \\
    \midrule
    \texttt{MPH} & 3h30 & 10GB \\
    \texttt{BV Compress} & 103h & 16GB \\
    \texttt{BFS} & 10h & 1057GB \\
    \texttt{Permute} & 25h & 115GB \\
    \texttt{Stats} & 4h & 102GB \\
    \texttt{Transpose} & 22h & 19GB \\
    \midrule
    Total &
    \begin{tabular}{@{}r@{}}\textbf{168h} \\ \scriptsize{(7 days)}\end{tabular} &
    \textbf{1TB} \\
    \bottomrule
\end{tabular}
\end{center}

\end{document}
diff --git a/swh/graph/tests/dataset/.gitignore b/swh/graph/tests/dataset/.gitignore
index cf41cb1..531c841 100644
--- a/swh/graph/tests/dataset/.gitignore
+++ b/swh/graph/tests/dataset/.gitignore
@@ -1,4 +1,5 @@
-dockerfiles/
+docker/
 output/*-bv.*
 output/stderr
 output/stdout
+output/compression.log
diff --git a/swh/graph/tests/dataset/generate_graph.sh b/swh/graph/tests/dataset/generate_graph.sh
index e1a72ee..7b78d36 100755
--- a/swh/graph/tests/dataset/generate_graph.sh
+++ b/swh/graph/tests/dataset/generate_graph.sh
@@ -1,27 +1,23 @@
 #!/bin/bash

 # Clean previous run
-rm -rf dockerfiles output
+rm -rf docker/ output
 mkdir output

 # Build Docker work environment
 toplevel_dir=`git rev-parse --show-toplevel`
-mkdir -p dockerfiles
-cp -r $toplevel_dir/dockerfiles/ .
-docker build --tag swh-graph-test dockerfiles
+mkdir -p docker
+cp -r $toplevel_dir/docker/ .
+docker build --tag swh-graph-test docker

 # Setup input for compression script
 tr ' ' '\n' < example.edges.csv | sort -u > example.nodes.csv
 zstd < example.nodes.csv > example.nodes.csv.zst
 zstd < example.edges.csv > example.edges.csv.zst

-docker run \
-    --user $(id -u):$(id -g) \
+docker run \
+    --user $(id -u):$(id -g) \
     --name swh-graph-test --rm --tty --interactive \
-    --volume $(pwd):/input \
-    --volume $(pwd)/output:/output \
-    swh-graph-test:latest \
-    app/scripts/compress_graph.sh \
-    --lib lib/ \
-    --input /input/example \
-    --outdir /output
+    --volume $(pwd):/input --volume $(pwd)/output:/output \
+    swh-graph-test:latest \
+    swh graph compress --graph /input/example --outdir /output
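The updated test run above leaves its results in ``output/``. Assuming ``swh graph
compress`` produces the same WebGraph artifacts as the removed
``compress_graph.sh`` pipeline (the ``.graph``/``.offsets``/``.properties``/``.obl``
files, the ``.mph`` and ``.order`` files, the degree/stats files, and a
``-transposed`` graph), a minimal sketch of a post-run sanity check could be:

#!/bin/bash
# Hypothetical check of the compression output for the test dataset (a sketch,
# assuming `swh graph compress` emits the same files as the removed
# compress_graph.sh: BVGraph files, MPH, node order, stats, transposed graph).
out_dir=output
base=example

for ext in graph offsets properties obl mph order indegree outdegree stats; do
    if [ ! -f "$out_dir/$base.$ext" ]; then
        echo "missing $out_dir/$base.$ext" >&2
        exit 1
    fi
done

for ext in graph offsets properties; do
    if [ ! -f "$out_dir/$base-transposed.$ext" ]; then
        echo "missing $out_dir/$base-transposed.$ext" >&2
        exit 1
    fi
done

echo "All expected compression artifacts are present."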