Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9123377
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
19 KB
Subscribers
None
View Options
diff --git a/docker/Dockerfile b/docker/Dockerfile
new file mode 100644
index 0000000..09a6f12
--- /dev/null
+++ b/docker/Dockerfile
@@ -0,0 +1,13 @@
+FROM openjdk:11-slim-buster
+
+# Install swh.graph (both Python and Java parts)
+RUN apt-get update && \
+ apt-get install --no-install-recommends --yes \
+ curl time \
+ gcc pkg-config libsystemd-dev python3-dev \
+ python3-pip python3-setuptools && \
+ rm -rf /var/lib/apt/lists/* && \
+ pip3 install swh.graph
+
+# Default dir
+WORKDIR /srv/softwareheritage/graph
diff --git a/dockerfiles/Dockerfile b/dockerfiles/Dockerfile
deleted file mode 100644
index a71d493..0000000
--- a/dockerfiles/Dockerfile
+++ /dev/null
@@ -1,31 +0,0 @@
-FROM openjdk:13-slim-buster
-
-# Java global config
-ARG MAX_RAM=2800G
-ENV JAVA_TOOL_OPTIONS \
- -Xmx${MAX_RAM} -XX:PretenureSizeThreshold=512M -XX:MaxNewSize=4G \
- -XX:+UseLargePages -XX:+UseTransparentHugePages -XX:+UseNUMA \
- -XX:+UseTLAB -XX:+ResizeTLAB \
- -Dlogback.configurationFile=app/configuration/logback.xml
-
-# Install swh.graph (both Python and Java parts)
-RUN apt-get update && \
- apt-get install --no-install-recommends --yes \
- curl time \
- gcc pkg-config libsystemd-dev python3-dev \
- python3-pip python3-setuptools && \
- rm -rf /var/lib/apt/lists/* && \
- pip3 install swh.graph
-
-# Install 3rd party dependencies (not shipped with swh.graph)
-WORKDIR /srv/softwareheritage/graph/lib
-
-RUN ln /usr/local/share/swh-graph/*.jar .
-
-# Add user files
-WORKDIR /srv/softwareheritage/graph/app
-COPY configuration configuration/
-COPY scripts scripts/
-
-# Default dir
-WORKDIR /srv/softwareheritage/graph
diff --git a/dockerfiles/configuration/logback.xml b/dockerfiles/configuration/logback.xml
deleted file mode 100644
index 76a4c7a..0000000
--- a/dockerfiles/configuration/logback.xml
+++ /dev/null
@@ -1,11 +0,0 @@
-<configuration>
- <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
- <encoder>
- <pattern>%d %r %p [%t] %logger{1} - %m%n</pattern>
- </encoder>
- </appender>
-
- <root level="INFO">
- <appender-ref ref="STDOUT"/>
- </root>
-</configuration>
diff --git a/dockerfiles/scripts/compress_graph.sh b/dockerfiles/scripts/compress_graph.sh
deleted file mode 100755
index 1194689..0000000
--- a/dockerfiles/scripts/compress_graph.sh
+++ /dev/null
@@ -1,112 +0,0 @@
-#!/bin/bash
-
-usage() {
- echo "Usage: compress_graph.sh --lib <LIB_DIR> --input <GRAPH_BASEPATH>"
- echo "Options:"
- echo " -o, --outdir <OUT_DIR> (Default: GRAPH_DIR/compressed)"
- echo " -t, --tmp <TMP_DIR> (Default: OUT_DIR/tmp)"
- echo " --stdout <STDOUT_LOG> (Default: OUT_DIR/stdout)"
- echo " --stderr <STDERR_LOG> (Default: OUT_DIR/stderr)"
- echo " --batch-size <BATCH_SIZE> (Default: 10^6): WebGraph internals"
- exit 1
-}
-
-graph_path=""
-out_dir=""
-lib_dir=""
-stdout_file=""
-stderr_file=""
-batch_size=1000000
-while (( "$#" )); do
- case "$1" in
- -i|--input) shift; graph_path=$1 ;;
- -o|--outdir) shift; out_dir=$1 ;;
- -l|--lib) shift; lib_dir=$1 ;;
- -t|--tmp) shift; tmp_dir=$1 ;;
- --stdout) shift; stdout_file=$1 ;;
- --stderr) shift; stderr_file=$1 ;;
- --batch-size) shift; batch_size=$1 ;;
- *) usage ;;
- esac
- shift
-done
-
-if [[ -z "$graph_path" || ! -d "$lib_dir" ]]; then
- usage
-fi
-if [ -z "$out_dir" ] ; then
- out_dir="$(dirname $graph_path)/compressed"
-fi
-if [ -z "$tmp_dir" ] ; then
- tmp_dir="${out_dir}/tmp"
-fi
-if [ -z "$stdout_file" ] ; then
- stdout_file="${out_dir}/stdout"
-fi
-if [ -z "$stderr_file" ] ; then
- stderr_file="${out_dir}/stderr"
-fi
-
-dataset=$(basename $graph_path)
-compr_graph_path="${out_dir}/${dataset}"
-
-test -d "$out_dir" || mkdir -p "$out_dir"
-test -d "$tmp_dir" || mkdir -p "$tmp_dir"
-
-step_info() {
- echo -e "\n* swh-graph: $1 step... ($2)\n"
-}
-
-java_cmd () {
- /usr/bin/time -v java -cp $lib_dir/'*' $*
-}
-
-{
- # Build a function (MPH) that maps node names to node numbers in
- # lexicographic order (output: .mph)
- step_info "MPH" "1/6" &&
- java_cmd it.unimi.dsi.sux4j.mph.GOVMinimalPerfectHashFunction \
- --zipped $compr_graph_path.mph --temp-dir $tmp_dir \
- $graph_path.nodes.csv.gz &&
-
- # Build the graph in BVGraph format (output: .{graph,offsets,properties})
- step_info "BV compress" "2/6" &&
- java_cmd it.unimi.dsi.big.webgraph.ScatteredArcsASCIIGraph \
- --function $compr_graph_path.mph --temp-dir $tmp_dir \
- --zipped $compr_graph_path-bv < $graph_path.edges.csv.gz &&
- # Build the offset big-list file to load the graph faster (output: .obl)
- java_cmd it.unimi.dsi.big.webgraph.BVGraph \
- --list $compr_graph_path-bv &&
-
- # Find a better permutation using a BFS traversal order (output: .order)
- step_info "BFS" "3/6" &&
- java_cmd it.unimi.dsi.law.big.graph.BFS \
- $compr_graph_path-bv $compr_graph_path.order &&
-
- # Permute the graph accordingly
- step_info "Permute" "4/6" &&
- java_cmd it.unimi.dsi.big.webgraph.Transform mapOffline \
- $compr_graph_path-bv $compr_graph_path \
- $compr_graph_path.order $batch_size $tmp_dir &&
- java_cmd it.unimi.dsi.big.webgraph.BVGraph \
- --list $compr_graph_path &&
-
- # Compute graph statistics (output: .{indegree,outdegree,stats})
- step_info "Stats" "5/6" &&
- java_cmd it.unimi.dsi.big.webgraph.Stats $compr_graph_path &&
-
- # Create transposed graph (to allow backward traversal)
- step_info "Transpose" "6/6" &&
- java_cmd it.unimi.dsi.big.webgraph.Transform transposeOffline \
- $compr_graph_path $compr_graph_path-transposed \
- $batch_size $tmp_dir &&
- java_cmd it.unimi.dsi.big.webgraph.BVGraph \
- --list $compr_graph_path-transposed
-} > $stdout_file 2> $stderr_file
-
-if [[ $? -eq 0 ]]; then
- echo "Graph compression done."
-else
- echo "Graph compression failed: see $stderr_file for more info."
- exit 1
-fi
diff --git a/docs/docker.rst b/docs/docker.rst
index c0d6ba5..62434c2 100644
--- a/docs/docker.rst
+++ b/docs/docker.rst
@@ -1,79 +1,58 @@
Docker environment
==================
Build
-----
.. code:: bash
$ git clone https://forge.softwareheritage.org/source/swh-graph.git
$ cd swh-graph
- $ docker build --tag swh-graph dockerfiles
+ $ docker build --tag swh-graph docker/
Run
---
Given a graph ``g`` specified by:
-- ``g.edges.csv.gz``: gzip-compressed csv file with one edge per line, as a
+- ``g.edges.csv.zst``: zstd-compressed CSV file with one edge per line, as a
"SRC_ID SPACE DST_ID" string, where identifiers are the
:ref:`persistent-identifiers` of each node.
-- ``g.nodes.csv.gz``: sorted list of unique node identifiers appearing in the
- corresponding ``g.edges.csv.gz`` file. The format is a gzip-compressed csv
- file with one persistent identifier per line.
+- ``g.nodes.csv.zst``: sorted list of unique node identifiers appearing in the
+ corresponding ``g.edges.csv.zst`` file. The format is a zstd-compressed CSV
+ file (single column) with one persistent identifier per line.
.. code:: bash
$ docker run -ti \
--volume /PATH/TO/GRAPH/:/srv/softwareheritage/graph/data \
--publish 127.0.0.1:5009:5009 \
swh-graph:latest \
bash
-Where ``/PATH/TO/GRAPH`` is a directory containing the ``g.edges.csv.gz`` and
-``g.nodes.csv.gz`` files. By default, when entering the container the current
+Where ``/PATH/TO/GRAPH`` is a directory containing the ``g.edges.csv.zst`` and
+``g.nodes.csv.zst`` files. By default, when entering the container the current
working directory will be ``/srv/softwareheritage/graph``; all relative paths
found below are intended to be relative to that dir.
Graph compression
~~~~~~~~~~~~~~~~~
To compress the graph:
.. code:: bash
- $ app/scripts/compress_graph.sh --lib lib/ --input data/g
-
-Warning: very large graphs may need a bigger batch size parameter for WebGraph
-internals (you can specify a value when running the compression script using:
-``--batch-size 1000000000``).
-
-
-Node identifier mappings
-~~~~~~~~~~~~~~~~~~~~~~~~
-
-To dump the mapping files (i.e., various node id <-> other info mapping files,
-in either ``.csv.gz`` or ad-hoc ``.map`` format):
-
-.. code:: bash
-
- $ java -cp lib/swh-graph-*.jar \
- org.softwareheritage.graph.backend.MapBuilder \
- data/g.nodes.csv.gz data/compressed/g
+ $ swh graph compress --graph data/g --outdir data/compressed
Graph server
~~~~~~~~~~~~
To start the swh-graph server:
.. code:: bash
- $ java -cp lib/swh-graph-*.jar \
- org.softwareheritage.graph.App data/compressed/g
-
-To specify the port on which the server will run, use the `--port` or `-p` flag
-(default is 5009).
+ $ swh graph rpc-serve --graph data/compressed/g
diff --git a/reports/experiments/experiments.tex b/reports/experiments/experiments.tex
index fcf1131..fce997e 100644
--- a/reports/experiments/experiments.tex
+++ b/reports/experiments/experiments.tex
@@ -1,233 +1,233 @@
\documentclass[11pt,a4paper]{article}
\usepackage[english]{babel}
\usepackage{a4wide}
\usepackage{booktabs}
\usepackage{minted}
\usepackage{siunitx}
\usepackage[colorlinks,urlcolor=blue,linkcolor=magenta,citecolor=red,linktocpage=true]{hyperref}
\title{Google Summer of Code 2019}
\author{Thibault Allançon}
\date{8 April 2019}
\begin{document}
\maketitle
Early experiments running WebGraph framework on the Software Heritage datasets.
\section{Environment}
Docker environment and compression script can be found here:
-\url{https://forge.softwareheritage.org/source/swh-graph/browse/master/dockerfiles/}.
+\url{https://forge.softwareheritage.org/source/swh-graph/browse/master/docker/}.
\section{Datasets analysis}
\begin{center}
\begin{tabular}{@{} l *4r @{}}
\toprule
\multicolumn{1}{c}{} &
\textbf{\mintinline{text}{.nodes.csv.gz}} &
\textbf{\mintinline{text}{.edges.csv.gz}} &
\textbf{\# of nodes} & \textbf{\# of edges} \\
\midrule
\texttt{rel\_to\_obj}
& 344M & 382M & \num{16222788} & \num{9907464} \\
\texttt{ori\_to\_snp}
& 1.3G & 3.7G & \num{112564374} & \num{194970670} \\
\texttt{dir\_to\_rev}
& 745M & 12G & \num{35399184} & \num{481829426} \\
\texttt{snp\_to\_obj}
& 3.5G & 21G & \num{170999796} & \num{831089515} \\
\texttt{rev\_to\_rev}
& 22G & 33G & \num{1117498391} & \num{1165813689} \\
\texttt{rev\_to\_dir}
& 41G & 48G & \num{2047888941} & \num{1125083793} \\
\texttt{dir\_to\_dir}
& 95G & 1.3T & \num{4805057515} & \num{48341950415} \\
\texttt{dir\_to\_cnt}
& 180G & 3T & \num{9231457233} & \num{112363058067} \\
\midrule
Entire graph (\texttt{all})
& 340G & 4.5T & \num{11595403407} & \num{164513703039} \\
\bottomrule
\end{tabular}
\end{center}
\section{Individual datasets compression}
The first experiments were done on individual datasets.
\subsection{Results}
Datasets were compressed on different VM (depending on availability):
\begin{itemize}
\item \textit{(sexus)} 1TB of RAM and 40vCPU: \mintinline{text}{dir_to_dir}
\item \textit{(monster)} 700GB of RAM and 72vCPU:
\mintinline{text}{dir_to_cnt}
\item \textit{(chub)} 2TB of RAM and 128vCPU: all the other datasets
\end{itemize}
Note: the results may vary because random permutations are used in the graph
compression process.
\begin{center}
\begin{tabular}{@{} l *4r @{}}
\toprule
\multicolumn{1}{c}{} &
\textbf{compr ratio} & \textbf{bit/edge} & \textbf{compr
size\footnotemark} \\
\midrule
\texttt{rel\_to\_obj} & 0.367 & 9.573 & 23M \\
\texttt{ori\_to\_snp} & 0.291 & 8.384 & 140M \\
\texttt{dir\_to\_rev} & 0.07 & 1.595 & 120M \\
\texttt{snp\_to\_obj} & 0.067 & 1.798 & 253M \\
\texttt{rev\_to\_rev} & 0.288 & 9.063 & 2.2G \\
\texttt{rev\_to\_dir} & 0.291 & 9.668 & 2.6G \\
\texttt{dir\_to\_dir} & 0.336 & 10.178 & 61G \\
\texttt{dir\_to\_cnt} & 0.228 & 7.054 & 97G \\
\midrule
Entire graph (estimated) & & & 163G \\
\bottomrule
\end{tabular}
\end{center}
\footnotetext{calculated as: size of \mintinline{bash}{*.graph} + size of
\mintinline{bash}{*.offsets}}
\subsection{Timings}
\begin{center}
\begin{tabular}{@{} l *6r @{}}
\toprule
\multicolumn{1}{c}{} &
\textbf{MPH} &
\textbf{BV Compress} &
\textbf{Symmetrized} &
\textbf{LLP} &
\textbf{Permute} &
\textbf{Total} \\
\midrule
\texttt{rel\_to\_obj}
& 14s & 25s & 18s & 8min & 10s & \textbf{9min} \\
\texttt{ori\_to\_snp}
& 1min & 5min & 3min & 1h30 & 1min & \textbf{1h40} \\
\texttt{dir\_to\_rev}
& 56s & 22min & 6min & 41min & 2min & \textbf{1h13} \\
\texttt{snp\_to\_obj}
& 3min & 22min & 8min & 2h50 & 5min & \textbf{3h30} \\
\texttt{rev\_to\_rev}
& 11min & 56min & 24min & 31h52 & 20min & \textbf{33h42} \\
\texttt{rev\_to\_dir}
& 20min & 1h & 30min & 52h45 & 23min & \textbf{55h} \\
\bottomrule
\end{tabular}
\end{center}
\vspace{0.5cm}
For the \mintinline{text}{dir_to_*} datasets we decided not to use the LLP algorithm
because it would take too long, and instead used a BFS traversal order for the
node re-ordering. This allows \textbf{much} faster computation and yields
similar results (thanks to our graph topology).
\vspace{0.5cm}
\begin{center}
\begin{tabular}{@{} l *5r @{}}
\toprule
\multicolumn{1}{c}{} &
\textbf{MPH} &
\textbf{BV Compress} &
\textbf{BFS} &
\textbf{Permute} &
\textbf{Total} \\
\midrule
\texttt{dir\_to\_dir}
& 4h36 & 50h & 4h44 & 12h38 & \textbf{72h} \\
\texttt{dir\_to\_cnt}
& 3h07 & 101h & 17h18 & 20h38 & \textbf{142h} \\
\bottomrule
\end{tabular}
\end{center}
\subsection{Memory usage}
Memory usage monitoring during the compression process:
\begin{center}
\begin{tabular}{@{} l c @{}}
\toprule
\multicolumn{1}{c}{} &
\textbf{Maximum resident set size} \\
\midrule
\texttt{rel\_to\_obj} & 11G \\
\texttt{ori\_to\_snp} & 15G \\
\texttt{dir\_to\_rev} & 22G \\
\texttt{snp\_to\_obj} & 23G \\
\texttt{rev\_to\_rev} & 86G \\
\texttt{rev\_to\_dir} & 154G \\
\texttt{dir\_to\_dir} & 345G \\
\texttt{dir\_to\_cnt} & 764G \\
\midrule
Entire graph (estimated) & 1.4T \\
\bottomrule
\end{tabular}
\end{center}
\section{Entire graph compression}
After studying feasibility on the individual datasets and estimating the final
results, we assembled the entire graph into a single dataset and launched the
compression process on it.
\subsection{Results}
Two different VMs were used depending on the compression step:
\begin{itemize}
\item \textit{(monster)} 700GB of RAM and 72vCPU: for the BV compress step.
\item \textit{(rioc)} 3TB of RAM and 48vCPU: all the other steps.
\end{itemize}
The reason to use monster instead of rioc for the BV compress step was because
the I/O on rioc was too slow for the job to complete within the time limit
allowed on the cluster.
\begin{center}
\begin{tabular}{@{} l *3r @{}}
\toprule
\multicolumn{1}{c}{} &
\textbf{compr ratio} & \textbf{bit/edge} & \textbf{compr size} \\
\midrule
\texttt{all} & 0.158 & 4.913 & 101G \\
\texttt{all-transposed} & 0.144 & 4.481 & 94G \\
\bottomrule
\end{tabular}
\end{center}
\subsection{Timings and max memory usage}
\begin{center}
\begin{tabular}{@{} r *2r @{}}
\toprule
\multicolumn{1}{c}{} &
\textbf{Timings} & \textbf{Max mem usage} \\
\midrule
\texttt{MPH} & 3h30 & 10GB \\
\texttt{BV Compress} & 103h & 16GB \\
\texttt{BFS} & 10h & 1057GB \\
\texttt{Permute} & 25h & 115GB \\
\texttt{Stats} & 4h & 102GB \\
\texttt{Transpose} & 22h & 19GB \\
\midrule
Total &
\begin{tabular}{@{}r@{}}\textbf{168h} \\
\scriptsize{(7 days)}\end{tabular} & \textbf{1TB} \\
\bottomrule
\end{tabular}
\end{center}
\end{document}
diff --git a/swh/graph/tests/dataset/.gitignore b/swh/graph/tests/dataset/.gitignore
index cf41cb1..531c841 100644
--- a/swh/graph/tests/dataset/.gitignore
+++ b/swh/graph/tests/dataset/.gitignore
@@ -1,4 +1,5 @@
-dockerfiles/
+docker/
output/*-bv.*
output/stderr
output/stdout
+output/compression.log
diff --git a/swh/graph/tests/dataset/generate_graph.sh b/swh/graph/tests/dataset/generate_graph.sh
index e1a72ee..7b78d36 100755
--- a/swh/graph/tests/dataset/generate_graph.sh
+++ b/swh/graph/tests/dataset/generate_graph.sh
@@ -1,27 +1,23 @@
#!/bin/bash
# Clean previous run
-rm -rf dockerfiles output
+rm -rf docker/ output
mkdir output
# Build Docker work environment
toplevel_dir=`git rev-parse --show-toplevel`
-mkdir -p dockerfiles
-cp -r $toplevel_dir/dockerfiles/ .
-docker build --tag swh-graph-test dockerfiles
+mkdir -p docker
+cp -r $toplevel_dir/docker/ .
+docker build --tag swh-graph-test docker
# Setup input for compression script
tr ' ' '\n' < example.edges.csv | sort -u > example.nodes.csv
zstd < example.nodes.csv > example.nodes.csv.zst
zstd < example.edges.csv > example.edges.csv.zst
-docker run \
- --user $(id -u):$(id -g) \
+docker run \
+ --user $(id -u):$(id -g) \
--name swh-graph-test --rm --tty --interactive \
- --volume $(pwd):/input \
- --volume $(pwd)/output:/output \
- swh-graph-test:latest \
- app/scripts/compress_graph.sh \
- --lib lib/ \
- --input /input/example \
- --outdir /output
+ --volume $(pwd):/input --volume $(pwd)/output:/output \
+ swh-graph-test:latest \
+ swh graph compress --graph /input/example --outdir /output
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Sat, Jun 21, 5:23 PM (1 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3245583
Attached To
rDGRPH Compressed graph representation
Event Timeline
Log In to Comment