diff --git a/reports/experiments/experiments.tex b/reports/experiments/experiments.tex
index fa9fe6b..7320132 100644
--- a/reports/experiments/experiments.tex
+++ b/reports/experiments/experiments.tex
@@ -1,256 +1,256 @@
\documentclass[11pt,a4paper]{article}

\usepackage[english]{babel}
\usepackage{a4wide}
\usepackage{booktabs}
\usepackage{minted}
\usepackage{siunitx}
\usepackage[colorlinks,urlcolor=blue,linkcolor=magenta,citecolor=red,linktocpage=true]{hyperref}

\title{Google Summer of Code 2019}
\author{Thibault Allançon}
\date{8 April 2019}

\begin{document}

\maketitle

Early experiments running the WebGraph framework on the Software Heritage
datasets.

\section{Environment}

The Docker environment and compression script can be found here:
\url{https://forge.softwareheritage.org/source/swh-graph/browse/master/dockerfiles/}.

\section{Datasets analysis}

\begin{center}
\begin{tabular}{@{} l *4r @{}}
    \toprule
    \multicolumn{1}{c}{} &
    \textbf{\mintinline{text}{.nodes.csv.gz}} &
    \textbf{\mintinline{text}{.edges.csv.gz}} &
    \textbf{\# of nodes} &
    \textbf{\# of edges} \\
    \midrule
    \texttt{rel\_to\_obj} & 344M & 382M & \num{16222788} & \num{9907464} \\
    \texttt{ori\_to\_snp} & 1.3G & 3.7G & \num{112564374} & \num{194970670} \\
    \texttt{dir\_to\_rev} & 745M & 12G & \num{35399184} & \num{481829426} \\
    \texttt{snp\_to\_obj} & 3.5G & 21G & \num{170999796} & \num{831089515} \\
    \texttt{rev\_to\_rev} & 22G & 33G & \num{1117498391} & \num{1165813689} \\
    \texttt{rev\_to\_dir} & 41G & 48G & \num{2047888941} & \num{1125083793} \\
    \texttt{dir\_to\_dir} & 95G & 1.3T & \num{4805057515} & \num{48341950415} \\
    \texttt{dir\_to\_cnt} & 180G & 3T & \num{9231457233} & \num{112363058067} \\
    \midrule
    Entire graph (\texttt{all}) & 340G & 4.5T & \num{11595403407} & \num{164513703039} \\
    \bottomrule
\end{tabular}
\end{center}

\section{Individual datasets compression}

The first experiments were done on individual datasets.

\subsection{Results}

Datasets were compressed on different VMs (depending on availability):

\begin{itemize}
    \item \textit{(sexus)} 1TB of RAM and 40vCPU: \mintinline{text}{dir_to_dir}
    \item \textit{(monster)} 700GB of RAM and 72vCPU: \mintinline{text}{dir_to_cnt}
    \item \textit{(chub)} 2TB of RAM and 128vCPU: all the other datasets
\end{itemize}

Note: the results may vary from run to run because random permutations are used
in the graph compression process.
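The variation comes from the node re-ordering at the heart of the pipeline: a
permutation of the node identifiers is computed (with LLP or, for the larger
datasets, with a BFS order, see the timings below) and the graph is rewritten
in that order. Purely as an illustration, the sketch below shows that final
re-ordering step against the standard \texttt{it.unimi.dsi.webgraph} Java API;
the class name, basenames and identity permutation are placeholders, and since
our datasets exceed $2^{31}$ nodes the actual pipeline has to go through the
\texttt{it.unimi.dsi.big.webgraph} variant of these classes.

\begin{minted}{java}
import it.unimi.dsi.webgraph.BVGraph;
import it.unimi.dsi.webgraph.ImmutableGraph;
import it.unimi.dsi.webgraph.Transform;

// Hypothetical helper, not part of WebGraph: applies a node permutation to a
// compressed graph and stores the re-ordered result.
public class PermuteAndStore {
    public static void main(String[] args) throws Exception {
        String basename = args[0]; // graph produced by the BV compress step
        String permuted = args[1]; // output basename for the re-ordered graph

        // Load the compressed graph (.graph / .offsets / .properties files).
        ImmutableGraph graph = ImmutableGraph.load(basename);

        // perm[i] = new identifier of node i. The identity permutation is a
        // placeholder: in the pipeline it comes from LLP or a BFS traversal.
        int[] perm = new int[graph.numNodes()];
        for (int i = 0; i < perm.length; i++) perm[i] = i;

        // Apply the permutation and store the result as a new BVGraph.
        BVGraph.store(Transform.map(graph, perm), permuted);
    }
}
\end{minted}

Because LLP itself is randomized, two runs can produce different permutations,
and hence the slightly different ratios and sizes mentioned in the note above.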
\begin{center}
\begin{tabular}{@{} l *3r @{}}
    \toprule
    \multicolumn{1}{c}{} &
    \textbf{compr ratio} &
    \textbf{bit/edge} &
    \textbf{compr size\footnotemark} \\
    \midrule
    \texttt{rel\_to\_obj} & 0.367 & 9.573 & 23M \\
    \texttt{ori\_to\_snp} & 0.291 & 8.384 & 140M \\
    \texttt{dir\_to\_rev} & 0.07 & 1.595 & 120M \\
    \texttt{snp\_to\_obj} & 0.067 & 1.798 & 253M \\
    \texttt{rev\_to\_rev} & 0.288 & 9.063 & 2.2G \\
    \texttt{rev\_to\_dir} & 0.291 & 9.668 & 2.6G \\
    \texttt{dir\_to\_dir} & 0.336 & 10.178 & 61G \\
    \texttt{dir\_to\_cnt} & 0.228 & 7.054 & 97G \\
    \midrule
    Entire graph (estimated) & & & 163G \\
    \bottomrule
\end{tabular}
\end{center}

\footnotetext{calculated as: size of \mintinline{bash}{*.graph} + size of
\mintinline{bash}{*.offsets}}

\subsection{Timings}

\begin{center}
\begin{tabular}{@{} l *6r @{}}
    \toprule
    \multicolumn{1}{c}{} & \textbf{MPH} &
-   \textbf{BVGraph} &
+   \textbf{BV Compress} &
    \textbf{Symmetrized} & \textbf{LLP} &
-   \textbf{Permutation} &
+   \textbf{Permute} &
    \textbf{Total} \\
    \midrule
    \texttt{rel\_to\_obj} & 14s & 25s & 18s & 8min & 10s & \textbf{9min} \\
    \texttt{ori\_to\_snp} & 1min & 5min & 3min & 1h30 & 1min & \textbf{1h40} \\
    \texttt{dir\_to\_rev} & 56s & 22min & 6min & 41min & 2min & \textbf{1h13} \\
    \texttt{snp\_to\_obj} & 3min & 22min & 8min & 2h50 & 5min & \textbf{3h30} \\
    \texttt{rev\_to\_rev} & 11min & 56min & 24min & 31h52 & 20min & \textbf{33h42} \\
    \texttt{rev\_to\_dir} & 20min & 1h & 30min & 52h45 & 23min & \textbf{55h} \\
    \bottomrule
\end{tabular}
\end{center}

\vspace{0.5cm}

For the \mintinline{text}{dir_to_*} datasets we decided not to use the LLP
algorithm because it would take too long, and instead used a BFS traversal
order for the node re-ordering. This allows \textbf{much} faster computation
and yields similar compression results (thanks to our graph topology); a
sketch of this ordering follows the table below.

\vspace{0.5cm}

\begin{center}
\begin{tabular}{@{} l *5r @{}}
    \toprule
    \multicolumn{1}{c}{} & \textbf{MPH} &
-   \textbf{BVGraph} &
+   \textbf{BV Compress} &
    \textbf{BFS} &
-   \textbf{Permutation} &
+   \textbf{Permute} &
    \textbf{Total} \\
    \midrule
    \texttt{dir\_to\_dir} & 4h36 & 50h & 4h44 & 12h38 & \textbf{72h} \\
    \texttt{dir\_to\_cnt} & 3h07 & 101h & 17h18 & 20h38 & \textbf{142h} \\
    \bottomrule
\end{tabular}
\end{center}
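As an illustration of the idea (not the code used for these experiments), here
is a minimal BFS ordering written against the standard
\texttt{it.unimi.dsi.webgraph} API with a plain in-memory queue; the real
datasets again require the \texttt{big} variant and a more memory-frugal
queue. The resulting array can be fed to the permutation step sketched earlier.

\begin{minted}{java}
import it.unimi.dsi.webgraph.ImmutableGraph;
import it.unimi.dsi.webgraph.LazyIntIterator;

import java.util.ArrayDeque;

// Hypothetical helper, not part of WebGraph.
public class BfsOrder {
    // Returns perm such that perm[node] = position of `node` in a BFS
    // traversal started from node 0 (and then from any node left unvisited).
    public static int[] bfsPermutation(ImmutableGraph graph) {
        int n = graph.numNodes();
        int[] perm = new int[n];
        boolean[] visited = new boolean[n];
        ArrayDeque<Integer> queue = new ArrayDeque<>();
        int next = 0;

        for (int root = 0; root < n; root++) {
            if (visited[root]) continue;
            visited[root] = true;
            queue.add(root);
            while (!queue.isEmpty()) {
                int node = queue.poll();
                perm[node] = next++;
                // Enqueue unvisited successors, following outgoing edges.
                LazyIntIterator successors = graph.successors(node);
                for (int succ; (succ = successors.nextInt()) != -1; ) {
                    if (!visited[succ]) {
                        visited[succ] = true;
                        queue.add(succ);
                    }
                }
            }
        }
        return perm;
    }
}
\end{minted}

The intuition is that a BFS order gives nearby identifiers to nodes that are
close in the graph, which the gap-based BV encoding exploits; on our datasets
this is apparently enough to get compression ratios in the same ballpark as
LLP at a fraction of the cost.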
\subsection{Memory usage}

Memory usage was monitored during the compression process:

\begin{center}
\begin{tabular}{@{} l c @{}}
    \toprule
    \multicolumn{1}{c}{} & \textbf{Maximum resident set size} \\
    \midrule
    \texttt{rel\_to\_obj} & 11G \\
    \texttt{ori\_to\_snp} & 15G \\
    \texttt{dir\_to\_rev} & 22G \\
    \texttt{snp\_to\_obj} & 23G \\
    \texttt{rev\_to\_rev} & 86G \\
    \texttt{rev\_to\_dir} & 154G \\
    \texttt{dir\_to\_dir} & 345G \\
    \texttt{dir\_to\_cnt} & 764G \\
    \midrule
    Entire graph (estimated) & 1.4T \\
    \bottomrule
\end{tabular}
\end{center}

\section{Entire graph compression}

After studying feasibility on the individual datasets and estimating the final
results, we assembled the entire graph into a single dataset and launched the
compression process on it.

\subsection{Results}

Two different VMs were used depending on the compression step:

\begin{itemize}
-    \item \textit{(monster)} 700GB of RAM and 72vCPU: for the BVGraph step.
+    \item \textit{(monster)} 700GB of RAM and 72vCPU: for the BV compress step.
    \item \textit{(rioc)} 3TB of RAM and 48vCPU: all the other steps.
\end{itemize}

-The reason to use monster instead of rioc for the BVGraph step was because the
-I/O on rioc was too slow for the job to complete within the time limit allowed
-on the cluster.
+We used monster rather than rioc for the BV compress step because the I/O on
+rioc was too slow for the job to complete within the time limit allowed on the
+cluster.

\begin{center}
\begin{tabular}{@{} l *3r @{}}
    \toprule
    \multicolumn{1}{c}{} & \textbf{compr ratio} & \textbf{bit/edge} & \textbf{compr size} \\
    \midrule
    \texttt{all} & 0.158 & 4.913 & 101G \\
    \bottomrule
\end{tabular}
\end{center}

\subsection{Timings}

\begin{center}
\begin{tabular}{@{} l *5r @{}}
    \toprule
    \multicolumn{1}{c}{} & \textbf{MPH} &
-   \textbf{BVGraph} &
+   \textbf{BV Compress} &
    \textbf{BFS} &
-   \textbf{Permutation} &
+   \textbf{Permute} &
    \textbf{Total} \\
    \midrule
    \texttt{all} & 3h30 & 103h & 9h52 & 24h39 & \textbf{141h} \\
    \bottomrule
\end{tabular}
\end{center}

Extra steps (more specific to our needs):

\begin{center}
\begin{tabular}{@{} l *2r @{}}
    \toprule
    \multicolumn{1}{c}{} & \textbf{Stats} & \textbf{Transpose} \\
    \midrule
    \texttt{all} & 3h58 & 21h33 \\
    \bottomrule
\end{tabular}
\end{center}

\subsection{Memory usage}

\begin{center}
\begin{tabular}{@{} l c @{}}
    \toprule
    \multicolumn{1}{c}{} & \textbf{Maximum resident set size} \\
    \midrule
    \texttt{all} & 1057G \\
    \bottomrule
\end{tabular}
\end{center}

\end{document}