diff --git a/talks-public/2020-12-07-coregraphie/2020-12-07-coregraphie.org b/talks-public/2020-12-07-coregraphie/2020-12-07-coregraphie.org new file mode 100644 index 0000000..6eb67cc --- /dev/null +++ b/talks-public/2020-12-07-coregraphie/2020-12-07-coregraphie.org @@ -0,0 +1,479 @@ +#+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) +#+TITLE: Ultra-Large-Scale Repository Analysis via Graph Compression +#+BEAMER_HEADER: \date[7 Dec 2020, Coregraphie]{7 December 2020\\Coregraphie -- Graph libraries and platforms Virtual Meeting} +#+AUTHOR: Antoine Pietri +#+DATE: 7 December 2020 +#+EMAIL: antoine.pietri@inria.fr + +#+INCLUDE: "../../common/modules/prelude.org" :minlevel 1 +#+INCLUDE: "../../common/modules/169.org" +#+BEAMER_HEADER: \pgfdeclareimage[height=90mm,width=160mm]{bgd}{world-169.png} +#+BEAMER_HEADER: \titlegraphic{} +#+BEAMER_HEADER: \institute[Inria]{Inria, France} +#+BEAMER_HEADER: \author[Antoine Pietri]{Antoine Pietri\\ {\small\tt antoine.pietri@inria.fr\\ @seirl\_}\\ based on work with Paolo Boldi, Sebastiano Vigna and Stefano Zacchiroli} +#+BEAMER_HEADER: \title[WebGraph for Large-Scale Repository Analysis]{Using WebGraph for Large-Scale Repository Analysis} + +# Syntax highlighting setup +#+LATEX_HEADER_EXTRA: \usepackage{minted} +#+LaTeX_HEADER_EXTRA: \usemintedstyle{tango} +#+LaTeX_HEADER_EXTRA: \newminted{sql}{fontsize=\scriptsize} +#+name: setup-minted +#+begin_src emacs-lisp :exports results :results silent + (setq org-latex-listings 'minted) + (setq org-latex-minted-options + '(("fontsize" "\\scriptsize"))) + (setq org-latex-to-pdf-process + '("pdflatex -shell-escape -interaction nonstopmode -output-directory %o %f" + "pdflatex -shell-escape -interaction nonstopmode -output-directory %o %f" + "pdflatex -shell-escape -interaction nonstopmode -output-directory %o %f")) +#+end_src +# End syntax highlighting setup + +* Introduction +** Background + - Our 
work: helping researchers study *public software development* + - Free/Open Source Software (*FOSS*) + *social coding* (GitHub, GitLab, ...) + - Applications: software evolution, software health, clone detection, provenance... + + #+BEAMER: \vfill \pause +*** Massive amounts of data + - GitHub alone: ~100 M repositories + - exponential growth rate, doubling every ~2 years (Rousseau et al., 2009) + - possibly the tip of the iceberg w.r.t. the rise of distributed forges and + non-public collaborative development (cf. /inner source/) + + #+BEAMER: \vfill \pause +*** Current approaches + - *scale-out analysis:* not always applicable, expensive + - *sampling:* (e.g., top-starred repos) prone to selection bias and + external validity issues + +** Compression approach +*** Objective + #+BEAMER: \Large + /Storing the entire graph of public software development on a single machine/ +*** + - Simple for prototyping + - Allows us to run topological analyses quickly + - Doesn't require scale-out approaches/algorithms + +** Corpus --- Software Heritage + - our archive of publicly available software: + #+BEAMER: \vspace{-3mm} + #+BEAMER: \begin{center}\includegraphics[width=0.6\textwidth]{SWH-logo+motto}\end{center} + #+BEAMER: \vspace{-3mm} + - both *source code* and its *development history* as captured by VCS + - coverage: + - software forges: GitHub, GitLab, Google Code, Gitorious... + - package manager repositories: NPM, PyPI, Debian, ... + - 110 M repositories, 8.2 B unique files, 1.7 B unique commits (data dump: + May 2020) + - available as offline dataset + #+BEGIN_EXPORT latex + \begin{thebibliography}{Foo Bar, 1969} + \footnotesize + \bibitem{Pietri2019} Antoine Pietri, Diomidis Spinellis, Stefano Zacchiroli\newblock + The Software Heritage Graph Dataset: Public software development under one roof\newblock + MSR 2019: 16th Intl. Conf. on Mining Software Repositories. 
IEEE + \end{thebibliography} + #+END_EXPORT + # \newblock preprint: \url{http://deb.li/swhmsr19} + +* Background +** (Web) graph compression +*** The graph of the Web + :PROPERTIES: + :BEAMER_env: definition + :END: + Directed graph that has Web pages as nodes and hyperlinks between them as + edges. +*** Properties (1) + - **Locality:** pages links to pages whose URL is lexicographically + similar. URLs share long common prefixes. + + → use *D-gap compression* +*** Adjacency lists + :PROPERTIES: + :BEAMER_env: block + :BEAMER_col: 0.5 + :END: + #+BEAMER: \scriptsize + | *Node* | *Outdegree* | *Successors* | + |--------+-------------+--------------------------------------| + | ... | ... | ... | + | 15 | 11 | 13,15,16,17,18,19,23,24,203,315,1034 | + | 16 | 10 | 15,16,17,22,23,24,315,316,317,3041 | + | 17 | 0 | | + | 18 | 5 | 13,15,16,17,50 | + | ... | ... | ... | +*** D-gapped adjacency lists + :PROPERTIES: + :BEAMER_env: block + :BEAMER_col: 0.5 + :END: + #+BEAMER: \scriptsize + | *Node* | *Outdegree* | *Successors* | + |--------+-------------+-----------------------------| + | ... | ... | ... | + | 15 | 11 | 3,1,0,0,0,0,3,0,178,111,718 | + | 16 | 10 | 1,0,0,4,0,0,290,0,0,2723 | + | 17 | 0 | | + | 18 | 5 | 9,1,0,0,32 | + | ... | ... | ... | + +** (Web) graph compression (cont.) +*** The graph of the Web + :PROPERTIES: + :BEAMER_env: definition + :END: + Directed graph that has Web pages as nodes and hyperlinks between them as + edges. +*** Properties (2) + - **Similarity:** pages that are close together in lexicographic order tend + to have many common successors. + → use *reference compression* +*** Adjacency lists + :PROPERTIES: + :BEAMER_env: block + :BEAMER_col: 0.5 + :END: + #+BEAMER: \scriptsize + | *Node* | *Outd.* | *Successors* | + |--------+---------+--------------------------------------| + | ... | ... | ... 
| + | 15 | 11 | 13,15,16,17,18,19,23,24,203,315,1034 | + | 16 | 10 | 15,16,17,22,23,24,315,316,317,3041 | + | 17 | 0 | | + | 18 | 5 | 13,15,16,17,50 | + | ... | ... | ... | +*** Copy lists + :PROPERTIES: + :BEAMER_env: block + :BEAMER_col: 0.55 + :END: + #+BEAMER: \scriptsize + | *Node* | *Ref.* | *Copy list* | *Extra nodes* | + |--------+--------+-------------+--------------------------------------| + | ... | ... | ... | ... | + | 15 | 0 | | 13,15,16,17,18,19,23,24,203,315,1034 | + | 16 | 1 | 01110011010 | 22,316,317,3041 | + | 17 | | | | + | 18 | 3 | 11110000000 | 50 | + | ... | ... | ... | | + +* Data model +** Data model + #+BEAMER: \centering \includegraphics[width=\textwidth]{swh-data-model-h} + +** Data model --- example :noexport: + #+BEAMER: \centering + #+BEAMER: \only<1>{\includegraphics[width=\textwidth]{swh-merkle-dag-small-visit1}} + #+BEAMER: \only<2>{\includegraphics[width=\textwidth]{swh-merkle-dag-small-visit2}} + #+BEAMER: \only<3>{\includegraphics[width=\textwidth]{swh-merkle-dag-small}} + +* VCS compression +** Corpus --- as a graph +*** Nodes + :PROPERTIES: + :BEAMER_env: block + :BEAMER_col: 0.5 + :END: + | *Node type* | *N. of nodes* | + |-------------+---------------| + | origins | 88 M | + | snapshots | 57 M | + | releases | 9.9 M | + | revisions | 1.1 B | + | directories | 4.9 B | + | contents | 5.5 B | + |-------------+---------------| + | Total nodes | 12 B | +*** Edges + :PROPERTIES: + :BEAMER_env: block + :BEAMER_col: 0.5 + :END: + | *Edge type* | *N. 
of edges* | + |-----------------------+---------------| + | origin → snapshot | 195 M | + | snapshot → revision | 616 M | + | snapshot → release | 215 M | + | release → revision | 9.9 M | + | revision → revision | 1.2 B | + | revision → directory | 1.1 B | + | directory → directory | 48 B | + | directory → revision | 482 M | + | directory → content | 112 B | + |-----------------------+---------------| + | Total edges | 165 B | +*** + Archive snapshot 2018-09-25, from the Software Heritage graph dataset.\\ + Growth rate: exponential, doubling every 22-30 months. + +** Compression pipeline +*** Pipeline + :PROPERTIES: + :BEAMER_env: ignoreheading + :END: + #+BEAMER: \vspace{-8mm} + #+BEAMER: \hspace*{-0.1\linewidth} \includegraphics[width=1.2\linewidth]{compression/compression-steps} + #+BEAMER: \vspace{-1.2cm} + - *MPH:* minimal perfect hash, mapping Merkle IDs to 0..N-1 integers + - *BV compress:* Boldi-Vigna compression (based on MPH order) + - *BFS:* breadth-first visit to renumber + - *Permute:* update BV compression according to BFS order + #+BEAMER: \vfill +*** (Re)establishing locality + - key for good compression is a node ordering that ensures locality and + similarity + - which is very much /not/ the case with Merkle IDs... + - ...but is the case /again/ after BFS + +** Compression time + We ran the compression pipeline on the input corpus using the WebGraph + framework + #+BEGIN_EXPORT latex + \begin{thebibliography}{Foo Bar, 1969} + \footnotesize + \bibitem{BoVWFI} Paolo Boldi and Sebastiano Vigna. + \newblock The WebGraph framework I: Compression techniques + \newblock WWW 2004: 13th Intl. World Wide Web Conference. 
ACM + \end{thebibliography} + #+END_EXPORT + + | *Step* | *Wall time* (hours) | + |-------------+---------------------| + | MPH | 2 | + | BV Compress | 84 | + | BFS | 19 | + | Permute | 18 | + | Transpose | 15 | + |-------------+---------------------| + | Total | 138 (6 days) | + + - server equipped with 24 CPUs and 750 GB of RAM + - RAM mostly used as I/O cache for the BFS step + - /minimum/ memory requirements are close to the RAM needed to load the + final compressed graph in memory + +** Compression efficiency +*** Forward graph + :PROPERTIES: + :BEAMER_env: block + :BEAMER_col: 0.4 + :END: + | total size | 91 GiB | + | bits per edge | 4.91 | + # | compression ratio | 15.8% | +*** Backward graph + :PROPERTIES: + :BEAMER_env: block + :BEAMER_col: 0.4 + :END: + | total size | 83 GiB | + | bits per edge | 4.49 | + # | compression ratio | 14.4% | + #+BEAMER: \vfill +*** Operation cost + The structure of a full bidirectional archive graph fits in less than 200 + GiB of RAM, for a hardware cost of ~300 USD. + +* Exploitation +** A domain-agnostic benchmark --- full corpus traversal +*** Benchmark --- Full BFS visit + #+BEAMER: \begin{columns}\begin{column}{0.45\textwidth} + | *Forward graph* | | + |------------------+----------------| + | wall time | 1h48m | + | throughput | 1.81 M nodes/s | + | | (553 ns/node) | + #+BEAMER: \end{column}\begin{column}{0.45\textwidth} + | *Backward graph* | | + |------------------+----------------| + | wall time | 3h17m | + | throughput | 988 K nodes/s | + | | (1.01 µs/node) | + #+BEAMER: \end{column}\end{columns} +*** + :PROPERTIES: + :BEAMER_env: ignoreheading + :END: + For comparison, BFS of this graph on Spark/GraphX: 4 hours, 80 nodes(!) 
+** A domain-agnostic benchmark --- edge lookup +*** Benchmark --- Edge lookup + random sample: 1 B nodes (8.3% of entire graph) + #+BEAMER: \begin{columns}\begin{column}{0.45\textwidth} + | *Forward graph* | | + |------------------+----------------| + | visited edges | 13.6 B | + | throughput | 12.0 M edges/s | + | | (83 ns/edge) | + #+BEAMER: \end{column}\begin{column}{0.45\textwidth} + | *Backward graph* | | + |------------------+----------------| + | visited edges | 13.6 B | + | throughput | 9.45 M edges/s | + | | (106 ns/edge) | + #+BEAMER: \end{column}\end{columns} +*** + :PROPERTIES: + :BEAMER_env: ignoreheading + :END: + Note how edge lookup time is close to DRAM random access time (50-60 ns). + +# ** Domain-specific benchmarks --- source code artifact multiplication +# *** +# Simple *clone detection* style experiments realized exploiting the +# compressed corpus: +# 1. *file→commit multiplication:* how much identical source code files +# re-occur in different commits +# 2. *commit→origin multiplication:* how much identical commits re-occur in +# different repositories +# *** Implementation +# - for each node---content for (1), commit for (2)---*visit* the backward +# graph and *count* all reachable nodes of the desired type---commit for +# (1), origin for (2) +# - naive approach, O(|V|x|E|) complexity +# ** File→commit multiplication --- results +# #+BEAMER: \vspace{-5mm} +# *** Multiplication factor +# :PROPERTIES: +# :BEAMER_env: block +# :BEAMER_col: 0.5 +# :END: +# #+BEAMER: \centering \includegraphics[width=0.9\textwidth]{this/content-revision-multiplication.png} +# *** Visit size +# :PROPERTIES: +# :BEAMER_env: block +# :BEAMER_col: 0.5 +# :END: +# #+BEAMER: \centering \includegraphics[width=0.9\textwidth]{this/content-revision-node-size.png} +# *** Benchmark figures +# :PROPERTIES: +# :BEAMER_env: ignoreheading +# :END: +# #+BEAMER: \vfill +# - random sample of 953 M contents (17% of the full corpus) +# - processing time: ~2.5 days (single machine 
with 20 x 2.4 GHz cores) +# - /in spite of/ the naive O(|V|x|E|) approach, generally considered +# intractable at this scale + +* Discussion +** Limitations + #+BEAMER: \vspace{-2mm} +*** Incrementality + - compression is inherently *not incremental* + - not an issue for most research use cases, because we analyze immutable + data dumps + - common workaround (e.g., for the Web and social networks) is to keep an + uncompressed *in-memory overlay* for graph updates, and periodically + recompress + #+BEAMER: \pause +* Code examples +** Code example: simple DFS + :PROPERTIES: + :BEAMER_OPT: fragile + :END: +#+attr_latex: :options fontsize=\scriptsize,highlightlines={8} +#+begin_src java + HashSet<Long> visited = new HashSet<>(); + Stack<Long> stack = new Stack<>(); + stack.push(srcNodeId); + visited.add(srcNodeId); + + while (!stack.isEmpty()) { + long currentNodeId = stack.pop(); + LazyLongIterator it = graph.successors(currentNodeId); + for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1; ) { + if (!visited.contains(neighborNodeId)) { + stack.push(neighborNodeId); + visited.add(neighborNodeId); + } + } + } +#+end_src +* Going further +** Going further: LLP compression +*** Layered Label Propagation + #+BEGIN_EXPORT latex + \begin{thebibliography}{Foo Bar, 1969} + \small \vspace{-2mm} + \bibitem{Boldi2010} Paolo Boldi, Marco Rosa, Massimo Santini, Sebastiano Vigna + \newblock Layered Label Propagation: A MultiResolution Coordinate-Free Ordering for Compressing Social Networks + \end{thebibliography} + #+END_EXPORT +*** + - New algorithm to find locality information + - Propagates labels on random nodes to discover neighborhoods + - Requires more memory (33 bytes per node) + - Even more impressive compression ratio (117 GiB → 77 GiB) + +** Going further: Attributes +*** Node attributes + - the compressed in-memory graph structure has *no attributes* + - usual data design is to exploit the 0..N-1 integer ranges to *memory map + /node/ attributes* to secondary storage + 
- we have done this with a *node type map*; it weighs 4 GB (3 bits per + node) +*** Edge attributes + - Built-in WebGraph support for attributes on the *edges* (generally integers) + - For file /names/, we use another minimal perfect hash to map file names to integers +*** Disk/memory consideration + - Labels and mappings can be either stored on disk or in RAM + - Hybrid setup works well for queries that do graph traversal first and + "join" node attributes last; ping-pong between the two is expensive + +** Code example: reading a labelled graph +#+attr_latex: :options fontsize=\scriptsize,highlightlines={9,12,13,14} +#+begin_src java +ArcLabelledImmutableGraph graph = BitStreamArcLabelledImmutableGraph.load( + graphPath + "-labelled"); +ArcLabelledNodeIterator it = graph.nodeIterator(); +while (it.hasNext()) { + long srcNode = it.nextLong(); + ArcLabelledNodeIterator.LabelledArcIterator s = it.successors(); + long dstNode; + while ((dstNode = s.nextLong()) >= 0) { + int label = (int) s.label().get(); + System.out.format("%s %s %s\n", + // Fetch the actual attributes from their integer ID + nodeMap.get(srcNode), + nodeMap.get(dstNode), + labelMap.get(label) + ); + } +} +#+end_src + +* Conclusion +** Wrapping up + #+BEAMER: \vspace{-2mm} +*** Key points + - WebGraph can compress immense graphs (~200 B edges) to + a reasonable size (around 200 GiB) + - Runs on commodity hardware, cheaper than a cluster + - Very efficient and performance-conscious + - Lots of tooling (perfect hash functions, graph algorithms, ...) 
+ - Only stores topology, but has the necessary tooling to store node/edge attributes + - Not incremental, only on a static version + +* Conclusion +** Ending notes +*** Full paper of our return of experience + #+BEGIN_EXPORT latex + \begin{thebibliography}{Foo Bar, 1969} + \small \vspace{-2mm} + \bibitem{Boldi2020} Paolo Boldi, Antoine Pietri, Sebastiano Vigna, Stefano Zacchiroli + \newblock Ultra-Large-Scale Repository Analysis via Graph Compression + \newblock SANER 2020, 27th Intl. Conf. on Software Analysis, Evolution and Reengineering. IEEE + \end{thebibliography} + preprint: {\small \url{http://bit.ly/swh-graph-saner20}} + #+END_EXPORT +*** Contacts + Antoine Pietri / antoine.pietri@inria.fr / @seirl_ \\ + Stefano Zacchiroli / zack@irif.fr / @zacchiro + +* Appendix :B_appendix: + :PROPERTIES: + :BEAMER_env: appendix + :END: diff --git a/talks-public/2020-12-07-coregraphie/Makefile b/talks-public/2020-12-07-coregraphie/Makefile new file mode 100644 index 0000000..68fbee7 --- /dev/null +++ b/talks-public/2020-12-07-coregraphie/Makefile @@ -0,0 +1 @@ +include ../Makefile.slides