diff --git a/common/images/compression/compression-steps.dot b/common/images/compression/compression-steps.dot
new file mode 100644
index 0000000..dd0d48e
--- /dev/null
+++ b/common/images/compression/compression-steps.dot
@@ -0,0 +1,28 @@
+digraph "Compression steps" {
+    // Horizontal graph
+    rankdir=LR;
+
+    mph [label="MPH", shape=box];                  // MPH = minimal perfect hash
+
+    bv_compress [label="BV compress", shape=box];  // BV = Boldi-Vigna (WebGraph)
+
+    bfs [label="BFS", shape=box];
+
+    permute [label="Permute", shape=box];          // renumber nodes in BFS order
+    comp_fwd [label="Compressed\ngraph\n(forward)", shape=none];
+    comp_bwd [label="Compressed\ngraph\n(backward)", shape=none];
+
+    transpose [label="Transpose", shape=box];
+
+    merkle_dag [label="Merkle\nDAG", shape=none];
+
+    merkle_dag -> mph;
+    merkle_dag -> bv_compress;
+    mph -> bv_compress;
+    bv_compress -> bfs;
+    bv_compress -> permute;
+    bfs -> permute;
+    permute -> comp_fwd;
+    comp_fwd -> transpose;
+    transpose -> comp_bwd;
+}
diff --git a/common/images/compression/compression-steps.pdf b/common/images/compression/compression-steps.pdf
new file mode 100644
index 0000000..7b1ee32
Binary files /dev/null and b/common/images/compression/compression-steps.pdf differ
diff --git a/talks-public/2019-11-14-zurich-uzh/2019-11-14-zurich-uzh.org b/talks-public/2019-11-14-zurich-uzh/2019-11-14-zurich-uzh.org
new file mode 100644
index 0000000..2c1943f
--- /dev/null
+++ b/talks-public/2019-11-14-zurich-uzh/2019-11-14-zurich-uzh.org
@@ -0,0 +1,145 @@
+#+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt)
+#+TITLE: Software Heritage
+#+SUBTITLE: Source Code Archival and Analysis at the Scale of the World
+#+BEAMER_HEADER: \date[14 Nov 2019, UZH]{14 November 2019\\University of Zurich --- Zurich, Switzerland}
+#+AUTHOR: Stefano Zacchiroli
+#+DATE: 14 November 2019
+#+EMAIL: zack@upsilon.cc
+
+#+INCLUDE: "../../common/modules/prelude-toc.org" :minlevel 1
+#+INCLUDE: "../../common/modules/169.org"
+#+BEAMER_HEADER: \institute[UPD \& Inria]{Univ. Paris Diderot \& Inria --- {\tt zack@upsilon.cc, @zacchiro}}
+#+BEAMER_HEADER: \author{Stefano Zacchiroli}
+
+* Source code in science
+** Software source code: a pillar of Open Science
+*** Software is everywhere in modern research :B_picblock:
+    :PROPERTIES:
+    :BEAMER_opt: pic=papermountain, leftpic=true, width=.3\linewidth
+    :BEAMER_env: picblock
+    :BEAMER_COL: .6
+    :END:
+#+BEGIN_QUOTE
+[...] software [...] essential in their fields.
+
+\mbox{}\hfill Top 100 papers (Nature, 2014)
+#+END_QUOTE
+#+BEGIN_QUOTE
+Sometimes, if you don't have the software, you don't have the data.
+
+\mbox{}\hfill Christine Borgman, Paris, 2018
+#+END_QUOTE
+# http://www.nature.com/news/the-top-100-papers-1.16224
+#+BEAMER: \pause
+*** Open Science: three pillars :B_block:
+    :PROPERTIES:
+    :BEAMER_COL: .45
+    :BEAMER_env: block
+    :END:
+#+latex: \begin{center}
+#+ATTR_LATEX: :width \extblockscale{\linewidth}
+file:PreservationTriangle.png
+#+latex: \end{center}
+#+BEAMER: \pause
+*** :B_ignoreheading:
+    :PROPERTIES:
+    :BEAMER_env: ignoreheading
+    :END:
+*** Nota bene
+    \hfill The links in the picture are *essential*
+** Source code is /special/
+#+INCLUDE: "../../common/modules/source-code-different-short.org::#softwareisdifferent" :only-contents t :minlevel 3
+** The state of the art is not ideal
+#+INCLUDE: "../../common/modules/reprod-bad-sota.org::#collbergmethod" :only-contents t :minlevel 3
+** The state of the art is not ideal... (cont.)
+#+INCLUDE: "../../common/modules/reprod-bad-sota.org::#collbergfindings" :only-contents t :minlevel 3 +#+BEAMER: \pause +*** The main reason + \hfill source code (/or the right version of it/) cannot be found +** Motivations :noexport: +#+INCLUDE: "../../common/modules/swh-motivations-foss.org::#fragile" :minlevel 2 +#+INCLUDE: "../../common/modules/swh-motivations-foss.org::#spread" :minlevel 2 +#+INCLUDE: "../../common/modules/swh-motivations-foss.org::#research" :minlevel 2 + +* Software Heritage + #+INCLUDE: "../../common/modules/swh-overview-sourcecode.org::#mission" :minlevel 2 + #+INCLUDE: "../../common/modules/principles-short.org::#principles" :minlevel 2 + #+INCLUDE: "../../common/modules/status-extended.org::#archivinggoals" :minlevel 2 + #+INCLUDE: "../../common/modules/status-extended.org::#architecture" :minlevel 2 :only-contents t + #+INCLUDE: "../../common/modules/status-extended.org::#datamodel" :only-contents t + #+INCLUDE: "../../common/modules/status-extended.org::#dagdetailsmall" :only-contents t + #+INCLUDE: "../../common/modules/status-extended.org::#archive" :minlevel 2 + # #+INCLUDE: "../../common/modules/status-extended.org::#api" :only-contents t + # #+INCLUDE: "../../common/modules/status-extended.org::#apiintro" :minlevel 2 + # #+INCLUDE: "../../common/modules/vault.org::#overview" :minlevel 2 + # #+INCLUDE: "../../common/modules/webui.org::#intro" :minlevel 2 + +* Research challenges & roadmap + #+INCLUDE: "this/research-roadmap.org::#main" :only-contents t +* Conclusion +** MSR 2020 Mining Challenge = Software Heritage Graph Dataset + #+BEGIN_EXPORT latex + \includegraphics[width=\textwidth]{this/msr-2020} + #+END_EXPORT + + https://2020.msrconf.org/track/msr-2020-mining-challenge + +*** + #+BEGIN_EXPORT latex + \begin{thebibliography}{Foo Bar, 1969} + \bibitem{Pietri2019} Antoine Pietri, Diomidis Spinellis, Stefano + Zacchiroli\newblock The Software Heritage graph dataset: public software + development under one roof\newblock MSR 2019: Mining Software Repositories, + IEEE + \end{thebibliography} + #+END_EXPORT + +** We're hiring! (a postdoc) +*** Paris-based postdoc on software provenance + - large-scale, big data *graph analysis* + - tracking the *provenance of source code* artifacts + - … at the *scale of the world* (what else?) + - in the context of *industrial partnerships* on open source license + compliance + - supervision: Stefano Zacchiroli, Roberto Di Cosmo + +*** Learn more and apply + - https://www.softwareheritage.org/jobs/ + - ask me! zack@upsilon.cc + +** Wrapping up + #+latex: \vspace{-2mm} +*** + - Software Heritage archives all software source code with its development + history. + - It is a major endeavor that benefits society, science, and industry. + - For computer scientists, it is a gold mine of research opportunities. + Wanna join? 
+   #+latex: \vspace{-2mm}
+*** References
+    #+latex: \vspace{-1mm}
+    #+BEGIN_EXPORT latex
+    \begin{thebibliography}{Foo Bar, 1969}
+    \scriptsize
+
+    \bibitem{Pietri2019} Antoine Pietri, Diomidis Spinellis, Stefano Zacchiroli\newblock
+    The Software Heritage graph dataset: public software development under one roof\newblock
+    MSR 2019: Mining Software Repositories, IEEE
+
+    \bibitem{Abramatic2018} Jean-François Abramatic, Roberto Di Cosmo, Stefano Zacchiroli\newblock
+    Building the Universal Archive of Source Code\newblock
+    Communications of the ACM, October 2018
+
+    \bibitem{DiCosmo2017} Roberto Di Cosmo, Stefano Zacchiroli\newblock
+    Software Heritage: Why and How to Preserve Software Source Code\newblock
+    iPRES 2017: Intl. Conf. on Digital Preservation
+
+    \end{thebibliography}
+    #+END_EXPORT
+*** Contacts
+    Stefano Zacchiroli / zack@upsilon.cc / @zacchiro
+* Appendix :B_appendix:
+  :PROPERTIES:
+  :BEAMER_env: appendix
+  :END:
+  #+INCLUDE: "../../common/modules/dataset.org::#main" :only-contents t
diff --git a/talks-public/2019-11-14-zurich-uzh/Makefile b/talks-public/2019-11-14-zurich-uzh/Makefile
new file mode 100644
index 0000000..68fbee7
--- /dev/null
+++ b/talks-public/2019-11-14-zurich-uzh/Makefile
@@ -0,0 +1 @@
+include ../Makefile.slides
diff --git a/talks-public/2019-11-14-zurich-uzh/this/msr-2020.png b/talks-public/2019-11-14-zurich-uzh/this/msr-2020.png
new file mode 100644
index 0000000..0c87433
Binary files /dev/null and b/talks-public/2019-11-14-zurich-uzh/this/msr-2020.png differ
diff --git a/talks-public/2019-11-14-zurich-uzh/this/research-roadmap.org b/talks-public/2019-11-14-zurich-uzh/this/research-roadmap.org
new file mode 100644
index 0000000..10ddf2f
--- /dev/null
+++ b/talks-public/2019-11-14-zurich-uzh/this/research-roadmap.org
@@ -0,0 +1,273 @@
+#+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt)
+
+# A research roadmap to realize the "big code" analysis vision in practice
+
+#+INCLUDE: "prelude.org" :minlevel 1
+
+* Research roadmap
+  :PROPERTIES:
+  :CUSTOM_ID: main
+  :END:
+** Realizing the "large telescope of source code" in practice
+*** Requirements
+    - *Availability*: Software Heritage mirror, relatively up-to-date
+    - *Efficiency*: massive computing resources with fast access to the mirror
+    - *Sustainability*: pay-per-use or bring-your-own-computing
+*** Challenges
+    - mirroring
+    - compression
+    - efficient processing
+    - experiment description language
+    - big code analysis (i.e., ML on source code)
+*** :B_ignoreheading:
+    :PROPERTIES:
+    :BEAMER_env: ignoreheading
+    :END:
+    ... at the scale of the world!
+
+** Challenge #1: Mirroring
+*** Thomas Jefferson, February 18, 1791
+    #+latex: \begin{quote}
+    Let us save what remains: not by vaults and locks which fence them from
+    the public eye and use in consigning them to the waste of time, but by
+    such a *multiplication of copies*, as shall place them *beyond the reach
+    of accident*.
+    #+latex: \end{quote}
+    #+beamer: \pause
+*** Mirroring: the good
+    - big, but not /that/ big
+    - append-only archive (in theory), easy to journal and incrementally
+      update
+** Challenge #1: Mirroring (cont.)
+*** Mirroring: the bad --- Merkle DAGs with "holes"
+    - the world sucks: corrupted repositories, takedown notices, data losses
+    - nodes can go missing at archival time or disappear later on
+    - top-level hash(es) no longer capture the full state of the archive
+      (the toy sketch below illustrates why)
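+*** Merkle "holes", illustrated
+    A minimal sketch, with toy node naming and hashing rather than the actual
+    Software Heritage data model, of how a single missing node makes every
+    ancestor hash, up to and including the top-level one, unverifiable:
+    #+BEGIN_SRC python
+    import hashlib
+
+    def merkle_hash(node, children, store):
+        """Hash a node over its children; None marks an unverifiable hash."""
+        h = hashlib.sha1(node.encode())
+        for child in children.get(node, []):
+            if child not in store:       # hole: never archived, or lost later
+                return None
+            child_hash = merkle_hash(child, children, store)
+            if child_hash is None:       # holes propagate to all ancestors
+                return None
+            h.update(child_hash)
+        return h.digest()
+
+    children = {"root": ["dir1", "dir2"], "dir1": ["file1"]}
+    store = {"root", "dir1", "dir2"}     # "file1" is a hole
+    print(merkle_hash("root", children, store))  # None: state not captured
+    #+END_SRC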
+*** Open questions
+    - how do you capture that full state, then?
+    - by extension: how do you /timestamp/ the archive?
+    - how do you efficiently check if something is to be re-archived?
+    - ultimately, what's your notion of having "fully archived" something?
+
+** Challenge #2: Compression --- file contents
+*** Storage figures
+    - file contents: ~400 TB (raw), ~200 TB compressed (1-by-1 content
+      compression)
+    - median compressed size: 3 KB → *a lot (~6 B) of very small files*
+*** :B_ignoreheading:
+    :PROPERTIES:
+    :BEAMER_env: ignoreheading
+    :END:
+    Practical problem: scale-out object storages are not designed for this
+    workload.
+    #+beamer: \pause
+*** Content compression
+    - low compression ratio (2x) with 1-by-1 compression
+    - typical Git/VCS packing heuristics do not work here, because contents
+      occur in many different contexts
+    - early experiments with Rabin-style compression & co. were unsatisfactory
+    - increased deduplication granularity (e.g., SLOCs) will likely suffer
+      from the same problem
+    - research lead: *object packing*, using heuristics that maximize the
+      chances of compressibility (e.g., having /a/ file name in common)
+
+** Challenge #2: Compression --- graph
+*** Storage figures
+    - Merkle DAG: ~20 B nodes, ~280 B edges
+    - breakdown by node type: ~40% contents, ~40% directories, ~10% commits
+*** :B_ignoreheading:
+    :PROPERTIES:
+    :BEAMER_env: ignoreheading
+    :END:
+    Still outside the limits of what (cheaply & trivially) fits in RAM.
+    #+beamer: \pause
+*** Graph compression
+    - related: Web graph compression techniques in the style of, e.g.:
+      #+BEGIN_EXPORT latex
+      \begin{thebibliography}{Foo Bar, 1969}
+      \bibitem{Vigna2004} Paolo Boldi, Sebastiano Vigna
+      \newblock The WebGraph framework I: compression techniques
+      \newblock WWW 2004
+      \end{thebibliography}
+      #+END_EXPORT
+    - challenges: no canonical identifiers with good locality properties,
+      different node types/subgraphs, live updates
+    - research lead: graph topology characterization
+    (the toy sketch on the next slide illustrates the gap-encoding idea
+    these techniques build on)
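+** Graph compression --- the core encoding idea, sketched
+   A toy sketch of the gap + variable-length integer encoding behind
+   WebGraph-style successor lists (illustrative only, not the actual library
+   format): node ids with good locality, which is what the BFS and Permute
+   steps of the pipeline aim for, yield small gaps and thus few bits per edge.
+   #+BEGIN_SRC python
+   def vbyte(n):
+       """Variable-length byte encoding of a non-negative integer."""
+       out = bytearray()
+       while True:
+           out.append((n & 0x7F) | (0x80 if n > 0x7F else 0))
+           n >>= 7
+           if n == 0:
+               return bytes(out)
+
+   def encode_successors(successors):
+       """Encode a successor list as: length, first id, then gaps."""
+       succ = sorted(successors)
+       out = bytearray(vbyte(len(succ))) + vbyte(succ[0])
+       for prev, s in zip(succ, succ[1:]):
+           out += vbyte(s - prev - 1)   # strictly increasing => gap - 1
+       return bytes(out)
+
+   # 4 edges in 6 bytes here; the real WebGraph format adds reference
+   # copying, intervals, etc., to go well below one byte per edge
+   print(len(encode_successors([1001, 1002, 1005, 1040])))
+   #+END_SRC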
+** Graph compression --- preliminary results
+   #+BEGIN_EXPORT latex
+   \vspace{-5mm}
+   \includegraphics[width=\linewidth]{../../common/images/compression/compression-steps}
+   #+END_EXPORT
+   #+BEAMER: \vspace{-1cm}
+*** Dataset
+    - archive snapshot 2018-09-25
+    - size: 12 B nodes, 165 B edges
+    #+BEAMER: \pause
+*** Compression efficiency (space)
+    #+BEAMER: \vspace{-3mm}\begin{columns}\begin{column}{0.45\textwidth}
+    | *forward graph*   |        |
+    |-------------------+--------|
+    | total size        | 91 GiB |
+    | bits per edge     | 4.91   |
+    | compression ratio | 15.8%  |
+    #+BEAMER: \end{column}\begin{column}{0.45\textwidth}
+    | *backward graph*  |        |
+    |-------------------+--------|
+    | total size        | 83 GiB |
+    | bits per edge     | 4.49   |
+    | compression ratio | 14.4%  |
+    #+BEAMER: \end{column}\end{columns}
+***
+    :PROPERTIES:
+    :BEAMER_env: ignoreheading
+    :END:
+    The structure of a full bidirectional archive graph fits in less than
+    200 GiB of RAM, for a hardware cost of ~300 USD.
+** Graph compression --- preliminary results (cont.)
+*** Analysis efficiency (time) --- Full BFS visit
+    #+BEAMER: \begin{columns}\begin{column}{0.45\textwidth}
+    | *forward graph* |                |
+    |-----------------+----------------|
+    | wall time       | 1h48m          |
+    | throughput      | 1.81 M nodes/s |
+    |                 | (553 ns/node)  |
+    #+BEAMER: \end{column}\begin{column}{0.45\textwidth}
+    | *backward graph* |                |
+    |------------------+----------------|
+    | wall time        | 3h17m          |
+    | throughput       | 988 K nodes/s  |
+    |                  | (1.01 µs/node) |
+    #+BEAMER: \end{column}\end{columns}
+*** Analysis efficiency (time) --- Edge lookup
+    random sample: 1 B nodes (8.3% of the entire graph)
+    #+BEAMER: \begin{columns}\begin{column}{0.45\textwidth}
+    | *forward graph* |                |
+    |-----------------+----------------|
+    | visited edges   | 13.6 B         |
+    | throughput      | 12.0 M edges/s |
+    |                 | (83 ns/edge)   |
+    #+BEAMER: \end{column}\begin{column}{0.45\textwidth}
+    | *backward graph* |                |
+    |------------------+----------------|
+    | visited edges    | 13.6 B         |
+    | throughput       | 9.45 M edges/s |
+    |                  | (106 ns/edge)  |
+    #+BEAMER: \end{column}\end{columns}
+***
+    :PROPERTIES:
+    :BEAMER_env: ignoreheading
+    :END:
+    Note how edge lookup time is close to DRAM random access time (50-60 ns).
+** Challenge #3: Efficient processing --- graph
+   Big graphs call for scale-out processing systems, e.g.:
+   #+BEGIN_EXPORT latex
+   \begin{thebibliography}{Foo Bar, 1969} \footnotesize
+   \bibitem{Pregel} Malewicz, Grzegorz, et al.
+   \newblock Pregel: a system for large-scale graph processing
+   \newblock ACM SIGMOD 2010
+   \bibitem{Chaos} Roy, Amitabha, et al.
+   \newblock Chaos: Scale-out graph processing from secondary storage
+   \newblock ACM SOSP 2015
+   \end{thebibliography}
+   #+END_EXPORT
+   which support very large graphs by exploiting topological characteristics
+   (e.g., small world)
+*** \vspace :B_ignoreheading:
+    :PROPERTIES:
+    :BEAMER_env: ignoreheading
+    :END:
+    #+beamer: \vspace{-1mm}
+*** Software Heritage graph characteristics
+    - scale-free, but not small world (?)
+    - connected component size distribution still unclear
+*** \vspace :B_ignoreheading:
+    :PROPERTIES:
+    :BEAMER_env: ignoreheading
+    :END:
+    #+beamer: \vspace{-1mm}
+*** Approach: topological characterization
+    - if amenable to scale-out → distribution
+    - if not → scale-up using graph compression
+
+** Challenge #3: Efficient processing --- file contents
+*** Code search: a natural need
+    Find code snippets for reuse, find reverse dependencies for maintenance,
+    impact analysis, etc.
+*** Approaches vary with their level of code "understanding"
+    - full-text search: treat source code as text
+    - symbol extraction (e.g., ctags)
+    - AST search (language-specific)
+** Challenge #3: Efficient processing --- file contents (cont.)
+   #+BEGIN_EXPORT latex
+   \vspace{-2mm}
+   \begin{thebibliography}{Foo Bar, 1969} \footnotesize
+   \bibitem{CodeSearch} Russ Cox
+   \newblock Regular Expression Matching with a Trigram Index or How Google Code Search Worked
+   \newblock 2012, https://swtch.com/~rsc/regexp/regexp4.html
+   \end{thebibliography}
+   \vspace{-1mm}
+   #+END_EXPORT
+*** Use regexes, Luke!
+    - build an inverted index of trigrams for all source code files
+    - compile each regex into a trigram query, to find /potential/ matches
+    - =grep= each potential match using map-reduce (a toy sketch follows)
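+*** Trigram search, sketched
+    A toy sketch of the trigram-index idea, simplified from Russ Cox's
+    write-up; the file names and contents are made up, and this is not
+    Debsources' actual implementation:
+    #+BEGIN_SRC python
+    from collections import defaultdict
+
+    def trigrams(text):
+        return {text[i:i + 3] for i in range(len(text) - 2)}
+
+    # 1. index: map each trigram to the set of files containing it
+    files = {"a.c": "int main(void)", "b.py": "def main():"}
+    index = defaultdict(set)
+    for name, content in files.items():
+        for t in trigrams(content):
+            index[t].add(name)
+
+    # 2. query: any match for the literal "main(" must contain all of its
+    #    trigrams; intersecting posting lists yields *candidates* only
+    query = "main("
+    candidates = set.intersection(*(index[t] for t in trigrams(query)))
+
+    # 3. verify: run the real match on each candidate (a substring test
+    #    here; grep fanned out via map-reduce at archive scale)
+    print(sorted(f for f in candidates if query in files[f]))
+    #+END_SRC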
+*** \vspace :B_ignoreheading:
+    :PROPERTIES:
+    :BEAMER_env: ignoreheading
+    :END:
+    #+beamer: \vspace{-1mm}
+*** Applications
+    - Google Code Search (now gone)
+    - Debsources (https://sources.debian.org)
+    - scales up to 1 B SLOCs (Debian development branch)
+*** \vspace :B_ignoreheading:
+    :PROPERTIES:
+    :BEAMER_env: ignoreheading
+    :END:
+    #+beamer: \vspace{-1mm}
+*** Challenge
+    Scale it up to Software Heritage, several orders of magnitude later.
+
+** Challenge #4: Experiment definition
+*** Problem
+    How would researchers define their experiments on the dataset?
+    - corpus selection (code? history? sampling? etc.)
+    - how will they run their tools in a fully-deduplicated world?
+    - job scheduling (a well-known HPC problem)
+*** Related work (at a smaller scale)
+    #+BEGIN_EXPORT latex
+    \vspace{-2mm}
+    \begin{thebibliography}{Foo Bar, 1969}
+    \bibitem{Dyer2015} Robert Dyer et al.
+    \newblock Boa: Ultra-large-scale software repository and source-code mining
+    \newblock ACM TOSEM 25.1 (2015): 7
+    \end{thebibliography}
+    #+END_EXPORT
+*** Approach
+    formal languages, DSLs, tool adapters/porting
+
+** Challenge #5: Machine Learning on code
+*** Big Code = Big Data + Source Code
+    Popular research trends: NLP on code / SBSE using ML, e.g.:
+    #+BEGIN_EXPORT latex
+    \begin{thebibliography}{Foo Bar, 1969} \footnotesize
+    \bibitem{Allamanis2018} Miltiadis Allamanis et al.
+    \newblock A survey of machine learning for big code and naturalness
+    \newblock ACM Computing Surveys
+    \bibitem{Gu2018} Xiaodong Gu, Hongyu Zhang, Sunghun Kim
+    \newblock Deep Code Search
+    \newblock ICSE 2018
+    \end{thebibliography}
+    #+END_EXPORT
+*** :B_ignoreheading:
+    :PROPERTIES:
+    :BEAMER_env: ignoreheading
+    :END:
+    Common assumptions: AST availability, "small" datasets (e.g., random
+    GitHub samples)
+
+    #+latex: \begin{example}[Universal programming language detection]
+    - programming language detection at the scale of Software Heritage
+    - no AST, only bytes (not characters!)
+    - supervised or unsupervised (the latter would be best for studying
+      language evolution)
+    (a minimal sketch of the supervised, byte-level variant follows)
+    #+latex: \end{example}
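+** Challenge #5: Machine Learning on code (cont.)
+*** Byte-level language detection, sketched
+    A minimal sketch of the supervised variant, assuming scikit-learn and a
+    tiny made-up training set (illustrative only; no archive-scale results
+    are implied):
+    #+BEGIN_SRC python
+    from sklearn.feature_extraction.text import HashingVectorizer
+    from sklearn.linear_model import SGDClassifier
+    from sklearn.pipeline import make_pipeline
+
+    train = [b"def main():\n    pass\n", b"int main(void) { return 0; }\n"]
+    labels = ["Python", "C"]
+
+    # Decoding as latin-1 maps every byte to exactly one character, so the
+    # char n-grams below are really byte n-grams: no AST, no real charset.
+    clf = make_pipeline(
+        HashingVectorizer(analyzer="char", ngram_range=(1, 3)),
+        SGDClassifier(),
+    )
+    clf.fit([b.decode("latin-1") for b in train], labels)
+    print(clf.predict(['printf("hi");'.encode().decode("latin-1")]))
+    #+END_SRC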