diff --git a/common/modules/dataset.org b/common/modules/dataset.org index 6bb3d90..32525a1 100644 --- a/common/modules/dataset.org +++ b/common/modules/dataset.org @@ -1,214 +1,227 @@ #+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) #+INCLUDE: "prelude.org" :minlevel 1 * Open Datasets :PROPERTIES: :CUSTOM_ID: main :END: ** Software Heritage Graph dataset :PROPERTIES: :CUSTOM_ID: graphdataset :END: #+BEAMER: \vspace{-1mm} **Use case:** large scale analyses of the most comprehensive corpus on the development history of free/open source software. *** #+BEGIN_EXPORT latex \vspace{-2mm} \begin{thebibliography}{Foo Bar, 1969} \small \bibitem{Pietri2019} Antoine Pietri, Diomidis Spinellis, Stefano Zacchiroli\newblock The Software Heritage Graph Dataset: Public software development under one roof\newblock MSR 2019: 16th Intl. Conf. on Mining Software Repositories. IEEE\newblock preprint: \url{http://deb.li/swhmsr19} \end{thebibliography} #+END_EXPORT #+BEAMER: \vspace{-1mm} *** Dataset - Relational representation of the full graph as a set of tables - Available as open data: https://doi.org/10.5281/zenodo.2583978 - Chosen as subject for the *MSR 2020 Mining Challenge* #+BEAMER: \vspace{-1mm} *** Formats - Local use: PostgreSQL dumps, or Apache Parquet files (~1 TiB each) - Live usage: Amazon Athena (SQL-queriable), Azure Data Lake (soon) ** Sample query --- most frequent first commit words :PROPERTIES: :CUSTOM_ID: graphquery1 :END: *** #+begin_src sql SELECT COUNT(*) AS c, word FROM ( SELECT LOWER(REGEXP_EXTRACT(FROM_UTF8( message), '^\w+')) AS word FROM revision) WHERE word != '' GROUP BY word ORDER BY COUNT(*) DESC LIMIT 5; #+end_src #+BEAMER: \pause *** | *Count* | *Word* | |------------+--------| | 71 338 310 | update | | 64 980 346 | merge | | 56 854 372 | add | | 44 971 954 | added | | 33 222 056 | fix | ** Sample query --- fork and merge arities :PROPERTIES: :CUSTOM_ID: graphquery2 :END: *** Fork arity :PROPERTIES: :BEAMER_env: block :BEAMER_COL: 0.5 :END: i.e., how often is a commit built upon? #+BEAMER: \scriptsize #+begin_src sql SELECT fork_deg, count(*) FROM ( SELECT id, count(*) AS fork_deg FROM revision_history GROUP BY id) t GROUP BY fork_deg ORDER BY fork_deg; #+end_src #+BEAMER: \includegraphics[width=\linewidth]{fork-degree} #+BEAMER: \pause *** Merge arity :PROPERTIES: :BEAMER_env: block :BEAMER_COL: 0.5 :END: i.e., how large are merges?
#+BEAMER: \scriptsize #+begin_src sql SELECT merge_deg, COUNT(*) FROM ( SELECT parent_id, COUNT(*) AS merge_deg FROM revision_history GROUP BY parent_id) t GROUP BY merge_deg ORDER BY merge_deg; #+end_src #+BEAMER: \includegraphics[width=\linewidth]{merge-degree} * Other queries :PROPERTIES: :CUSTOM_ID: morequery :END: ** Sample query --- ratio of commits performed during weekends :PROPERTIES: :CUSTOM_ID: weekendsrc :END: #+BEGIN_SRC sql WITH revision_date AS (SELECT FROM_UNIXTIME(date / 1000000) AS date FROM revision) SELECT yearly_rev.year AS year, CAST(yearly_weekend_rev.number AS DOUBLE) / yearly_rev.number * 100.0 AS weekend_pc FROM (SELECT YEAR(date) AS year, COUNT(*) AS number FROM revision_date WHERE YEAR(date) BETWEEN 1971 AND 2018 GROUP BY YEAR(date) ) AS yearly_rev JOIN (SELECT YEAR(date) AS year, COUNT(*) AS number FROM revision_date WHERE DAY_OF_WEEK(date) >= 6 AND YEAR(date) BETWEEN 1971 AND 2018 GROUP BY YEAR(date) ) AS yearly_weekend_rev ON yearly_rev.year = yearly_weekend_rev.year ORDER BY year DESC; #+END_SRC ** Sample query --- ratio of commits performed during weekends (cont.) :PROPERTIES: :CUSTOM_ID: weekendout :END: | *Year* | *Weekend* | *Total* | *Weekend percentage* | |--------+-----------+-----------+----------------------| | 2018 | 15130065 | 78539158 | 19.26 | | 2017 | 33776451 | 168074276 | 20.09 | | 2016 | 43890325 | 209442130 | 20.95 | | 2015 | 35781159 | 166884920 | 21.44 | | 2014 | 24591048 | 122341275 | 20.10 | | 2013 | 17792778 | 88524430 | 20.09 | | 2012 | 12794430 | 64516008 | 19.83 | | 2011 | 9765190 | 48479321 | 20.14 | | 2010 | 7766348 | 38561515 | 20.14 | | 2009 | 6352253 | 31053219 | 20.45 | | 2008 | 4568373 | 22474882 | 20.32 | | 2007 | 3318881 | 16289632 | 20.37 | | 2006 | 2597142 | 12224905 | 21.24 | | 2005 | 2086697 | 9603804 | 21.72 | | 2004 | 1752400 | 7948104 | 22.04 | | 2003 | 1426033 | 6941593 | 20.54 | | 2002 | 1159294 | 5378538 | 21.55 | | 2001 | 849905 | 4098587 | 20.73 | | 2000 | 2091770 | 4338842 | 48.21 | | 1999 | 438540 | 2026906 | 21.63 | | 1998 | 311888 | 1430567 | 21.80 | | 1997 | 263995 | 1129249 | 23.37 | | 1996 | 192543 | 795827 | 24.19 | | 1995 | 176270 | 670417 | 26.29 | | 1994 | 137811 | 581563 | 23.69 | | 1993 | 169767 | 697343 | 24.34 | | 1992 | 74923 | 422068 | 17.75 | | 1991 | 92782 | 484547 | 19.14 | | 1990 | 113201 | 340489 | 33.24 | | 1989 | 31742 | 182325 | 17.40 | | 1988 | 44983 | 206275 | 21.80 | | 1987 | 27892 | 146157 | 19.08 | | 1986 | 54200 | 237330 | 22.83 | | 1985 | 75595 | 306564 | 24.65 | | 1984 | 26391 | 95506 | 27.63 | | 1983 | 89776 | 370687 | 24.21 | | 1982 | 51524 | 191933 | 26.84 | | 1981 | 32995 | 123618 | 26.69 | | 1980 | 31832 | 133733 | 23.80 | | 1979 | 20943 | 175164 | 11.95 | | 1978 | 3773 | 33677 | 11.20 | | 1977 | 4783 | 19376 | 24.68 | | 1976 | 1907 | 7048 | 27.05 | | 1975 | 2089 | 26579 | 7.85 | | 1974 | 2095 | 14290 | 14.66 | | 1973 | 2988 | 15580 | 19.17 | | 1972 | 1755 | 6552 | 26.78 | | 1971 | 1723 | 6125 | 28.13 | ** Sample query --- average size of the most popular file types :PROPERTIES: :CUSTOM_ID: popfilesrc :END: #+BEGIN_SRC sql SELECT suffix, ROUND(COUNT(*) * 100 / 1e6) AS Million_files, ROUND(AVG(length) / 1024) AS Average_k_length FROM (SELECT length, suffix FROM -- File length in joinable form (SELECT TO_BASE64(sha1_git) AS sha1_git64, length FROM content ) AS content_length JOIN -- Sample of files with popular suffixes (SELECT target64, file_suffix_sample.suffix AS suffix FROM -- Popular suffixes (SELECT suffix FROM ( SELECT REGEXP_EXTRACT(FROM_UTF8(name), '\.[^.]+$') AS suffix FROM
directory_entry_file) AS file_suffix GROUP BY suffix ORDER BY COUNT(*) DESC LIMIT 20 ) AS pop_suffix JOIN -- Sample of files and suffixes (SELECT TO_BASE64(target) AS target64, REGEXP_EXTRACT(FROM_UTF8(name), '\.[^.]+$') AS suffix FROM directory_entry_file TABLESAMPLE BERNOULLI(1)) AS file_suffix_sample ON file_suffix_sample.suffix = pop_suffix.suffix) AS pop_suffix_sample ON pop_suffix_sample.target64 = content_length.sha1_git64) GROUP BY suffix ORDER BY AVG(length) DESC; #+END_SRC +* Discussion + :PROPERTIES: + :CUSTOM_ID: discussion + :END: +** Discussion + - one /can/ query such a corpus SQL-style + - but relational representation shows its limits at this scale + - ...at least as deployed on commercial SQL offerings such as Athena + - note: (naive) sharding is ineffective, due to the pseudo-random + distribution of node identifiers + - experiments with Google BigQuery are ongoing + - (we broke it at the first import attempt..., due to very large arrays in + directory entry tables) diff --git a/common/modules/foss-commons.org b/common/modules/foss-commons.org index 3ca554b..002f37a 100644 --- a/common/modules/foss-commons.org +++ b/common/modules/foss-commons.org @@ -1,67 +1,85 @@ #+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) #+INCLUDE: "prelude.org" :minlevel 1 * The Free Software Commons :PROPERTIES: :CUSTOM_ID: main :END: ** Free Software -*** Definition (Free Software) + :PROPERTIES: + :CUSTOM_ID: freeswdef + :END: +*** [#B] Free Software + :PROPERTIES: + :BEAMER_env: definition + :END: A program is *free software* if the program's users have the four /essential freedoms/: - Freedom #0, to *run* the program, for any purpose - Freedom #1, to *study* how the program works, and change it - Freedom #2, to *redistribute* copies - Freedom #3, to *improve* the program, and *release* improvements *** :B_ignoreheading: :PROPERTIES: :BEAMER_env: ignoreheading :BEAMER_ref: :END: Free Software also comes with *obligations*, which vary according to the license: BSD, GPL, Apache, AGPL, . . . ** Why bother? + :PROPERTIES: + :CUSTOM_ID: whybother + :END: Why, as computer scientists/teachers/students, should we bother about Free/Open Source Software (FOSS)? #+BEAMER: \pause \vfill FOSS has /radically changed/ the way software is: - developed - tested - proven - conceived - marketed - sold - maintained - taught - deployed - ... ** The Commons and FOSS :PROPERTIES: :CUSTOM_ID: commonsdef :END: *** Definition (Commons) + :PROPERTIES: + :CUSTOM_ID: commonsdef1 + :END: The *commons* is the cultural and natural resources accessible to all members of a society, including natural materials such as air, water, and a habitable earth. These resources are held in common, not owned privately. #+BEAMER: {\tiny\url{https://en.wikipedia.org/wiki/Commons}} *** Definition (Software Commons) + :PROPERTIES: + :CUSTOM_ID: commonsdef2 + :END: The *software commons* consists of all computer software which is available at little or no cost and which can be altered and reused with few restrictions. Thus /all open source software and all free software are part of the [software] commons/. [...] #+BEAMER: {\tiny\url{https://en.wikipedia.org/wiki/Software_Commons}} ** But /where/ is this commons? 
+ :PROPERTIES: + :CUSTOM_ID: wherefoss + :END: #+latex: \begin{flushleft} #+ATTR_LATEX: :width \extblockscale{.5\linewidth} file:myriadsources.png #+latex: \end{flushleft} #+BEAMER: \pause *** Fashion victims - many disparate development platforms - a myriad of places where distribution may happen - projects tend to migrate from one place to another over time #+BEAMER: \pause *** One place... :B_block: :PROPERTIES: :BEAMER_env: block :END: \hfill ... where can we find, track and search /all/ source code? diff --git a/common/modules/graph-compression.org b/common/modules/graph-compression.org index c59fd5e..8c2d452 100644 --- a/common/modules/graph-compression.org +++ b/common/modules/graph-compression.org @@ -1,279 +1,306 @@ #+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) #+INCLUDE: "prelude.org" :minlevel 1 # Depends: \usepackage{pdfpages} * Graph Compression :PROPERTIES: :CUSTOM_ID: main :END: ** Graph compression on the Software Heritage archive + :PROPERTIES: + :CUSTOM_ID: intro + :END: *** #+BEGIN_EXPORT latex \vspace{-3mm} \begin{thebibliography}{Foo Bar, 1969} \bibitem{Boldi2020} Paolo Boldi, Antoine Pietri, Sebastiano Vigna, Stefano Zacchiroli \newblock Ultra-Large-Scale Repository Analysis via Graph Compression \newblock SANER 2020, 27th Intl. Conf. on Software Analysis, Evolution and Reengineering. IEEE \end{thebibliography} #+END_EXPORT *** Research question Is it possible to efficiently perform software development history analyses at ultra large scale (= the scale of the Software Heritage archive or more), on a single, relatively cheap machine? *** Idea Apply state-of-the-art graph compression techniques from the field of Web graph / social network analysis. ** Background --- (Web) graph compression + :PROPERTIES: + :CUSTOM_ID: background1 + :END: *** The graph of the Web :PROPERTIES: :BEAMER_env: definition :END: Directed graph that has Web pages as nodes and hyperlinks between them as edges. *** Properties (1) - **Locality:** pages link to pages whose URLs are lexicographically similar. URLs share long common prefixes. → use *D-gap compression* *** Adjacency lists :PROPERTIES: :BEAMER_env: block :BEAMER_col: 0.51 :END: #+BEAMER: \scriptsize | *Node* | *Outdegree* | *Successors* | |--------+-------------+--------------------------------------| | ... | ... | ... | | 15 | 11 | 13,15,16,17,18,19,23,24,203,315,1034 | | 16 | 10 | 15,16,17,22,23,24,315,316,317,3041 | | 17 | 0 | | | 18 | 5 | 13,15,16,17,50 | | ... | ... | ... | *** D-gapped adjacency lists :PROPERTIES: :BEAMER_env: block :BEAMER_col: 0.48 :END: #+BEAMER: \scriptsize | *Node* | *Outdegree* | *Successors* | |--------+-------------+-----------------------------| | ... | ... | ... | | 15 | 11 | 3,1,0,0,0,0,3,0,178,111,718 | | 16 | 10 | 1,0,0,4,0,0,290,0,0,2723 | | 17 | 0 | | | 18 | 5 | 9,1,0,0,32 | | ... | ... | ... | ** Background --- (Web) graph compression (cont.) + :PROPERTIES: + :CUSTOM_ID: background2 + :END: *** The graph of the Web :PROPERTIES: :BEAMER_env: definition :END: Directed graph that has Web pages as nodes and hyperlinks between them as edges. *** Properties (2) - **Similarity:** pages that are close together in lexicographic order tend to have many common successors. → use *reference compression* *** Adjacency lists :PROPERTIES: :BEAMER_env: block :BEAMER_col: 0.47 :END: #+BEAMER: \scriptsize | *Node* | *Outd.* | *Successors* | |--------+---------+--------------------------------------| | ... | ... | ...
| | 15 | 11 | 13,15,16,17,18,19,23,24,203,315,1034 | | 16 | 10 | 15,16,17,22,23,24,315,316,317,3041 | | 17 | 0 | | | 18 | 5 | 13,15,16,17,50 | | ... | ... | ... | *** Copy lists :PROPERTIES: :BEAMER_env: block :BEAMER_col: 0.60 :END: #+BEAMER: \scriptsize | *Node* | *Ref.* | *Copy list* | *Extra nodes* | |--------+--------+-------------+--------------------------------------| | ... | ... | ... | ... | | 15 | 0 | | 13,15,16,17,18,19,23,24,203,315,1034 | | 16 | 1 | 01110011010 | 22,316,317,3041 | | 17 | | | | | 18 | 3 | 11110000000 | 50 | | ... | ... | ... | | ** Background --- Web graph compression (OLD) :noexport: Borrowing (great!) slides from: #+BEGIN_EXPORT latex \begin{thebibliography}{} \bibitem{Pibiri2018} Giulio Ermanno Pibiri \newblock Effective Web Graph Representations, 2018 \newblock \url{http://pages.di.unipi.it/pibiri/slides/webgraphs\_compression.pdf} \end{thebibliography} #+END_EXPORT ** Background -- Web graph compression (imported slides) (OLD) :noexport: :PROPERTIES: :BEAMER_env: ignoreheading :END: #+BEGIN_EXPORT latex { \setbeamercolor{background canvas}{bg=} \setbeamertemplate{background}{} \includepdf[pages={4,11,12,13}]{webgraphs_compression.pdf} \addtocounter{framenumber}{4} } #+END_EXPORT ** Corpus + :PROPERTIES: + :CUSTOM_ID: corpus + :END: *** Nodes :PROPERTIES: :BEAMER_env: block :BEAMER_col: 0.5 :END: | *Node type* | *N. of nodes* | |-------------+---------------| | origins | 88 M | | snapshots | 57 M | | releases | 9.9 M | | revisions | 1.1 B | | directories | 4.9 B | | contents | 5.5 B | |-------------+---------------| | Total nodes | 12 B | *** Edges :PROPERTIES: :BEAMER_env: block :BEAMER_col: 0.5 :END: | *Edge type* | *N. of edges* | |-----------------------+---------------| | origin → snapshot | 195 M | | snapshot → revision | 616 M | | snapshot → release | 215 M | | release → revision | 9.9 M | | revision → revision | 1.2 B | | revision → directory | 1.1 B | | directory → directory | 48 B | | directory → revision | 482 M | | directory → content | 112 B | |-----------------------+---------------| | Total edges | 165 B | *** :PROPERTIES: :BEAMER_env: ignoreheading :END: Archive snapshot 2018-09-25, from the Software Heritage graph dataset.\\ Growth rate: exponential, doubling every 22-30 months (Rousseau, Di Cosmo, Zacchiroli; ESE 2020, to appear).
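+** Corpus counts via the graph dataset (sketch) :noexport:
+   The node counts above come from the Software Heritage graph dataset
+   introduced in the dataset module. As a rough illustration only, counts of
+   this kind can be recomputed with one query per node type on Athena; note
+   that =revision= and =content= are the table names used in the dataset
+   module's queries, while other table names (e.g., =release=) are
+   assumptions about the dataset schema.
+   #+begin_src sql
+   -- per-type node counts, one table per node type (table names partly assumed)
+   SELECT 'revisions' AS node_type, COUNT(*) AS nodes FROM revision
+   UNION ALL
+   SELECT 'releases'  AS node_type, COUNT(*) AS nodes FROM release
+   UNION ALL
+   SELECT 'contents'  AS node_type, COUNT(*) AS nodes FROM content;
+   #+end_src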
** Graph compression pipeline + :PROPERTIES: + :CUSTOM_ID: pipeline + :END: #+BEAMER: \hspace*{-0.1\linewidth} \includegraphics[width=1.2\linewidth]{compression/compression-steps} #+BEAMER: \vspace{-1cm} *** - *MPH*: minimal perfect hash, mapping Merkle IDs to 0..N-1 integers - *BV* compress: Boldi-Vigna compression (based on MPH order) - *BFS*: breadth-first visit to renumber - *Permute*: update BV compression according to BFS order *** (Re)establishing locality - key for good compression is a node ordering that ensures locality and similarity - which is very much /not/ the case with Merkle IDs, ...but is the case /again/ after BFS reordering ** Compression experiment + :PROPERTIES: + :CUSTOM_ID: compexp + :END: | *Step* | *Wall time* (hours) | |-------------+---------------------| | MPH | 2 | | BV Compress | 84 | | BFS | 19 | | Permute | 18 | | Transpose | 15 | |-------------+---------------------| | Total | 138 (6 days) | - server equipped with 24 CPUs and 750 GB of RAM - RAM mostly used as I/O cache for the BFS step - /minimum/ memory requirements are close to the RAM needed to load the final compressed graph in memory ** Compression efficiency (space) + :PROPERTIES: + :CUSTOM_ID: spaceefficiency + :END: *** :PROPERTIES: :BEAMER_env: block :BEAMER_col: 0.4 :END: | *Forward graph* | | | total size | 91 GiB | | bits per edge | 4.91 | | compression ratio | 15.8% | *** :PROPERTIES: :BEAMER_env: block :BEAMER_col: 0.4 :END: | *Backward graph* | | | total size | 83 GiB | | bits per edge | 4.49 | | compression ratio | 14.4% | *** Operating cost The structure of a full bidirectional archive graph fits in less than 200 GiB of RAM, for a hardware cost of ~300 USD. ** Compression efficiency (time) + :PROPERTIES: + :CUSTOM_ID: timeefficiency + :END: *** Benchmark --- Full BFS visit (single thread) #+BEAMER: \begin{columns}\begin{column}{0.45\textwidth} | *Forward graph* | | |------------------+----------------| | wall time | 1h48m | | throughput | 1.81 M nodes/s | | | (553 ns/node) | #+BEAMER: \end{column}\begin{column}{0.45\textwidth} | *Backward graph* | | |------------------+----------------| | wall time | 3h17m | | throughput | 988 K nodes/s | | | (1.01 µs/node) | #+BEAMER: \end{column}\end{columns} *** Benchmark --- Edge lookup random sample: 1 B nodes (8.3% of entire graph); then enumeration of all successors #+BEAMER: \begin{columns}\begin{column}{0.45\textwidth} | *Forward graph* | | |------------------+----------------| | visited edges | 13.6 B | | throughput | 12.0 M edges/s | | | (83 ns/edge) | #+BEAMER: \end{column}\begin{column}{0.45\textwidth} | *Backward graph* | | |------------------+----------------| | visited edges | 13.6 B | | throughput | 9.45 M edges/s | | | (106 ns/edge) | #+BEAMER: \end{column}\end{columns} *** :PROPERTIES: :BEAMER_env: ignoreheading :END: Note how edge lookup time is close to DRAM random access time (50-60 ns). ** Discussion + :PROPERTIES: + :CUSTOM_ID: discussion + :END: *** Incrementality compression is *not incremental*, due to the use of contiguous integer ranges - but the graph is append-only, so... - ...based on expected graph growth rate it should be possible to pre-allocate enough free space in the integer ranges to support *amortized incrementality* (future work) #+BEAMER: \pause *** In-memory v.
on-disk the compressed in-memory graph structure has *no attributes* - usual design is to exploit the 0..N-1 integer ranges to *memory map node attributes* to disk for efficient access - works well for queries that do graph traversal first and "join" node attributes last; ping-pong between the two is expensive - edge attributes are more problematic diff --git a/talks-public/2020-07-03-soheal/2020-07-03-soheal.org b/talks-public/2020-07-03-soheal/2020-07-03-soheal.org new file mode 100644 index 0000000..01d2e84 --- /dev/null +++ b/talks-public/2020-07-03-soheal/2020-07-03-soheal.org @@ -0,0 +1,237 @@ +#+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) +#+TITLE: Global Software Health +#+SUBTITLE: a Unified View of how our Software Commons is Doing +#+BEAMER_HEADER: \date[3 July 2020, SoHeal]{3 July 2020\\SoHeal 2020\\ (via conf call)\\[-2ex]} +#+AUTHOR: Stefano Zacchiroli +#+DATE: 3 July 2020 +#+EMAIL: zack@upsilon.cc + +#+INCLUDE: "../../common/modules/prelude-toc.org" :minlevel 1 +#+INCLUDE: "../../common/modules/169.org" +#+BEAMER_HEADER: \institute[UParis \& Inria]{Université de Paris \& Inria --- {\tt zack@upsilon.cc, @zacchiro}} +#+BEAMER_HEADER: \author{Stefano Zacchiroli} + +# Required by graph-compression.org module +#+LATEX_HEADER_EXTRA: \usepackage{pdfpages} + +# Syntax highlighting setup +#+LATEX_HEADER_EXTRA: \usepackage{minted} +#+LaTeX_HEADER_EXTRA: \usemintedstyle{tango} +#+LaTeX_HEADER_EXTRA: \newminted{sql}{fontsize=\scriptsize} +#+name: setup-minted +#+begin_src emacs-lisp :exports results :results silent + (setq org-latex-listings 'minted) + (setq org-latex-minted-options + '(("fontsize" "\\scriptsize"))) + (setq org-latex-to-pdf-process + '("pdflatex -shell-escape -interaction nonstopmode -output-directory %o %f" + "pdflatex -shell-escape -interaction nonstopmode -output-directory %o %f" + "pdflatex -shell-escape -interaction nonstopmode -output-directory %o %f")) +#+end_src # End syntax highlighting setup + +* Software Health +** Software Health + #+BEAMER: \pause +*** Software Health + :PROPERTIES: + :BEAMER_env: definition + :END: + One of the hardest research fields to search the Web for. +*** Proof (empirical, trivial) + :PROPERTIES: + :BEAMER_env: proof + :END: + Exhibit: https://www.google.com/search?q=software+health +*** :B_ignoreheading: + :PROPERTIES: + :BEAMER_env: ignoreheading + :END: + + #+BEAMER: \vfill \pause + More seriously… +*** + The SoHeal community has pioneered the exploration of the notion of + *Software Health*. + + By now we have evidence of interest in several /dimensions/ of the notion, + we have /tools & techniques/ that are routinely used to explore them, and + we have been doing that at various /scopes/. + +** Software Health --- dimensions +*** /What/ are we looking at + Several *dimensions* have been explored thus far, e.g.: + - software evolution and "liveliness" + - quality (cf. SoHeal 2019 keynote by Jesus M. Gonzalez-Barahona) + - community + - both static structure + - and dynamics over time + (non-exhaustive list) + +** Software Health --- tools & techniques +*** /How/ we are exploring the topic + - classic software evolution & MSR techniques + - quantitative analysis (stats!) + - qualitative analysis + - e.g., interviews, ethnography, Delphi method + - community metrics & their standardization (cf. CHAOSS) + - raising awareness in relevant communities: FOSS + scholars + the SoHeal workshop series!
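+*** Quantitative analysis, concretely (sketch) :noexport:
+    A minimal example of the kind of quantitative, MSR-style analysis meant
+    above: yearly commit counts as a crude evolution/liveliness indicator,
+    written against the =revision= table of the Software Heritage graph
+    dataset used later in this talk (the microsecond =date= encoding follows
+    the dataset module's queries; per-project or per-community breakdowns
+    would need joins that are omitted here).
+    #+begin_src sql
+    -- commits per year, a crude liveliness indicator over the whole corpus
+    SELECT YEAR(FROM_UNIXTIME(date / 1000000)) AS year,
+           COUNT(*) AS commits
+    FROM revision
+    GROUP BY YEAR(FROM_UNIXTIME(date / 1000000))
+    ORDER BY year DESC;
+    #+end_src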
+ +** Software Health --- scope +*** How /far/ are we looking + 1. a single project + 2. a set of inter-dependent projects + - e.g., a specific framework with plugins, a software stack, etc. + - also a community of contributors working on said projects + 3. an ecosystem + - e.g., Debian, PyPI, NPM, etc. + #+BEAMER: \pause +*** Going further + - can we go further in terms of software health scope? how far? + - is there a meaningful notion of *"global software health"*? + - if there is, which *tools* can we use to explore global software + health? + - if they exist and are practical, what is the *current status* of global + software health? + +* Software Commons +** Free Software + (I know you all know this, but bear with me. I pinky promise it's gonna be + useful!) +*** Free Software + :PROPERTIES: + :BEAMER_env: definition + :END: + A program is *free software* if the program's users have the four + /essential freedoms/: + - Freedom #0, to *run* the program, for any purpose + - Freedom #1, to *study* how the program works, and change it + - Freedom #2, to *redistribute* copies + - Freedom #3, to *improve* the program, and *release* improvements +*** ChangeLog :B_ignoreheading: + :PROPERTIES: + :BEAMER_env: ignoreheading + :END: + by the Free Software Foundation\\ + ChangeLog: 2-freedom version: 1986; 3-freedom: 1990; 4-freedom: early 90s +** Software Commons + #+INCLUDE: "../../common/modules/foss-commons.org::#commonsdef1" + #+BEAMER: \pause + #+INCLUDE: "../../common/modules/foss-commons.org::#commonsdef2" +** Global Software Health +*** Proposition #1 + The full extent of our shared software commons is the ultimate scope for + software health.\\ + #+BEAMER: \centering \Large + global software health = software health + software commons +*** Global Software Health (tentative) + :PROPERTIES: + :BEAMER_env: definition + :END: + The investigation of *software health* at the scale of the entire *software + commons*. + #+BEAMER: \pause \vfill +*** Proposition #2 + As a starting point for global software health analysis, we need the + equivalent of ancient world libraries, i.e., *great libraries of software + artifacts* that encompass the software commons as much as possible.
+*** Great library options :B_ignoreheading: + :PROPERTIES: + :BEAMER_env: ignoreheading + :END: + - GHTorrent + - World of Code + - Software Heritage (← my focus for the rest of this talk) + +* Software Heritage +** Software Heritage in a nutshell \hfill [[https://softwareheritage.org][softwareheritage.org]] + #+INCLUDE: "../../common/modules/swh-goals-oneslide-vertical.org::#goals" :only-contents t :minlevel 3 +** An international, non profit initiative\hfill built for the long term + :PROPERTIES: + :CUSTOM_ID: support + :END: +*** Sharing the vision :B_block: + :PROPERTIES: + :CUSTOM_ID: endorsement + :BEAMER_COL: .5 + :BEAMER_env: block + :END: + #+LATEX: \begin{center}{\includegraphics[width=\extblockscale{.4\linewidth}]{unesco_logo_en_285}}\end{center} + #+LATEX: \vspace{-0.8cm} + #+LATEX: \begin{center}\vskip 1em \includegraphics[width=\extblockscale{1.4\linewidth}]{support.pdf}\end{center} + #+latex:\mbox{}~~~~~~~\tiny\url{www.softwareheritage.org/support/testimonials} +*** Donors, members, sponsors :B_block: + :PROPERTIES: + :CUSTOM_ID: sponsors + :BEAMER_COL: .5 + :BEAMER_env: block + :END: + #+LATEX: \begin{center}\includegraphics[width=\extblockscale{.4\linewidth}]{inria-logo-new}\end{center} + #+LATEX: \begin{center} + #+LATEX: \colorbox{white}{\includegraphics[width=\extblockscale{1.4\linewidth}]{sponsors.pdf}} + #+latex:\mbox{}~~~~~~~\tiny\url{www.softwareheritage.org/support/sponsors} + #+LATEX: \end{center} +** Status :B_ignoreheading: + :PROPERTIES: + :BEAMER_env: ignoreheading + :END: +#+INCLUDE: "../../common/modules/status-extended.org::#archivinggoals" :minlevel 2 +#+INCLUDE: "../../common/modules/status-extended.org::#architecture" :minlevel 2 :only-contents t +#+INCLUDE: "../../common/modules/status-extended.org::#merkletree" :minlevel 2 +#+INCLUDE: "../../common/modules/status-extended.org::#datamodel" :minlevel 2 :only-contents t +#+INCLUDE: "../../common/modules/status-extended.org::#dagdetailsmall" :minlevel 2 :only-contents t +#+INCLUDE: "../../common/modules/status-extended.org::#archive" :minlevel 2 +* Exploring the Software Commons +** Early days +*** + - We are in the *early days* of full-scale explorations of the entire + software commons, for both /software health/ and other research or + practical needs. + - We are also not yet *capable* of performing analyses at such scale, due + to a lack of /resources/ (including time!) and/or appropriate /tools/ and + /techniques/. 
+*** :B_ignoreheading: + :PROPERTIES: + :BEAMER_env: ignoreheading + :END: + In the following I'll review some related work: + - a large-scale *dataset* encompassing a decent chunk of the software commons + - a *technique* to exploit such a dataset /on a budget/ + - a long-term exploration of the *growth rate* of the software commons + + #+INCLUDE: "../../common/modules/dataset.org::#main" :minlevel 2 :only-contents t + #+INCLUDE: "../../common/modules/dataset.org::#morequery" :minlevel 2 :only-contents t + #+INCLUDE: "../../common/modules/dataset.org::#discussion" :minlevel 2 :only-contents t + + #+INCLUDE: "../../common/modules/graph-compression.org::#intro" :minlevel 2 + #+INCLUDE: "../../common/modules/graph-compression.org::#background1" :minlevel 2 + #+INCLUDE: "../../common/modules/graph-compression.org::#background2" :minlevel 2 + #+INCLUDE: "../../common/modules/graph-compression.org::#pipeline" :minlevel 2 + #+INCLUDE: "../../common/modules/graph-compression.org::#compexp" :minlevel 2 + #+INCLUDE: "../../common/modules/graph-compression.org::#spaceefficiency" :minlevel 2 + #+INCLUDE: "../../common/modules/graph-compression.org::#timeefficiency" :minlevel 2 + #+INCLUDE: "../../common/modules/graph-compression.org::#discussion" :minlevel 2 + + #+INCLUDE: "this/original-content-growth.org::#oneslide" :minlevel 2 + +* Conclusion +** Wrapping up +*** + - the notion of *software health* is shaping up nicely, with several + dimensions to it and more and more established tools and techniques + - *global software health*, i.e., the study of software health at the scale + of the full software commons, is an open challenge that requires + exhaustive code libraries, tools, and techniques + - *Software Heritage* is one such library, containing a significant span of + the software commons; tools and techniques to analyze it are now badly + needed + - meanwhile, the *software commons* seems to be doing well in terms of + *growth*; let's dig into it further to assess its health! +*** Contacts + [[https://upsilon.cc/~zack/][Stefano Zacchiroli]] / [[mailto:zack@upsilon.cc][zack@upsilon.cc]] / [[https://twitter.com/zacchiro][@zacchiro]] / [[https://mastodon.xyz/@zacchiro][@zacchiro@mastodon.xyz]] + +* Appendix :B_appendix: + :PROPERTIES: + :BEAMER_env: appendix + :END: diff --git a/talks-public/2020-07-03-soheal/Makefile b/talks-public/2020-07-03-soheal/Makefile new file mode 100644 index 0000000..68fbee7 --- /dev/null +++ b/talks-public/2020-07-03-soheal/Makefile @@ -0,0 +1 @@ +include ../Makefile.slides diff --git a/talks-public/2020-07-03-soheal/this/original-content-growth.org b/talks-public/2020-07-03-soheal/this/original-content-growth.org new file mode 100644 index 0000000..58c7cac --- /dev/null +++ b/talks-public/2020-07-03-soheal/this/original-content-growth.org @@ -0,0 +1,33 @@ +* Original content growth +** Original content growth + :PROPERTIES: + :CUSTOM_ID: oneslide + :END: + #+BEAMER: \vspace{-1mm} \includegraphics[width=\textwidth]{revision_content_growth_wide} +*** strut :B_ignoreheading: + :PROPERTIES: + :BEAMER_env: ignoreheading + :END: + #+BEAMER: \vspace{-5mm} +*** + - *50 years of software commons* history.
50 M projects, 4 B blobs, 1 B + commits (Software Heritage snapshot, Feb 2018) + - */original/ artifacts* explored over time, after deduplication + - evidence of *exponential growth*: original commits double every 30 + months; blobs every 22 months; original blobs /per commit/ doubles every + 7 years +*** strut :B_ignoreheading: + :PROPERTIES: + :BEAMER_env: ignoreheading + :END: + #+BEAMER: \vspace{-1mm} +*** + #+BEGIN_EXPORT latex + \vspace{-3mm} + \begin{thebibliography}{Foo Bar, 1969} + \footnotesize + \bibitem{Rousseau2020} Roberto Di Cosmo, Guillaume Rousseau, Stefano Zacchiroli + \newblock Software Provenance Tracking at the Scale of Public Source Code + \newblock Empirical Software Engineering, 2020 + \end{thebibliography} + #+END_EXPORT
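+*** Doubling times as yearly growth rates (back of the envelope) :noexport:
+    A small worked conversion of the doubling times quoted above, assuming
+    steady exponential growth (the assumption is ours, not a result of the
+    paper cited above): "doubles every $d$ months" corresponds to a yearly
+    growth factor of $2^{12/d}$.
+    #+BEGIN_EXPORT latex
+    \[
+      \text{commits: } 2^{12/30} \approx 1.32 \;(\approx +32\%/\text{year})
+      \qquad
+      \text{blobs: } 2^{12/22} \approx 1.46 \;(\approx +46\%/\text{year})
+    \]
+    #+END_EXPORT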