diff --git a/common/images/fork-degree.pdf b/common/images/fork-degree.pdf new file mode 100644 index 0000000..bdd7037 Binary files /dev/null and b/common/images/fork-degree.pdf differ diff --git a/common/images/merge-degree.pdf b/common/images/merge-degree.pdf new file mode 100644 index 0000000..506f0f5 Binary files /dev/null and b/common/images/merge-degree.pdf differ diff --git a/common/modules/dataset.org b/common/modules/dataset.org index 3cea4c9..20d0d21 100644 --- a/common/modules/dataset.org +++ b/common/modules/dataset.org @@ -1,42 +1,91 @@ #+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) #+INCLUDE: "prelude.org" :minlevel 1 * Open Datasets :PROPERTIES: :CUSTOM_ID: main :END: ** Software Heritage Graph dataset :PROPERTIES: :CUSTOM_ID: graphdataset :END: - #+BEAMER: \vspace{-1mm} **Use case:** large scale analyses of the most comprehensive corpus on the development history of free/open source software. *** Dataset - Relational representation of the full graph as a set of tables - Available as open data: https://doi.org/10.5281/zenodo.2583978 #+BEAMER: \vspace{-1mm} - *** Formats - Local use: PostgreSQL dumps, or Apache Parquet files (~1 TiB each) - Live usage: Amazon Athena (SQL-queriable) #+BEAMER: \vspace{-1mm} - *** References and sample queries #+BEGIN_EXPORT latex \vspace{-2mm} \footnotesize \begin{thebibliography}{Foo Bar, 1969} - \bibitem{Pietri2019} Antoine Pietri, Diomidis Spinellis, Stefano Zacchiroli\newblock The Software Heritage Graph Dataset: Public software development under one roof\newblock MSR 2019: Intl. Conf. 
on Mining Software Repositories, IEEE\newblock non-paywalled preprint: \url{http://deb.li/swhmsr19} - \end{thebibliography} #+END_EXPORT + +** Graph dataset --- sample queries + :PROPERTIES: + :CUSTOM_ID: graphquery1 + :END: +*** Most frequent first commit words + #+begin_src sql + SELECT COUNT(*) AS c, word FROM ( + SELECT LOWER(REGEXP_EXTRACT(FROM_UTF8( + message), '^\w+')) AS word FROM revision) + WHERE word != '' + GROUP BY word ORDER BY COUNT(*) DESC LIMIT 5; + #+end_src +*** + | Count | Word | + |------------+--------| + | 71'338'310 | update | + | 64'980'346 | merge | + | 56'854'372 | add | + | 44'971'954 | added | + | 33'222'056 | fix | + +** Graph dataset --- sample queries + :PROPERTIES: + :CUSTOM_ID: graphquery2 + :END: +*** Fork arity + :PROPERTIES: + :BEAMER_env: block + :BEAMER_COL: 0.5 + :END: + i.e., how often is a commit based upon? + #+BEAMER: \scriptsize + #+begin_src sql + SELECT fork_deg, COUNT(*) FROM ( + SELECT id, COUNT(*) AS fork_deg + FROM revision_history GROUP BY id) t + GROUP BY fork_deg ORDER BY fork_deg; + #+end_src + #+BEAMER: \includegraphics[width=\linewidth]{fork-degree} +*** Merge arity + :PROPERTIES: + :BEAMER_env: block + :BEAMER_COL: 0.5 + :END: + i.e., how large are merges? + #+BEAMER: \scriptsize + #+begin_src sql + SELECT merge_deg, COUNT(*) FROM ( + SELECT parent_id, COUNT(*) AS merge_deg + FROM revision_history GROUP BY parent_id) t + GROUP BY merge_deg ORDER BY merge_deg; + #+end_src + #+BEAMER: \includegraphics[width=\linewidth]{merge-degree}