diff --git a/common/modules/dataset.org b/common/modules/dataset.org index cf99eaf..67c482d 100644 --- a/common/modules/dataset.org +++ b/common/modules/dataset.org @@ -1,211 +1,211 @@ #+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) #+INCLUDE: "prelude.org" :minlevel 1 * Open Datasets :PROPERTIES: :CUSTOM_ID: main :END: ** Software Heritage Graph dataset :PROPERTIES: :CUSTOM_ID: graphdataset :END: #+BEAMER: \vspace{-1mm} **Use case:** large scale analyses of the most comprehensive corpus on the development history of free/open source software. +*** + #+BEGIN_EXPORT latex + \vspace{-3mm} + \begin{thebibliography}{Foo Bar, 1969} + \bibitem{Pietri2019} Antoine Pietri, Diomidis Spinellis, Stefano Zacchiroli\newblock + The Software Heritage Graph Dataset: Public software development under one roof\newblock + MSR 2019: 16th Intl. Conf. on Mining Software Repositories. IEEE\newblock + preprint: \url{http://deb.li/swhmsr19} + \end{thebibliography} + #+END_EXPORT + *** Dataset - Relational representation of the full graph as a set of tables - Available as open data: https://doi.org/10.5281/zenodo.2583978 #+BEAMER: \vspace{-1mm} *** Formats - Local use: PostgreSQL dumps, or Apache Parquet files (~1 TiB each) - Live usage: Amazon Athena (SQL-queriable), Azure Data Lake (soon) #+BEAMER: \vspace{-1mm} -*** References and sample queries - #+BEGIN_EXPORT latex - \vspace{-2mm} - \footnotesize - \begin{thebibliography}{Foo Bar, 1969} - \bibitem{Pietri2019} Antoine Pietri, Diomidis Spinellis, Stefano Zacchiroli\newblock - The Software Heritage Graph Dataset: Public software development under one roof\newblock - MSR 2019: Intl. Conf. on Mining Software Repositories, IEEE\newblock - non-paywalled preprint: \url{http://deb.li/swhmsr19} - \end{thebibliography} - #+END_EXPORT ** Sample query --- most frequent first commit words :PROPERTIES: :CUSTOM_ID: graphquery1 :END: *** #+begin_src sql SELECT COUNT(*) AS c, word FROM ( SELECT LOWER(REGEXP_EXTRACT(FROM_UTF8( message), 'ˆ\w+')) AS word FROM revision) WHERE word != '' GROUP BY word ORDER BY COUNT(*) DESC LIMIT 5; #+end_src *** | Count | Word | |------------+--------| | 71 338 310 | update | | 64 980 346 | merge | | 56 854 372 | add | | 44 971 954 | added | | 33 222 056 | fix | ** Sample query --- fork and merge arities :PROPERTIES: :CUSTOM_ID: graphquery2 :END: *** Fork arity :PROPERTIES: :BEAMER_env: block :BEAMER_COL: 0.5 :END: i.e., how often is a commit based upon? #+BEAMER: \scriptsize #+begin_src sql SELECT fork_deg, count(*) FROM ( SELECT id, count(*) AS fork_deg FROM revision_history GROUP BY id) t GROUP BY fork_deg ORDER BY fork_deg; #+end_src #+BEAMER: \includegraphics[width=\linewidth]{fork-degree} *** Merge arity :PROPERTIES: :BEAMER_env: block :BEAMER_COL: 0.5 :END: i.e., how large are merges? #+BEAMER: \scriptsize #+begin_src sql SELECT merge_deg, COUNT(*) FROM ( SELECT parent_id, COUNT(*) AS merge_deg FROM revision_history GROUP BY parent_id) t GROUP BY deg ORDER BY deg; #+end_src #+BEAMER: \includegraphics[width=\linewidth]{merge-degree} * Other queries :PROPERTIES: :CUSTOM_ID: morequery :END: ** Sample query --- ratio of commits performed during weekends :PROPERTIES: :CUSTOM_ID: weekendsrc :END: #+BEGIN_SRC sql WITH revision_date AS (SELECT FROM_UNIXTIME(date / 1000000) AS date FROM revision) SELECT yearly_rev.year AS year, CAST(yearly_weekend_rev.number AS DOUBLE) / yearly_rev.number * 100.0 AS weekend_pc FROM (SELECT YEAR(date) AS year, COUNT(*) AS number FROM revision_date WHERE YEAR(date) BETWEEN 1971 AND 2018 GROUP BY YEAR(date) ) AS yearly_rev JOIN (SELECT YEAR(date) AS year, COUNT(*) AS number FROM revision_date WHERE DAY_OF_WEEK(date) >= 6 AND YEAR(date) BETWEEN 1971 AND 2018 GROUP BY YEAR(date) ) AS yearly_weekend_rev ON yearly_rev.year = yearly_weekend_rev.year ORDER BY year DESC; #+END_SRC ** Sample query --- ratio of commits performed during weekends (cont.) :PROPERTIES: :CUSTOM_ID: weekendout :END: | year | weekend | total | weekend percentage | |------+----------+-----------+--------------------| | 2018 | 15130065 | 78539158 | 19.26 | | 2017 | 33776451 | 168074276 | 20.09 | | 2016 | 43890325 | 209442130 | 20.95 | | 2015 | 35781159 | 166884920 | 21.44 | | 2014 | 24591048 | 122341275 | 20.10 | | 2013 | 17792778 | 88524430 | 20.09 | | 2012 | 12794430 | 64516008 | 19.83 | | 2011 | 9765190 | 48479321 | 20.14 | | 2010 | 7766348 | 38561515 | 20.14 | | 2009 | 6352253 | 31053219 | 20.45 | | 2008 | 4568373 | 22474882 | 20.32 | | 2007 | 3318881 | 16289632 | 20.37 | | 2006 | 2597142 | 12224905 | 21.24 | | 2005 | 2086697 | 9603804 | 21.72 | | 2004 | 1752400 | 7948104 | 22.04 | | 2003 | 1426033 | 6941593 | 20.54 | | 2002 | 1159294 | 5378538 | 21.55 | | 2001 | 849905 | 4098587 | 20.73 | | 2000 | 2091770 | 4338842 | 48.21 | | 1999 | 438540 | 2026906 | 21.63 | | 1998 | 311888 | 1430567 | 21.80 | | 1997 | 263995 | 1129249 | 23.37 | | 1996 | 192543 | 795827 | 24.19 | | 1995 | 176270 | 670417 | 26.29 | | 1994 | 137811 | 581563 | 23.69 | | 1993 | 169767 | 697343 | 24.34 | | 1992 | 74923 | 422068 | 17.75 | | 1991 | 92782 | 484547 | 19.14 | | 1990 | 113201 | 340489 | 33.24 | | 1989 | 31742 | 182325 | 17.40 | | 1988 | 44983 | 206275 | 21.80 | | 1987 | 27892 | 146157 | 19.08 | | 1986 | 54200 | 237330 | 22.83 | | 1985 | 75595 | 306564 | 24.65 | | 1984 | 26391 | 95506 | 27.63 | | 1983 | 89776 | 370687 | 24.21 | | 1982 | 51524 | 191933 | 26.84 | | 1981 | 32995 | 123618 | 26.69 | | 1980 | 31832 | 133733 | 23.80 | | 1979 | 20943 | 175164 | 11.95 | | 1978 | 3773 | 33677 | 11.20 | | 1977 | 4783 | 19376 | 24.68 | | 1976 | 1907 | 7048 | 27.05 | | 1975 | 2089 | 26579 | 7.85 | | 1974 | 2095 | 14290 | 14.66 | | 1973 | 2988 | 15580 | 19.17 | | 1972 | 1755 | 6552 | 26.78 | | 1971 | 1723 | 6125 | 28.13 | ** Sample query --- average size of the most popular file types :PROPERTIES: :CUSTOM_ID: popfilesrc :END: #+BEGIN_SRC sql SELECT suffix, ROUND(COUNT(*) * 100 / 1e6) AS Million_files, ROUND(AVG(length) / 1024) AS Average_k_length FROM (SELECT length, suffix FROM -- File length in joinable form (SELECT TO_BASE64(sha1_git) AS sha1_git64, length FROM content ) AS content_length JOIN -- Sample of files with popular suffixes (SELECT target64, file_suffix_sample.suffix AS suffix FROM -- Popular suffixes (SELECT suffix FROM ( SELECT REGEXP_EXTRACT(FROM_UTF8(name), '\.[^.]+$') AS suffix FROM directory_entry_file) AS file_suffix GROUP BY suffix ORDER BY COUNT(*) DESC LIMIT 20 ) AS pop_suffix JOIN -- Sample of files and suffixes (SELECT TO_BASE64(target) AS target64, REGEXP_EXTRACT(FROM_UTF8(name), '\.[^.]+$') AS suffix FROM directory_entry_file TABLESAMPLE BERNOULLI(1)) AS file_suffix_sample ON file_suffix_sample.suffix = pop_suffix.suffix) AS pop_suffix_sample ON pop_suffix_sample.target64 = content_length.sha1_git64) GROUP BY suffix ORDER BY AVG(length) DESC; #+END_SRC