diff --git a/common/modules/dataset.org b/common/modules/dataset.org index 20d0d21..cf99eaf 100644 --- a/common/modules/dataset.org +++ b/common/modules/dataset.org @@ -1,91 +1,211 @@ #+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) #+INCLUDE: "prelude.org" :minlevel 1 * Open Datasets :PROPERTIES: :CUSTOM_ID: main :END: ** Software Heritage Graph dataset :PROPERTIES: :CUSTOM_ID: graphdataset :END: #+BEAMER: \vspace{-1mm} **Use case:** large scale analyses of the most comprehensive corpus on the development history of free/open source software. *** Dataset - Relational representation of the full graph as a set of tables - Available as open data: https://doi.org/10.5281/zenodo.2583978 #+BEAMER: \vspace{-1mm} *** Formats - Local use: PostgreSQL dumps, or Apache Parquet files (~1 TiB each) - - Live usage: Amazon Athena (SQL-queriable) + - Live usage: Amazon Athena (SQL-queriable), Azure Data Lake (soon) #+BEAMER: \vspace{-1mm} *** References and sample queries #+BEGIN_EXPORT latex \vspace{-2mm} \footnotesize \begin{thebibliography}{Foo Bar, 1969} \bibitem{Pietri2019} Antoine Pietri, Diomidis Spinellis, Stefano Zacchiroli\newblock The Software Heritage Graph Dataset: Public software development under one roof\newblock MSR 2019: Intl. Conf. on Mining Software Repositories, IEEE\newblock non-paywalled preprint: \url{http://deb.li/swhmsr19} \end{thebibliography} #+END_EXPORT -** Graph dataset --- sample queries +** Sample query --- most frequent first commit words :PROPERTIES: :CUSTOM_ID: graphquery1 :END: -*** Most frequent first commit words +*** #+begin_src sql SELECT COUNT(*) AS c, word FROM ( SELECT LOWER(REGEXP_EXTRACT(FROM_UTF8( message), 'ˆ\w+')) AS word FROM revision) WHERE word != '' GROUP BY word ORDER BY COUNT(*) DESC LIMIT 5; #+end_src *** - | Count | Word | + | Count | Word | |------------+--------| - | 71'338'310 | update | - | 64'980'346 | merge | - | 56'854'372 | add | - | 44'971'954 | added | - | 33'222'056 | fix | + | 71 338 310 | update | + | 64 980 346 | merge | + | 56 854 372 | add | + | 44 971 954 | added | + | 33 222 056 | fix | -** Graph dataset --- sample queries +** Sample query --- fork and merge arities :PROPERTIES: :CUSTOM_ID: graphquery2 :END: *** Fork arity :PROPERTIES: :BEAMER_env: block :BEAMER_COL: 0.5 :END: i.e., how often is a commit based upon? #+BEAMER: \scriptsize #+begin_src sql SELECT fork_deg, count(*) FROM ( SELECT id, count(*) AS fork_deg FROM revision_history GROUP BY id) t GROUP BY fork_deg ORDER BY fork_deg; #+end_src #+BEAMER: \includegraphics[width=\linewidth]{fork-degree} *** Merge arity :PROPERTIES: :BEAMER_env: block :BEAMER_COL: 0.5 :END: i.e., how large are merges? #+BEAMER: \scriptsize #+begin_src sql SELECT merge_deg, COUNT(*) FROM ( SELECT parent_id, COUNT(*) AS merge_deg FROM revision_history GROUP BY parent_id) t GROUP BY deg ORDER BY deg; #+end_src #+BEAMER: \includegraphics[width=\linewidth]{merge-degree} +* Other queries + :PROPERTIES: + :CUSTOM_ID: morequery + :END: +** Sample query --- ratio of commits performed during weekends + :PROPERTIES: + :CUSTOM_ID: weekendsrc + :END: + #+BEGIN_SRC sql + WITH revision_date AS + (SELECT FROM_UNIXTIME(date / 1000000) AS date + FROM revision) + SELECT yearly_rev.year AS year, + CAST(yearly_weekend_rev.number AS DOUBLE) + / yearly_rev.number * 100.0 AS weekend_pc + FROM + (SELECT YEAR(date) AS year, COUNT(*) AS number + FROM revision_date + WHERE YEAR(date) BETWEEN 1971 AND 2018 + GROUP BY YEAR(date) ) AS yearly_rev + JOIN + (SELECT YEAR(date) AS year, COUNT(*) AS number + FROM revision_date + WHERE DAY_OF_WEEK(date) >= 6 + AND YEAR(date) BETWEEN 1971 AND 2018 + GROUP BY YEAR(date) ) AS yearly_weekend_rev + ON yearly_rev.year = yearly_weekend_rev.year + ORDER BY year DESC; + #+END_SRC + +** Sample query --- ratio of commits performed during weekends (cont.) + :PROPERTIES: + :CUSTOM_ID: weekendout + :END: + | year | weekend | total | weekend percentage | + |------+----------+-----------+--------------------| + | 2018 | 15130065 | 78539158 | 19.26 | + | 2017 | 33776451 | 168074276 | 20.09 | + | 2016 | 43890325 | 209442130 | 20.95 | + | 2015 | 35781159 | 166884920 | 21.44 | + | 2014 | 24591048 | 122341275 | 20.10 | + | 2013 | 17792778 | 88524430 | 20.09 | + | 2012 | 12794430 | 64516008 | 19.83 | + | 2011 | 9765190 | 48479321 | 20.14 | + | 2010 | 7766348 | 38561515 | 20.14 | + | 2009 | 6352253 | 31053219 | 20.45 | + | 2008 | 4568373 | 22474882 | 20.32 | + | 2007 | 3318881 | 16289632 | 20.37 | + | 2006 | 2597142 | 12224905 | 21.24 | + | 2005 | 2086697 | 9603804 | 21.72 | + | 2004 | 1752400 | 7948104 | 22.04 | + | 2003 | 1426033 | 6941593 | 20.54 | + | 2002 | 1159294 | 5378538 | 21.55 | + | 2001 | 849905 | 4098587 | 20.73 | + | 2000 | 2091770 | 4338842 | 48.21 | + | 1999 | 438540 | 2026906 | 21.63 | + | 1998 | 311888 | 1430567 | 21.80 | + | 1997 | 263995 | 1129249 | 23.37 | + | 1996 | 192543 | 795827 | 24.19 | + | 1995 | 176270 | 670417 | 26.29 | + | 1994 | 137811 | 581563 | 23.69 | + | 1993 | 169767 | 697343 | 24.34 | + | 1992 | 74923 | 422068 | 17.75 | + | 1991 | 92782 | 484547 | 19.14 | + | 1990 | 113201 | 340489 | 33.24 | + | 1989 | 31742 | 182325 | 17.40 | + | 1988 | 44983 | 206275 | 21.80 | + | 1987 | 27892 | 146157 | 19.08 | + | 1986 | 54200 | 237330 | 22.83 | + | 1985 | 75595 | 306564 | 24.65 | + | 1984 | 26391 | 95506 | 27.63 | + | 1983 | 89776 | 370687 | 24.21 | + | 1982 | 51524 | 191933 | 26.84 | + | 1981 | 32995 | 123618 | 26.69 | + | 1980 | 31832 | 133733 | 23.80 | + | 1979 | 20943 | 175164 | 11.95 | + | 1978 | 3773 | 33677 | 11.20 | + | 1977 | 4783 | 19376 | 24.68 | + | 1976 | 1907 | 7048 | 27.05 | + | 1975 | 2089 | 26579 | 7.85 | + | 1974 | 2095 | 14290 | 14.66 | + | 1973 | 2988 | 15580 | 19.17 | + | 1972 | 1755 | 6552 | 26.78 | + | 1971 | 1723 | 6125 | 28.13 | + +** Sample query --- average size of the most popular file types + :PROPERTIES: + :CUSTOM_ID: popfilesrc + :END: + #+BEGIN_SRC sql + SELECT suffix, + ROUND(COUNT(*) * 100 / 1e6) AS Million_files, + ROUND(AVG(length) / 1024) AS Average_k_length + FROM + (SELECT length, suffix + FROM -- File length in joinable form + (SELECT TO_BASE64(sha1_git) AS sha1_git64, length + FROM content ) AS content_length + JOIN -- Sample of files with popular suffixes + (SELECT target64, file_suffix_sample.suffix AS suffix + FROM -- Popular suffixes + (SELECT suffix FROM ( + SELECT REGEXP_EXTRACT(FROM_UTF8(name), + '\.[^.]+$') AS suffix + FROM directory_entry_file) AS file_suffix + GROUP BY suffix + ORDER BY COUNT(*) DESC LIMIT 20 ) AS pop_suffix + JOIN -- Sample of files and suffixes + (SELECT TO_BASE64(target) AS target64, + REGEXP_EXTRACT(FROM_UTF8(name), + '\.[^.]+$') AS suffix + FROM directory_entry_file TABLESAMPLE BERNOULLI(1)) + AS file_suffix_sample + ON file_suffix_sample.suffix = pop_suffix.suffix) + AS pop_suffix_sample + ON pop_suffix_sample.target64 = content_length.sha1_git64) + GROUP BY suffix + ORDER BY AVG(length) DESC; + #+END_SRC +