diff --git a/common/modules/dataset.org b/common/modules/dataset.org new file mode 100644 index 0000000..3cea4c9 --- /dev/null +++ b/common/modules/dataset.org @@ -0,0 +1,42 @@ +#+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) +#+INCLUDE: "prelude.org" :minlevel 1 + +* Open Datasets + :PROPERTIES: + :CUSTOM_ID: main + :END: +** Software Heritage Graph dataset + :PROPERTIES: + :CUSTOM_ID: graphdataset + :END: + + #+BEAMER: \vspace{-1mm} + + *Use case:* large-scale analyses of the most comprehensive corpus on the + development history of free/open source software. + +*** Dataset + - Relational representation of the full graph as a set of tables + - Available as open data: https://doi.org/10.5281/zenodo.2583978 + + #+BEAMER: \vspace{-1mm} + +*** Formats + - Local use: PostgreSQL dumps, or Apache Parquet files (~1 TiB each) + - Live usage: Amazon Athena (SQL-queryable) + + #+BEAMER: \vspace{-1mm} + +*** References and sample queries + #+BEGIN_EXPORT latex + \vspace{-2mm} + \footnotesize + \begin{thebibliography}{Foo Bar, 1969} + + \bibitem{Pietri2019} Antoine Pietri, Diomidis Spinellis, Stefano Zacchiroli\newblock + The Software Heritage Graph Dataset: Public software development under one roof\newblock + MSR 2019: Intl. Conf. on Mining Software Repositories, IEEE\newblock + non-paywalled preprint: \url{http://deb.li/swhmsr19} + + \end{thebibliography} + #+END_EXPORT