diff --git a/talks-public/2018-12-10-BENEVOL/2018-12-10-BENEVOL.org b/talks-public/2018-12-10-BENEVOL/2018-12-10-BENEVOL.org index 4813e62..a4fffd8 100644 --- a/talks-public/2018-12-10-BENEVOL/2018-12-10-BENEVOL.org +++ b/talks-public/2018-12-10-BENEVOL/2018-12-10-BENEVOL.org @@ -1,125 +1,138 @@ #+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) #+TITLE: Towards Universal Software Evolution Analysis # #+SUBTITLE: Analyzing All the Code Source with Software Heritage #+BEAMER_HEADER: \date[10/12/2018, BENEVOL2018]{10 December 2018\\Belgium-Netherlands Software Evolution Workshop\\Delft, Netherlands} #+DATE: 10 December 2018 #+INCLUDE: "../../common/modules/prelude.org" :minlevel 1 #+INCLUDE: "../../common/modules/169.org" -#+BEAMER_HEADER: \institute[Inria]{\\[-5mm]Inria --- Software Heritage\\{\tt antoine.pietri@softwareheritage.org}} -#+BEAMER_HEADER: \author{Antoine Pietri} +#+BEAMER_HEADER: \institute[Inria]{\\[-5mm]Inria --- Software Heritage\\{\tt antoine.pietri@softwareheritage.org\\zack@upsilon.cc}} +#+BEAMER_HEADER: \author{Antoine Pietri \and Stefano Zacchiroli} #+LATEX_HEADER_EXTRA: \usepackage{tikz} #+LATEX_HEADER_EXTRA: \usetikzlibrary{arrows,shapes} #+LATEX_HEADER_EXTRA: \definecolor{swh-orange}{RGB}{254,205,27} #+LATEX_HEADER_EXTRA: \definecolor{swh-red}{RGB}{226,0,38} #+LATEX_HEADER_EXTRA: \definecolor{swh-green}{RGB}{77,181,174} * Software Heritage #+INCLUDE: "../../common/modules/swh-overview-sourcecode.org::#mission" :minlevel 2 #+INCLUDE: "../../common/modules/status-extended.org::#dataflow" :minlevel 2 #+INCLUDE: "../../common/modules/status-extended.org::#archive" :minlevel 2 ** A giant Merkle DAG - # #+BEAMER: \centering - #+LATEX: \only<1>{\colorbox{white}{\includegraphics[width=.7\linewidth]{git-merkle/merkle_1.pdf}}} - #+LATEX: \only<2>{\colorbox{white}{\includegraphics[width=.7\linewidth]{git-merkle/merkle_2_contents.pdf}}} - #+LATEX: 
\only<3>{\colorbox{white}{\includegraphics[width=.7\linewidth]{git-merkle/merkle_3_directories.pdf}}} - #+LATEX: \only<4>{\colorbox{white}{\includegraphics[width=.7\linewidth]{git-merkle/merkle_4_revisions.pdf}}} - #+LATEX: \only<5>{\colorbox{white}{\includegraphics[width=.7\linewidth]{git-merkle/merkle_5_releases.pdf}}} -# #+LATEX: {\colorbox{white}{\includegraphics[width=.7\linewidth]{git-merkle/merkle_1.pdf}}} + #+BEAMER: \centering + #+LATEX: \only<1>{\colorbox{white}{\includegraphics[width=.7\linewidth]{git-merkle/merkle_1.pdf}}}% + #+LATEX: \only<2>{\colorbox{white}{\includegraphics[width=.7\linewidth]{git-merkle/merkle_2_contents.pdf}}}% + #+LATEX: \only<3>{\colorbox{white}{\includegraphics[width=.7\linewidth]{git-merkle/merkle_3_directories.pdf}}}% + #+LATEX: \only<4>{\colorbox{white}{\includegraphics[width=.7\linewidth]{git-merkle/merkle_4_revisions.pdf}}}% + #+LATEX: \only<5>{\colorbox{white}{\includegraphics[width=.7\linewidth]{git-merkle/merkle_5_releases.pdf}}}% +# #+LATEX: {\colorbox{white}{\includegraphics[width=.7\linewidth]{git-merkle/merkle_1.pdf}}}% * A Platform for Software Analysis ** A Platform for Software Analysis -*** - *Goal*: build a research platform for Software Analysis. +*** Building a research platform for Software Analysis. + - Analyze all software source code artifacts (code + development history) + - At the largest possible scale #+BEAMER: \pause -*** Questions we want to be able to answer: +*** Examples of research questions the platform should support - What is the average size of a README? - What is the average directory depth of a Java repository? - What files are changed often in commits named "fix: ..."? - What are good predictors of software becoming popular/dying? - What are good predictors of a software getting forked? - ... 
** Research requirements *** Categories of requested data - Content (/blobs/) - Metadata (/file names/, /directories/) - History graph (/revisions/) - Content search (/full-text search index/) - Provenance (/backwards index/) * Challenges ** Data volume challenges *** Analysis on a local mirror Handling data at that scale is a problem too hard for most researchers: - Data hardly fits on a single machine -- Unusual size distribution of blobs (~3 kB compressed) \\ +- Unusual size distribution of contents (a lot of very small files: median ~3 kB) \\ → hard to use classical distributed storage solutions - Graph doesn't fit in RAM \\ → hard to do intensive processing -- Even with enough capacity, how can we send you so much data? +- Even with enough capacity, downloading that volume of data is hard *** Remote computations -- Compute queries externally, /reduce/ the result and send it back -- How to describe those queries expressively? +- Compute on a remote server, /reduce/ the result and send it back +- How to describe queries expressively? ** Representation mismatch *** -Storing everything deduplicated is great for *archival* but *analysis -tools* generally expect specific directory structures/formats. +Storing everything deduplicated is storage-efficient for *archival* but +*analysis tools* generally expect specific directory structures/formats. *** Potential solutions - Provide a way to "flatten" deduplicated structures - Keep deduplication information accessible -- No real standard for the revision graph? +- No standard format for development history \\ + → export as VCS bundles? ** Other open questions -*** Provenance mappings -- "What is the content of this revision" is just half the story. -- *"What revisions contain this content"*? → Walk the tree backwards +*** Software provenance +- "What are the contents in this origin" is just half the story. +- *"What origins contain this content"*? → Walk the tree backwards - Tradeoff: reduce nb. 
of indirections while avoiding combinatorial explosions *** Project metadata - Concept of a "project" is lost in a fully-deduplicated dataset -- How to bridge project metadata with our objects? +- How to bridge project metadata with the objects? *** Expressivity -Our query language has to be expressive to allow combining types of +The query language has to be expressive to allow combining types of computations while minimizing roundtrips. -** Use case collection +** Roadmap *** -We want to *collect all the use cases* to understand usage patterns, and elicit -a query language. +- The entire dataset is accessible in Amazon Athena (graph) and S3 (contents) +- Will soon be made public for everyone to run queries on it +- *Collect all the use cases* to understand usage patterns +- Elicit a query language. -/Please/, give us ideas of what requests you would like to be able to run on +Please, give us ideas of what requests you would like to be able to run on the archive! ** Come and talk to us! - - Antoine Pietri / antoine.pietri@softwareheritage.org / @seirl_ - - Stefano Zacchiroli / zack@upsilon.cc / @zacchiro + Antoine Pietri / antoine.pietri@softwareheritage.org / @seirl_ + + #+BEAMER: \vspace{1cm} Links: - https://www.softwareheritage.org - https://archive.softwareheritage.org - - https://www.softwareheritage.org/support/sponsors/ *** Footer :B_ignoreheading: :PROPERTIES: :BEAMER_env: ignoreheading :END: #+BEAMER: \scriptsize \vfill \hfill Slides licensed under [[https://creativecommons.org/licenses/by-sa/4.0/][Creative Commons Attribution-ShareAlike 4.0 International License]] (CC BY-SA 4.0). 
+
+** References
+  :PROPERTIES:
+  :BEAMER_OPT: fragile,allowframebreaks,label=
+  :END:
+
+#+BEAMER: \nocite{*}
+#+BEAMER: \bibliographystyle{amsalpha}
+#+BEAMER: \bibliography{swh.bib}
diff --git a/talks-public/2018-12-10-BENEVOL/swh.bib b/talks-public/2018-12-10-BENEVOL/swh.bib
new file mode 100644
index 0000000..1be9e3b
--- /dev/null
+++ b/talks-public/2018-12-10-BENEVOL/swh.bib
@@ -0,0 +1,230 @@
+@article{cacm-2018-software-heritage,
+  author = {Abramatic, Jean-Fran{\c{c}}ois and Di Cosmo, Roberto and Zacchiroli, Stefano},
+  title = {Building the Universal Archive of Source Code},
+  publisher = {ACM},
+  month = oct,
+  year = {2018},
+  issn = {0001-0782},
+  doi = {10.1145/3183558},
+  pages = {29--31},
+  volume = {61},
+  number = {10},
+  journal = {Communications of the ACM},
+}
+
+@inproceedings{ipres-2017-software-heritage,
+  author = {Di Cosmo, Roberto and Zacchiroli, Stefano},
+  title = {{Software Heritage}: Why and How to Preserve Software Source Code},
+  year = {2017},
+  booktitle = {iPRES 2017: 14th International Conference on Digital Preservation},
+}
+
+@inproceedings{ipres-2018-doi,
+  author = {Di Cosmo, Roberto and Gruenpeter, Morane and Zacchiroli, Stefano},
+  title = {Identifiers for Digital Objects: the Case of Software Source Code Preservation},
+  year = {2018},
+  booktitle = {iPRES 2018: 15th International Conference on Digital Preservation},
+}
+
+@article{syeed2013evolution,
+  title={Evolution of open source software projects: A systematic literature review},
+  author={Syeed, MM Mahbubul and Hammouda, Imed and Systa, Tarja},
+  journal={Journal of Software},
+  volume={8},
+  number={11},
+  pages={2815--2830},
+  year={2013},
+  publisher={Academy Publisher}
+}
+
+@inproceedings{breivold2010systematic,
+  title={A systematic review of studies of open source software evolution},
+  author={Breivold, Hongyu Pei and Chauhan, Muhammad Aufeef and Babar, Muhammad Ali},
+  booktitle={Software Engineering Conference (APSEC), 2010 17th Asia Pacific},
+  pages={356--365},
+  year={2010},
+  organization={IEEE}
+}
+
+@article{herraiz2013evolution,
+  title={The evolution of the laws of software evolution: A discussion based on a systematic literature review},
+  author={Herraiz, Israel and Rodriguez, Daniel and Robles, Gregorio and Gonzalez-Barahona, Jesus M},
+  journal={ACM Computing Surveys (CSUR)},
+  volume={46},
+  number={2},
+  pages={28},
+  year={2013},
+  publisher={ACM}
+}
+
+@article{brisaboa2014compact,
+  title={Compact representation of web graphs with extended functionality},
+  author={Brisaboa, Nieves R and Ladra, Susana and Navarro, Gonzalo},
+  journal={Information Systems},
+  volume={39},
+  pages={152--174},
+  year={2014},
+  publisher={Elsevier}
+}
+
+@inproceedings{Merkle,
+  author = {Ralph C. Merkle},
+  title = {A Digital Signature Based on a Conventional Encryption Function},
+  booktitle = {Advances in Cryptology - {CRYPTO} '87, {A} Conference on the Theory
+               and Applications of Cryptographic Techniques, Santa Barbara, California,
+               USA, August 16-20, 1987, Proceedings},
+  pages = {369--378},
+  year = {1987},
+  crossref = {DBLP:conf/crypto/1987},
+  url = {https://doi.org/10.1007/3-540-48184-2_32},
+  doi = {10.1007/3-540-48184-2_32},
+  timestamp = {Fri, 19 May 2017 13:10:47 +0200},
+  biburl = {https://dblp.org/rec/bib/conf/crypto/Merkle87},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+@proceedings{DBLP:conf/crypto/1987,
+  editor = {Carl Pomerance},
+  title = {Advances in Cryptology - {CRYPTO} '87, {A} Conference on the Theory
+           and Applications of Cryptographic Techniques, Santa Barbara, California,
+           USA, August 16-20, 1987, Proceedings},
+  series = {Lecture Notes in Computer Science},
+  volume = {293},
+  publisher = {Springer},
+  year = {1988},
+  url = {https://doi.org/10.1007/3-540-48184-2},
+  doi = {10.1007/3-540-48184-2},
+  isbn = {3-540-18796-0},
+  timestamp = {Fri, 19 May 2017 13:10:47 +0200},
+  biburl = {https://dblp.org/rec/bib/conf/crypto/1987},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{lehman1980programs,
+  title={Programs, life cycles, and laws of software evolution},
+  author={Lehman, Meir M},
+  journal={Proceedings of the IEEE},
+  volume={68},
+  number={9},
+  pages={1060--1076},
+  year={1980},
+  publisher={IEEE}
+}
+
+@proceedings{DBLP:conf/icse/2018soheal,
+  editor = {Bram Adams and
+            Eleni Constantinou and
+            Tom Mens and
+            Gregorio Robles},
+  title = {Proceedings of the 1st International Workshop on Software Health,
+           SoHeal@ICSE 2018, Gothenburg, Sweden, May 27, 2018},
+  publisher = {{ACM}},
+  year = {2018},
+  url = {http://doi.acm.org/10.1145/3194124},
+  doi = {10.1145/3194124},
+  isbn = {978-1-4503-5730-2},
+  timestamp = {Thu, 30 Aug 2018 11:49:50 +0200},
+  biburl = {https://dblp.org/rec/bib/conf/icse/2018soheal},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{gonzalez2009macro,
+  title={Macro-level software evolution: a case study of a large software compilation},
+  author={Gonzalez-Barahona, Jesus M and Robles, Gregorio and Michlmayr, Martin and Amor, Juan Jos{\'e} and German, Daniel M},
+  journal={Empirical Software Engineering},
+  volume={14},
+  number={3},
+  pages={262--285},
+  year={2009},
+  publisher={Springer}
+}
+
+@article{debsources-ese-2016,
+  author = {Caneill, Matthieu and German, Daniel M. and Zacchiroli, Stefano},
+  title = {The {Debsources} Dataset: Two Decades of Free and Open Source Software},
+  publisher = {Springer},
+  month = jun,
+  year = {2017},
+  issn = {1382-3256},
+  doi = {10.1007/s10664-016-9461-5},
+  pages = {1405--1437},
+  volume = {22},
+  journal = {Empirical Software Engineering},
+}
+
+@article{1999-beagle-in-commons,
+  title={Conceptualizing an information commons},
+  author={Beagle, Donald},
+  journal={The Journal of Academic Librarianship},
+  volume={25},
+  number={2},
+  pages={82--89},
+  year={1999},
+  publisher={Elsevier}
+}
+
+@article{kranich2008information,
+  title={Information Commons},
+  author={Kranich, Nancy and Schement, Jorge Reina},
+  journal={Annual Review of Information Science and Technology},
+  volume={42},
+  number={1},
+  pages={546--591},
+  year={2008},
+  publisher={Wiley}
+}
+
+@book{schweik2012internet,
+  title={Internet success: a study of open-source software commons},
+  author={Schweik, Charles M and English, Robert C},
+  year={2012},
+  publisher={MIT Press}
+}
+
+@article{spinellis2005vcs,
+  title={Version control systems},
+  author={Spinellis, Diomidis},
+  journal={IEEE Software},
+  volume={22},
+  number={5},
+  pages={108--109},
+  year={2005},
+  publisher={IEEE}
+}
+
+@article{dean2008mapreduce,
+  title={{MapReduce}: simplified data processing on large clusters},
+  author={Dean, Jeffrey and Ghemawat, Sanjay},
+  journal={Communications of the ACM},
+  volume={51},
+  number={1},
+  pages={107--113},
+  year={2008},
+  publisher={ACM}
+}
+
+@misc{dandrimont2018cephml,
+  title={[ceph-users] {Ceph} behavior on (lots of) small objects ({RGW}, {RADOS} +
+    erasure coding)?},
+  howpublished={Ceph Users Mailing List},
+  year={2018},
+  url = {https://marc.info/?l=ceph-users\&m=153013955112932},
+}
+
+@inproceedings{xia2016fastcdc,
+  title={{FastCDC}: a Fast and Efficient Content-Defined Chunking Approach for Data Deduplication},
+  author={Xia, Wen and Zhou, Yukun and Jiang, Hong and Feng, Dan and Hua, Yu and Hu, Yuchong and Liu, Qing and Zhang, Yucheng},
+  booktitle={USENIX Annual Technical Conference},
+  pages={101--114},
+  year={2016}
+}
+
+@inproceedings{muthitacharoen2001low,
+  title={A low-bandwidth network file system},
+  author={Muthitacharoen, Athicha and Chen, Benjie and Mazieres, David},
+  booktitle={ACM SIGOPS Operating Systems Review},
+  volume={35},
+  number={5},
+  pages={174--187},
+  year={2001},
+  organization={ACM}
+}