diff --git a/common/modules/biblio.org b/common/modules/biblio.org index d03455c..c22345d 100644 --- a/common/modules/biblio.org +++ b/common/modules/biblio.org @@ -1,65 +1,69 @@ #+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) #+INCLUDE: "prelude.org" :minlevel 1 * Bibliography :PROPERTIES: :CUSTOM_ID: main :END: #+BEGIN_EXPORT latex \begin{thebibliography}{Foo Bar, 1969} \footnotesize + \bibitem{EOSCSirs2020} EOSC SIRS Task Force + \newblock Scholarly Infrastructures for Research Software + \newblock 2020, European Commission, https://doi.org/10.2777/28598 + \bibitem{Pietri2020a} Antoine Pietri, Guillaume Rousseau, Stefano Zacchiroli \newblock Forking Without Clicking: on How to Identify Software Repository Forks \newblock MSR 2020: 17th Intl. Conf. on Mining Software Repositories. IEEE \bibitem{Pietri2020b} Antoine Pietri, Guillaume Rousseau, Stefano Zacchiroli \newblock Determining the Intrinsic Structure of Public Software Development History \newblock MSR 2020: 17th Intl. Conf. on Mining Software Repositories. IEEE \bibitem{Pietri2020c} Antoine Pietri, Diomidis Spinellis, Stefano Zacchiroli \newblock The Software Heritage Graph Dataset: Large-scale Analysis of Public Software Development History \newblock MSR 2020: 17th Intl. Conf. on Mining Software Repositories. IEEE \bibitem{DiCosmo2020d} Roberto Di Cosmo \newblock Archiving and Referencing Source Code with Software Heritage \newblock International Congress on Mathematical Software (ICMS), 2020 \bibitem{DiCosmo2020c} P. Alliez, R. Di Cosmo, B. Guedj, A. Girault, M. Hacid, A. Legrand, N. Rougier \newblock Attributing and Referencing (Research) Software: Best Practices and Outlook From Inria \newblock Computing in Science \& Engineering, 22 (1), pp. 39-52, 2020, ISSN: 1558-366X \bibitem{DiCosmo2020b} P. Alliez, R. Di Cosmo, B. Guedj, A. Girault, M. Hacid, A. Legrand, N. 
Rougier \newblock Attributing and Referencing (Research) Software: Best Practices and Outlook From Inria \newblock Computing in Science \& Engineering, 22 (1), pp. 39-52, 2020, ISSN: 1558-366X \bibitem{DiCosmo2020a} Roberto Di Cosmo, Morane Gruenpeter, Stefano Zacchiroli \newblock Referencing Source Code Artifacts: a Separate Concern in Software Citation \newblock Computing in Science \& Engineering, 2020, ISSN: 1521-9615 \bibitem{Boldi2020} Paolo Boldi, Antoine Pietri, Sebastiano Vigna, Stefano Zacchiroli \newblock Ultra-Large-Scale Repository Analysis via Graph Compression \newblock SANER 2020, 27th Intl. Conf. on Software Analysis, Evolution and Reengineering. IEEE \bibitem{Rousseau2020} Roberto Di Cosmo, Guillaume Rousseau, Stefano Zacchiroli \newblock Software Provenance Tracking at the Scale of Public Source Code \newblock Empirical Software Engineering, 2020 \bibitem{Pietri2019} Antoine Pietri, Diomidis Spinellis, Stefano Zacchiroli \newblock The Software Heritage Graph Dataset: Public software development under one roof \newblock MSR 2019: 16th Intl. Conf. on Mining Software Repositories. IEEE \bibitem{Abramatic2018} Jean-François Abramatic, Roberto Di Cosmo, Stefano Zacchiroli \newblock Building the Universal Archive of Source Code \newblock Communication of the ACM, October 2018 \bibitem{DiCosmo2018} Roberto Di Cosmo, Morane Gruenpeter, Stefano Zacchiroli \newblock Identifiers for Digital Objects: the Case of Software Source Code Preservation \newblock iPRES 2018: Intl. Conf. on Digital Preservation \bibitem{DiCosmo2017} Roberto Di Cosmo, Stefano Zacchiroli \newblock Software Heritage: Why and How to Preserve Software Source Code \newblock iPRES 2017: Intl. Conf. 
on Digital Preservation \end{thebibliography} #+END_EXPORT diff --git a/common/modules/status-extended.org b/common/modules/status-extended.org index 8257ac9..09b10e6 100644 --- a/common/modules/status-extended.org +++ b/common/modules/status-extended.org @@ -1,504 +1,504 @@ #+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) #+INCLUDE: "prelude.org" :minlevel 1 # not to be included as a whole, just pick individual slides as you see fit * Status :PROPERTIES: :CUSTOM_ID: main :END: ** The people :PROPERTIES: :CUSTOM_ID: people :END: *** The core team :B_picblock: :PROPERTIES: :CUSTOM_ID: core-team-formal :BEAMER_env: picblock :BEAMER_opt: pic=team,width=.4\linewidth :END: - Roberto Di Cosmo - Stefano Zacchiroli - Nicolas Dandrimont (Engineer) - Antoine Dumont (Engineer) # - and /Jordi, Quentin and Guillaume/ *** Scientific advisors - Serge Abiteboul (French Science Academy) - Jean-François Abramatic (former W3C director) - Gerard Berry (CNRS Gold Medal, French Science Academy) - Julia Lawall (Coccinelle, Linux Kernel, Outreachy) ** Archive coverage --- archive.softwareheritage.org :PROPERTIES: :CUSTOM_ID: archive :END: #+BEAMER: \vspace{-1mm} - #+BEAMER: \begin{center}\includegraphics[width=\extblockscale{1\linewidth}]{2020-09-08-growth.png}\end{center} + #+BEAMER: \begin{center}\includegraphics[width=\extblockscale{1\linewidth}]{2021-01-archive-growth.png}\end{center} #+BEAMER: \vspace{-2mm} *** #+BEAMER: \includegraphics[width=0.12\linewidth]{coverage/github} #+BEAMER: \hfill #+BEAMER: \includegraphics[width=0.13\linewidth]{coverage/gitlab} #+BEAMER: \hfill #+BEAMER: \raisebox{2mm}{\includegraphics[width=0.14\linewidth]{coverage/bitbucket}} #+BEAMER: \hfill #+BEAMER: \includegraphics[width=0.14\linewidth]{coverage/googlecode} #+BEAMER: \hfill #+BEAMER: \includegraphics[width=0.14\linewidth]{coverage/gitorious} #+BEAMER: \hfill #+BEAMER: 
\includegraphics[width=0.12\linewidth]{coverage/framagit} #+BEAMER: \\ #+BEAMER: \includegraphics[width=0.10\linewidth]{coverage/hal} #+BEAMER: \hfill #+BEAMER: \raisebox{2mm}{\includegraphics[width=0.12\linewidth]{coverage/debian}} #+BEAMER: \hfill #+BEAMER: \raisebox{1mm}{\includegraphics[width=0.11\linewidth]{coverage/npm}} #+BEAMER: \hfill #+BEAMER: \includegraphics[width=0.06\linewidth]{coverage/cran} #+BEAMER: \hfill #+BEAMER: \includegraphics[width=0.12\linewidth]{coverage/gnu} #+BEAMER: \hfill #+BEAMER: \includegraphics[width=0.12\linewidth]{coverage/inria} #+BEAMER: \hfill #+BEAMER: \raisebox{-1mm}{\includegraphics[width=0.11\linewidth]{coverage/pypi}} #+BEAMER: \pause *** - ~400 TB (uncompressed) blobs, ~20 B nodes, ~300 B edges - The /richest/ public source code archive, ... and growing daily! ** The structure of the archive :noexport: *** On-disk storage - flat file storage for contents - postgres database for the metadata *** Data model: /one/ big Merkle DAG, inspired by the git model - Origins (= repositories) - Occurrences (= branches) - Releases (= tags) - Revisions (= commits) - Directories (= trees) - Contents (= blobs) ** Archiving goals :PROPERTIES: :CUSTOM_ID: archivinggoals :END: Targets: VCS repositories & source code releases (e.g., tarballs) *** We DO archive - file *content* (= blobs) - *revisions* (= commits), with full metadata - *releases* (= tags), ditto - where (*origin*) & when (*visit*) we found any of the above # - time-indexed repo *snapshots* (i.e., we never delete anything) … in a VCS-/archive-agnostic *canonical data model* *** We DON'T archive # - diffs → derived data from related contents - homepages, wikis - BTS/issues/code reviews/etc. 
- mailing lists Long term vision: play our part in a /"semantic wikipedia of software"/ ** Architecture :PROPERTIES: :CUSTOM_ID: architecture :END: *** Data flow :PROPERTIES: :CUSTOM_ID: dataflow :END: # #+BEAMER: \begin{center}\includegraphics[width=\extblockscale{1.4\textwidth}]{swh-dataflow.pdf}\end{center} ** Data model :noexport: *** General schema - VCS-independent - fully deduplicated + files, directories and commits are /shared/ - biggest git-like /graph/ in the world *** \begin{center} \url{http://deb.li/swhdm} \end{center} *** full hash index (sha1, sha256, ...) Some funny facts: - the GPL2 licence appears under more than 500 names + including /aa.css.txt/ and /FullSync.txt/ ~ :-) ** Merkle DAG *** Merkle structure :PROPERTIES: :CUSTOM_ID: merkle :END: **** Merkle trees :PROPERTIES: :CUSTOM_ID: merkletree :END: # R. C. Merkle, A digital signature based on a conventional encryption # function, Crypto '87 #+BEAMER: \vspace{-3mm} ***** Merkle tree (R. C. Merkle, CRYPTO 1987) :B_picblock: :PROPERTIES: :BEAMER_opt: pic=merkle, leftpic=true, width=.7\linewidth :BEAMER_env: picblock :BEAMER_act: :END: Combination of - tree - hash function #+BEAMER: \pause #+BEAMER: \footnotesize ***** Classical cryptographic construction - fast, parallel signature of large data structures - widely used (e.g., Git, blockchains, IPFS, ...) 
- built-in deduplication #+BEAMER: \vspace{-1mm} **** Data Model :PROPERTIES: :CUSTOM_ID: datamodel :END: ***** The archive: a (giant) Merkle DAG #+BEAMER: \vspace{-3mm} #+BEAMER: \centering \includegraphics[width=\textwidth]{swh-data-model-h} **** The archive in a few pictures :PROPERTIES: :CUSTOM_ID: merkledemo :END: ***** A giant (extended) Merkle DAG #+LATEX: \only<1>{\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/merkle_1.pdf}}} #+LATEX: \only<2>{\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/contents.pdf}}} #+LATEX: \only<3>{\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/merkle_2_contents.pdf}}} #+LATEX: \only<4>{\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/directories.pdf}}} #+LATEX: \only<5>{\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/merkle_3_directories.pdf}}} #+LATEX: \only<6>{\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/revisions.pdf}}} #+LATEX: \only<7>{\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/merkle_4_revisions.pdf}}} #+LATEX: \only<8>{\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/releases.pdf}}} #+LATEX: \only<9>{\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/merkle_5_releases.pdf}}} # #+LATEX: {\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/merkle_1.pdf}}} *** A revision node :PROPERTIES: :CUSTOM_ID: merklerevision :END: **** Example: a Software Heritage revision ***** #+BEAMER: \vspace{-.5cm}\centering\includegraphics[width=0.9\textwidth]{git-merkle/revisions} ***** Note: most object kinds currently have Git-compatible identifiers *** Giant DAG :PROPERTIES: :CUSTOM_ID: giantdag :END: **** The archive: a (giant) Merkle DAG # Using an empty frame because the image is difficult to read on swh bg. 
# Finding a way to override image bg for just this frame would be better. ***** #+BEAMER: \centering \includegraphics[width=\extblockscale{\textwidth}]{git-merkle/merkle_5_releases} *** Giant DAG (single slide) :PROPERTIES: :CUSTOM_ID: giantdag1slide :END: **** The Software Heritage archive: a gigantic Merkle DAG #+LATEX: \centering\forcebeamerstart{} #+LATEX: \only<1>{\colorbox{white}{\includegraphics[width=.75\linewidth]{git-merkle/merkle_1}}} #+LATEX: \only<2>{\colorbox{white}{\includegraphics[width=.75\linewidth]{git-merkle/contents}}} #+LATEX: \only<3>{\colorbox{white}{\includegraphics[width=.75\linewidth]{git-merkle/merkle_2_contents}}} #+LATEX: \only<4>{\colorbox{white}{\includegraphics[width=.75\linewidth]{git-merkle/directories}}} #+LATEX: \only<5>{\colorbox{white}{\includegraphics[width=.75\linewidth]{git-merkle/merkle_3_directories}}} #+LATEX: \only<6>{\colorbox{white}{\includegraphics[width=.75\linewidth]{git-merkle/revisions}}} #+LATEX: \only<7>{\colorbox{white}{\includegraphics[width=.75\linewidth]{git-merkle/merkle_4_revisions}}} #+LATEX: \only<8>{\colorbox{white}{\includegraphics[width=.75\linewidth]{git-merkle/releases}}} #+LATEX: \only<9>{\colorbox{white}{\includegraphics[width=.75\linewidth]{git-merkle/merkle_5_releases}}} #+LATEX: \forcebeamerend{} *** Giant DAG (detailed) :PROPERTIES: :CUSTOM_ID: dagdetail :END: **** The archive: a (giant) Merkle DAG #+BEAMER: \vspace{-3mm} #+BEAMER: \centering \includegraphics[width=\textwidth]{swh-merkle-dag-wide} *** Giant DAG (detailed) :PROPERTIES: :CUSTOM_ID: dagdetailsmall :END: **** The archive: a (giant) Merkle DAG #+BEAMER: \centering #+BEAMER: \only<1>{\includegraphics[width=\textwidth]{swh-merkle-dag-small-visit1}} #+BEAMER: \only<2>{\includegraphics[width=\textwidth]{swh-merkle-dag-small-visit2}} #+BEAMER: \only<3>{\includegraphics[width=\textwidth]{swh-merkle-dag-small}} ** Technology :noexport: :PROPERTIES: :CUSTOM_ID: technology :END: *** Software stack :PROPERTIES: :CUSTOM_ID: swstack :END: 
**** 3rd party - Debian, Proxmox, ZFS on Linux, Puppet - PostgreSQL for metadata storage, with barman & pglogical - Celery (RabbitMQ backend) for task scheduling - Python3 and psycopg2 for the backend - Django, Bootstrap, D3.js for Web stuff **** in house - /ad hoc/ object storage (to avoid imposing tech to mirrors) - data model implementation, listers, loaders, scheduler - ~60 Git repositories (~20 Python packages, ~30 Puppet modules) - ~60 kSLOC Python / ~12 kSLOC SQL / ~4 kSLOC Puppet - licence choice: GPLv3 (backend) / AGPLv3 (frontend) *** Deployment architecture #+BEAMER: \vspace{1mm} #+BEAMER: \centering \includegraphics[height=.9\textheight]{general-architecture} *** Hardware stack :PROPERTIES: :CUSTOM_ID: hwstack :END: **** in house - 3x hypervisors with ~40 VMs - 1x high performance database server; read-only replica on a container - 2x dedicated storage servers, one of them using ZFS. - 3x high density storage array (2 x 60 x 6TB; 1 x 60 x 10TB) - 3x nodes for a kafka+elasticsearch cluster **** on Azure - full object storage mirror - full mirror of the database containing the graph - workers for content indexing - workers for download bundle preparation *** Software architecture :noexport: **** Module dependencies (internal + external) :B_picblock: :PROPERTIES: :BEAMER_env: picblock :BEAMER_opt: pic=swh-modules-deps-all,width=\linewidth :END: **** let's zoom in: http://deb.li/swhdeps ** Technology :noexport: :PROPERTIES: :CUSTOM_ID: technology-short :END: *** Deployment and resource usage **** Software - around 60k SLOC of custom Python code, running on Debian Stable - PostgreSQL database for the metadata storage - Full docker-compose development environment - Work in progress: scale-out metadata storage (Cassandra?) 
- Work in progress: mirroring infrastructure (Kafka) **** Hardware - 12 servers (hypervisors, database, storage, staging and testing infrastructure) / 40 virtual machines with mass storage and a backup server at Inria - In-kind sponsorship of cloud and storage resources (Microsoft, University of Bologna) ** Software development :noexport: :PROPERTIES: :CUSTOM_ID: development :END: *** Software development **** classic FOSS development - language: English - development mailing list #+BEAMER: \\{\small \url{https://sympa.inria.fr/sympa/info/swh-devel}} - IRC #+BEAMER: \\ #swh-devel / FreeNode - Forge #+BEAMER: \\{\small \url{https://forge.softwareheritage.org}} - Git, tasks, code review, etc. **** for more information #+BEAMER: \scriptsize https://www.softwareheritage.org/community/developers/ ** Roadmap :PROPERTIES: :CUSTOM_ID: features :END: *** Features... - (done) *lookup* by content hash - (done) *browsing*: "wayback machine" for source code (API + UI) - (early access) *deposit* of source code bundles directly to the archive - (early access) *save code now*, on-demand archive - (done) *download*: =wget= / =git clone= from the archive - (todo) *provenance* lookup for all archived content - (todo) *full-text search* on all archived source code files #+BEAMER: \pause *** ... and much more than one could possibly imagine all the world's software development history at hand's reach! 
** Web API :noexport: :PROPERTIES: :CUSTOM_ID: api :END: *** Web API :PROPERTIES: :CUSTOM_ID: apiintro :END: **** RESTful API to programmatically access the Software Heritage archive \\ *\url{https://archive.softwareheritage.org/api/}* **** Features - pointwise *browsing* of the archive - … snapshots → revisions → directories → contents … - full access to the *metadata* of archived objects - *crawling* information - /when have you last visited this Git repository I care about?/ - /where were its branches/tags pointing to at the time?/ # - derived information about archived contents (WIP) # - MIME type, programming language, license, etc. **** Endpoint index \url{https://archive.softwareheritage.org/api/1/} *** A tour of the Web API --- origins & visits :PROPERTIES: :CUSTOM_ID: apitourvisits :END: #+BEAMER: \footnotesize #+BEGIN_SRC GET https://archive.softwareheritage.org/api/1/origin/ \ git/url/https://github.com/hylang/hy { "id": 1, "origin_visits_url": "/api/1/origin/1/visits/", "type": "git", "url": "https://github.com/hylang/hy" } #+END_SRC #+BEAMER: \vfill #+BEGIN_SRC GET https://archive.softwareheritage.org/api/1/origin/ \ 1/visits/ [ ..., { "date": "2016-09-14T11:04:26.769266+00:00", "origin": 1, "origin_visit_url": "/api/1/origin/1/visit/13/", "status": "full", "visit": 13 }, ... ] #+END_SRC *** A tour of the Web API --- snapshots :PROPERTIES: :CUSTOM_ID: apitoursnapshots :END: #+BEAMER: \footnotesize #+BEGIN_SRC GET https://archive.softwareheritage.org/api/1/origin/ \ 1/visit/13/ { ..., "occurrences": { ..., "refs/heads/master": { "target": "b94211251...", "target_type": "revision", "target_url": "/api/1/revision/b94211251.../" }, "refs/tags/0.10.0": { "target": "7045404f3...", "target_type": "release", "target_url": "/api/1/release/7045404f3.../" }, ... 
}, "origin": 1, "origin_url": "/api/1/origin/1/", "status": "full", "visit": 13 } #+END_SRC *** A tour of the Web API --- releases :noexport: :PROPERTIES: :CUSTOM_ID: apitourreleases :END: #+BEAMER: \footnotesize #+BEGIN_SRC GET https://archive.softwareheritage.org/api/1/release/ \ 7045404f3d1c54e6473c71bbb716529fbad4be24/ { "author": { "email": "tag@pault.ag", "fullname": "Paul Tagliamonte ", "id": 96, "name": "Paul Tagliamonte" }, "date": "2014-04-10T23:01:28-04:00", "message": "0.10: The Oh f*ck it's PyCon release", "name": "0.10.0", "synthetic": false, "target": "6072557b6...", "target_type": "revision", "target_url": "/api/1/revision/6072557b6.../", ... } #+END_SRC *** A tour of the Web API --- revisions :PROPERTIES: :CUSTOM_ID: apitourrevisions :END: #+BEAMER: \footnotesize #+BEGIN_SRC GET https://archive.softwareheritage.org/api/1/revision/ \ 6072557b6c10cd9a21145781e26ad1f978ed14b9/ { "author": { "email": "tag@pault.ag", "fullname": "Paul Tagliamonte ", "id": 96, "name": "Paul Tagliamonte" }, "committer": { ... }, "date": "2014-04-10T23:01:11-04:00", "committer_date": "2014-04-10T23:01:11-04:00", "directory": "2df4cd84e...", "directory_url": "/api/1/directory/2df4cd84e.../", "history_url": "/api/1/revision/6072557b6.../log/", "merge": false, "message": "0.10: The Oh f*ck it's PyCon release", "parents": [ { "id": "10149f66e...", "url": "/api/1/revision/10149f66e.../" } ], ... 
} #+END_SRC *** A tour of the Web API --- contents :PROPERTIES: :CUSTOM_ID: apitourcontents :END: #+BEAMER: \footnotesize #+BEGIN_SRC GET https://archive.softwareheritage.org/api/1/content/ \ adc83b19e793491b1c6ea0fd8b46cd9f32e592fc/ { "data_url": "/api/1/content/sha1:adc83b19e.../raw/", "filetype_url": "/api/1/content/sha1:.../filetype/", "language_url": "/api/1/content/sha1:.../language/", "length": 1, "license_url": "/api/1/content/sha1:.../license/", "sha1": "adc83b19e...", "sha1_git": "8b1378917...", "sha256": "01ba4719c...", "status": "visible" } #+END_SRC #+BEAMER: \normalsize \vfill \pause **** Caveats - rate limits apply throughout the API - raw download available for textual contents ** Accessing the archive :noexport: :PROPERTIES: :CUSTOM_ID: accessing-short :END: *** Browse :B_block:BMCOL: :PROPERTIES: :BEAMER_col: 0.4 :BEAMER_env: block :END: #+BEAMER: \begin{center}\includegraphics[width=0.5\textwidth]{archive-browse}\end{center} - https://archive.softwareheritage.org/browse - way back machine for software source code #+BEAMER: \pause *** Web API :B_block:BMCOL: :PROPERTIES: :BEAMER_col: 0.4 :BEAMER_env: block :END: #+BEAMER: \begin{center}\includegraphics[width=0.5\textwidth]{archive-webapi}\end{center} - https://archive.softwareheritage.org/api - point-wise navigation of the archive as a graph ** Some technical challenges :PROPERTIES: :CUSTOM_ID: techchallenges :END: *** Expanding the archive - discover and classify /all/ the software sources - importers for other VCSs (SVN, Hg, ...) 
\hfill /We need your help!/ *** Staying current get new repositories and commits ASAP\\ \hfill /We need reliable, standardised event feeds./ *** Handling the backlog ingesting all the pre-existing data\\ \hfill /Decades of software development are waiting!/ diff --git a/common/modules/swh-ardc.org b/common/modules/swh-ardc.org index 27fce0b..1b29f07 100644 --- a/common/modules/swh-ardc.org +++ b/common/modules/swh-ardc.org @@ -1,275 +1,275 @@ #+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) # # Software is all around us # #+INCLUDE: "prelude.org" :minlevel 1 #+INCLUDE: "169.org" * Source code pillar of Open Science, and how Software Heritage addresses ARDC :PROPERTIES: :CUSTOM_ID: main :END: ** Source code is /special/ (software is /not/ data) :PROPERTIES: :CUSTOM_ID: swnotdata :END: *** /Executable/ and /human readable/ knowledge \hfill copyright law :noexport: /“Programs must be written for people to read, and only incidentally for machines to execute.”/\\ \hfill Harold Abelson #+BEAMER: \pause *** Software /evolves/ over time - projects may last decades - the /development history/ is key to its /understanding/ #+BEAMER: \pause *** Complexity :B_picblock: :PROPERTIES: :BEAMER_env: picblock :BEAMER_OPT: pic=python3-matplotlib.pdf, width=.6\linewidth :END: - /millions/ of lines of code - large /web of dependencies/ + easy to break, difficult to maintain + /research software/ a thin top layer - sophisticated /developer communities/ *** :B_ignoreheading: :PROPERTIES: :BEAMER_env: ignoreheading :END: #+BEAMER: \pause *** Precious, endangered /executable/ and /human readable/ knowledge key people *passing away*, platforms (GoogleCode, Gitorious, etc.) 
closing down ...\\ \hfill no organised effort to catalog and archive it ** Source code is /special/, cont'd :PROPERTIES: :CUSTOM_ID: swnotdatacontd :END: *** Versioning, granularity - Project :: “Inria created OCaml and Scikit-learn”\pause - Release :: “2D Voronoi Diagrams were introduced in CGAL 3.1.0”\pause - Precise state of a project :: “This result was produced using commit 0064fbd...”\pause - Code fragment :: “The core algorithm is in lines 101 to 143 of the file parmap.ml contained in the precise state of the project corresponding to commit 0064fbd....” #+BEAMER: \pause *** Authors can have multiple roles: - Architecture, Management, Development, Documentation, Testing, ... ** Software Source code: a pillar of Open Science :PROPERTIES: :CUSTOM_ID: pillaropenscience :END: *** Software is everywhere in modern research :B_picblock: :PROPERTIES: :BEAMER_opt: pic=papermountain, leftpic=true, width=.3\linewidth :BEAMER_env: picblock :BEAMER_COL: .6 :END: #+BEGIN_QUOTE [...] software [...] essential in their fields. 
\mbox{}\hfill Top 100 papers (Nature, 2014) #+END_QUOTE #+BEGIN_QUOTE Sometimes, if you dont have the software, you dont have the data \mbox{}\hfill Christine Borgman, Paris, 2018 #+END_QUOTE # http://www.nature.com/news/the-top-100-papers-1.16224 #+BEAMER: \pause *** Open Science: three pillars :B_block: :PROPERTIES: :BEAMER_COL: .45 :BEAMER_env: block :END: #+latex: \begin{center} #+ATTR_LATEX: :width \extblockscale{\linewidth} file:PreservationTriangle.png #+latex: \end{center} #+BEAMER: \pause *** :B_ignoreheading: :PROPERTIES: :BEAMER_env: ignoreheading :END: *** Nota bene \hfill The links in the picture are *essential* ** A plurality of needs :PROPERTIES: :CUSTOM_ID: userneeds :END: *** Researchers - *archive* and *reference* software used in articles - *find* useful software - get *credit* for developed software - verify/reproduce/improve results #+BEAMER: \pause *** Laboratories/teams - track software contributions - produce reports - maintain web page #+BEAMER: \pause *** Research Organization - know its *software assets* for: technology *transfer*, impact *metrics*, strategy ** What is at stake: ARDC \hfill in increasing order of difficulty :PROPERTIES: :CUSTOM_ID: ardc :END: *** Archive Research software artifacts must be properly *archived*\\ \hfill make sure we can /retrieve/ them (/reproducibility/) #+BEAMER: \pause *** :B_ignoreheading: :PROPERTIES: :BEAMER_env: ignoreheading :END: \vspace{-.5em} *** Reference Research software artifacts must be properly *referenced*\\ \hfill make sure we can /identify/ them (/reproducibility/) #+BEAMER: \pause *** :B_ignoreheading: :PROPERTIES: :BEAMER_env: ignoreheading :END: \vspace{-.5em} *** Describe Research software artifacts must be properly *described*\\ \hfill make it easy to /discover/ and /reuse/ them (/visibility/) #+BEAMER: \pause *** :B_ignoreheading: :PROPERTIES: :BEAMER_env: ignoreheading :END: \vspace{-.5em} *** Cite/Credit Research software artifacts must be properly *cited* /(not the same as 
referenced!)/\\ \hfill to give /credit/ to authors (/evaluation/!) *** :B_ignoreheading: :PROPERTIES: :BEAMER_env: ignoreheading :END: ** What is at stake: beyond ARDC :PROPERTIES: :CUSTOM_ID: beyondardc :END: *** Sustainability Organisational schemas, legal tools, ecomonic models, processes and policies to ensure research software can be maintained and sustained over time #+BEAMER: \pause *** Technology transfer and industry collaboration Approaches, support, methods, processes to establish connections with industry in order to foster uptake and transfer of research software ** Addressing the four ARDC needs (see [[https://dx.doi.org/10.1007/978-3-030-52200-1_36][ICMS 2020]] for details) :PROPERTIES: :CUSTOM_ID: swh-ardc-short :END: -*** Archive (8B+ files, 140M+ projects) +*** Archive (10B+ files, 150M+ projects) :PROPERTIES: :BEAMER_env: block :BEAMER_COL: .5 :END: #+ATTR_LATEX: :width .8\linewidth file:swh-dataflow-merkle.pdf \vspace{-1em} #+BEAMER: \pause - [[https://save.softwareheritage.org][save.softwareheritage.org]] - [[https://deposit.softwareheritage.org][deposit.softwareheritage.org]] # (HAL, IPOL) #+BEAMER: \pause *** Reference (20 billion SWHIDs) :B_block: :PROPERTIES: :BEAMER_env: block :BEAMER_COL: .5 :END: [[https://www.softwareheritage.org/2020/07/09/intrinsic-vs-extrinsic-identifiers/][Intrinsic, decentralised, cryptographically strong identifiers, SWHIDs]] \vspace{-1em} #+ATTR_LATEX: :width 1.02\linewidth file:SWHID-v1.4_3.png Now supported [[https://www.softwareheritage.org/2020/05/13/swhid-adoption/][in SPDX 2.2, Wikidata]] etc. 
#+BEAMER: \pause *** :B_ignoreheading: :PROPERTIES: :BEAMER_env: ignoreheading :END: *** Describe :B_block: :PROPERTIES: :BEAMER_env: block :BEAMER_COL: .5 :END: - /Intrinsic metadata/ from source code - Contributed the [[https://codemeta.github.io/codemeta-generator/][Codemeta generator]] #+BEAMER: \pause *** Cite/Credit :B_block: :PROPERTIES: :BEAMER_env: block :BEAMER_COL: .5 :END: - Contributed /software citation/ style [[https://www.ctan.org/tex-archive/macros/latex/contrib/biblatex-contrib/biblatex-software][biblatex-software, v 1.2-2 now on CTAN]] ** Addressing the A(archive) in ARDC (see [[https://dx.doi.org/10.1007/978-3-030-52200-1_36][ICMS 2020]] for details) :PROPERTIES: :CUSTOM_ID: swh-a :END: #+latex: \vspace{-0.8em} -*** /Universal/ source code archive \hfill not only resarch \hfill (8B+ files, 140M+ projects) +*** /Universal/ source code archive \hfill not only research \hfill (9B+ files, 150M+ projects) :PROPERTIES: :BEAMER_env: block :END: #+ATTR_LATEX: :width .6\linewidth file:swh-dataflow-merkle.pdf #+latex: \vspace{-1em} - your research software /is likely there already/! 
#+BEAMER: \pause - anyone can trigger archival with [[https://save.softwareheritage.org][save.softwareheritage.org]] #+BEAMER: \pause - selected partners can push to the archive via [[https://deposit.softwareheritage.org][deposit.softwareheritage.org]] # (HAL, IPOL) ** Addressing the R(eference) in ARDC (see [[https://dx.doi.org/10.1007/978-3-030-52200-1_36][ICMS 2020]] for details) :PROPERTIES: :CUSTOM_ID: swh-r :END: #+latex: \vspace{-0.8em} *** Software Heritage Identifiers (SWHID) \hfill [[https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html][link to full docs]] :B_block: :PROPERTIES: :BEAMER_env: block :END: 20+B [[https://www.softwareheritage.org/2020/07/09/intrinsic-vs-extrinsic-identifiers/][intrinsic, decentralised, cryptographically strong identifiers, SWHIDs]] # #+INCLUDE: "../../common/modules/swh-id-syntax.org::#swh-id-syntax" :only-contents t :minlevel 3 #+LATEX: \centering%\forcebeamerstart #+LATEX: \mode{\only<1>{\includegraphics[width=0.9\linewidth]{SWHID-v1.4_1.png}}} #+LATEX: \mode{\only<2>{\includegraphics[width=0.9\linewidth]{SWHID-v1.4_2.png}}} #+LATEX: \only<3->{\includegraphics[width=0.9\linewidth]{SWHID-v1.4_3.png}} #+LATEX: %\forcebeamerend *** :PROPERTIES: :BEAMER_act: <4-> :BEAMER_env: block :END: Emerging standard : Linux Foundation [[https://spdx.github.io/spdx-spec/appendix-VI-external-repository-identifiers/#persistent-id][SPDX 2.2]]; IANA registered; WikiData [[https://www.wikidata.org/wiki/Property:P6138][P6138]] #+latex: \vspace{-0.5em} *** Full fledged /source code references/ for reproducibility :B_block: :PROPERTIES: :BEAMER_act: <5-> :BEAMER_env: block :END: Examples: [[https://archive.softwareheritage.org/swh:1:cnt:64582b78792cd6c2d67d35da5a11bb80886a6409;origin=https://github.com/virtualagc/virtualagc;lines=245-261/][Apollo 11 AGC excerpt]], 
[[https://archive.softwareheritage.org/swh:1:cnt:bb0faf6919fc60636b2696f32ec9b3c2adb247fe;origin=https://github.com/id-Software/Quake-III-Arena;lines=549-572/][Quake III rsqrt]]; Guidelines available, see [[https://dx.doi.org/10.1007/978-3-030-52200-1_36][ICMS 2020]] #+BEAMER: \pause ** Addressing D(escribe) and C(ite) in ARDC (see [[https://dx.doi.org/10.1007/978-3-030-52200-1_36][ICMS 2020]] for details) :PROPERTIES: :CUSTOM_ID: swh-dc :END: *** Describe :B_block: :PROPERTIES: :BEAMER_env: block :BEAMER_COL: .5 :END: - Collect /intrinsic metadata/ - Contributed the [[https://codemeta.github.io/codemeta-generator/][Codemeta generator]] #+ATTR_LATEX: :width .8\linewidth file:CodeMetaGenerator.png #+BEAMER: \pause *** Cite/Credit :B_block: :PROPERTIES: :BEAMER_env: block :BEAMER_COL: .5 :END: - Contributed /software citation/ style [[https://www.ctan.org/tex-archive/macros/latex/contrib/biblatex-contrib/biblatex-software][biblatex-software, v 1.2-2 now on CTAN]] #+ATTR_LATEX: :width .8\linewidth file:BibLaTeX-swh.png