diff --git a/common/images/Makefile b/common/images/Makefile index 7c0aadc..8e175b7 100644 --- a/common/images/Makefile +++ b/common/images/Makefile @@ -1,33 +1,36 @@ GEN_STUFF = GEN_STUFF += PreservationPyramid.pdf GEN_STUFF += swh-dataflow.svg GEN_STUFF += swh-dataflow.pdf all: $(GEN_STUFF) support.pdf: support.odg soffice --headless --convert-to pdf support.odg "-env:UserInstallation=file:///tmp/LibreOffice_Conversion_${USER}" # incredible libreoffice bug, see http://stackoverflow.com/questions/30349542/command-libreoffice-headless-convert-to-pdf-test-docx-outdir-pdf-is-not pdfcrop support.pdf mv support-crop.pdf support.pdf sponsors.pdf: sponsors.odg soffice --headless --convert-to pdf sponsors.odg "-env:UserInstallation=file:///tmp/LibreOffice_Conversion_${USER}" # incredible libreoffice bug, see http://stackoverflow.com/questions/30349542/command-libreoffice-headless-convert-to-pdf-test-docx-outdir-pdf-is-not pdfcrop sponsors.pdf mv sponsors-crop.pdf sponsors.pdf %.svg: %.dia dia -t svg -e $@ $< %.pdf: %.svg inkscape --export-area-drawing --export-pdf $@ $< +swh-dataflow-merkle-listers.svg: swh-dataflow-merkle.dia + dia -t svg -L Background -e $@ $< + PreservationPyramid.pdf: PreservationPyramid.tex pdflatex PreservationPyramid.tex pdfcrop PreservationPyramid.pdf mv PreservationPyramid-crop.pdf PreservationPyramid.pdf clean: rm -f *.aux *.toc *.log *.snm *.nav *~ diff --git a/common/images/Parmap-browse-contextless-path.png b/common/images/Parmap-browse-contextless-path.png new file mode 100644 index 0000000..1998efd Binary files /dev/null and b/common/images/Parmap-browse-contextless-path.png differ diff --git a/common/images/acm_badges.png b/common/images/acm_badges.png new file mode 100644 index 0000000..bc0e86f Binary files /dev/null and b/common/images/acm_badges.png differ diff --git a/common/images/cloud_metadata_credit.png b/common/images/cloud_metadata_credit.png new file mode 100644 index 0000000..61aa409 Binary files /dev/null and 
b/common/images/cloud_metadata_credit.png differ diff --git a/common/images/growth.png b/common/images/growth.png deleted file mode 120000 index f2c83ce..0000000 --- a/common/images/growth.png +++ /dev/null @@ -1 +0,0 @@ -2018-01-archive-growth.png \ No newline at end of file diff --git a/common/images/growth.png b/common/images/growth.png new file mode 100644 index 0000000..085a496 Binary files /dev/null and b/common/images/growth.png differ diff --git a/common/images/hardtasks-xkcd.com-1425.png b/common/images/hardtasks-xkcd.com-1425.png new file mode 100644 index 0000000..febd917 Binary files /dev/null and b/common/images/hardtasks-xkcd.com-1425.png differ diff --git a/common/images/metadata_landscape_RDA11.png b/common/images/metadata_landscape_RDA11.png new file mode 100644 index 0000000..61a726f Binary files /dev/null and b/common/images/metadata_landscape_RDA11.png differ diff --git a/common/images/metadata_landscape_RDA11_1.png b/common/images/metadata_landscape_RDA11_1.png new file mode 100644 index 0000000..987b7ae Binary files /dev/null and b/common/images/metadata_landscape_RDA11_1.png differ diff --git a/common/images/metadata_landscape_RDA11_2.png b/common/images/metadata_landscape_RDA11_2.png new file mode 100644 index 0000000..0e933b8 Binary files /dev/null and b/common/images/metadata_landscape_RDA11_2.png differ diff --git a/common/images/metadata_landscape_RDA11_3.png b/common/images/metadata_landscape_RDA11_3.png new file mode 100644 index 0000000..44335c7 Binary files /dev/null and b/common/images/metadata_landscape_RDA11_3.png differ diff --git a/common/images/metadata_landscape_RDA11_4.png b/common/images/metadata_landscape_RDA11_4.png new file mode 100644 index 0000000..f8acda7 Binary files /dev/null and b/common/images/metadata_landscape_RDA11_4.png differ diff --git a/common/images/metadata_landscape_RDA11_5.png b/common/images/metadata_landscape_RDA11_5.png new file mode 100644 index 0000000..cc625eb Binary files /dev/null and 
b/common/images/metadata_landscape_RDA11_5.png differ diff --git a/common/images/swh-dataflow-merkle.dia b/common/images/swh-dataflow-merkle.dia new file mode 100644 index 0000000..9182dec Binary files /dev/null and b/common/images/swh-dataflow-merkle.dia differ diff --git a/common/logos/Qwant_Logo.png b/common/logos/Qwant_Logo.png new file mode 100644 index 0000000..cedc397 Binary files /dev/null and b/common/logos/Qwant_Logo.png differ diff --git a/common/modules/status-extended.org b/common/modules/status-extended.org index 95f31ab..d7524c2 100644 --- a/common/modules/status-extended.org +++ b/common/modules/status-extended.org @@ -1,410 +1,414 @@ #+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) #+INCLUDE: "prelude.org" :minlevel 1 # not to be included as a whole, just pick individual slides as you see fit * Status :PROPERTIES: :CUSTOM_ID: main :END: ** The people :PROPERTIES: :CUSTOM_ID: people :END: *** The core team :B_picblock: :PROPERTIES: :CUSTOM_ID: core-team-formal :BEAMER_env: picblock :BEAMER_opt: pic=team,width=.4\linewidth :END: - Roberto Di Cosmo - Stefano Zacchiroli - Nicolas Dandrimont (Engineer) - Antoine Dumont (Engineer) # - and /Jordi, Quentin and Guillaume/ *** Scientific advisors - Serge Abiteboul (French Science Academy) - Jean-François Abramatic (former W3C director) - Gerard Berry (CNRS Gold Medal, French Science Academy) - Julia Lawall (Coccinelle, Linux Kernel, Outreachy) ** Archive coverage :PROPERTIES: :CUSTOM_ID: archive :END: #+BEAMER: \vspace{-2mm} #+BEAMER: \begin{center}\includegraphics[width=\extblockscale{1.1\linewidth}]{2018-01-archive-growth.png}\end{center} #+BEAMER: \vspace{-2mm} *** Current sources - live: GitHub, Debian - one-off: Gitorious, Google Code, GNU - WIP: Bitbucket #+BEAMER: \pause *** 150 TB blobs, 5 TB database (as a graph: 7 B nodes + 60 B edges) #+BEAMER: \pause *** \hfill The /richest/ public source code archive, 
... and growing daily! ** The structure of the archive :noexport: *** On-disk storage - flat file storage for contents - postgres database for the metadata *** Data model: /one/ big Merkle DAG, inspired by the git model - Origins (= repositories) - Occurrences (= branches) - Releases (= tags) - Revisions (= commits) - Directories (= trees) - Contents (= blobs) ** Archiving goals :PROPERTIES: :CUSTOM_ID: archivinggoals :END: Targets: VCS repositories & source code releases (e.g., tarballs) *** We DO archive - file *content* (= blobs) - *revisions* (= commits), with full metadata - *releases* (= tags), ditto - where (*origin*) & when (*visit*) we found any of the above # - time-indexed repo *snapshots* (i.e., we never delete anything) … in a VCS-/archive-agnostic *canonical data model* *** We DON'T archive # - diffs → derived data from related contents - homepages, wikis - BTS/issues/code reviews/etc. - mailing lists Long term vision: play our part in a /"semantic wikipedia of software"/ ** Architecture :PROPERTIES: :CUSTOM_ID: architecture :END: *** Data flow :PROPERTIES: :CUSTOM_ID: dataflow :END: # #+BEAMER: \begin{center}\includegraphics[width=\extblockscale{1.2\textwidth}]{swh-dataflow.pdf}\end{center} ** Data model :noexport: *** General schema - VCS-independent - fully deduplicated + files, directories and commits are /shared/ - biggest git-like /graph/ in the world *** \begin{center} \url{http://deb.li/swhdm} \end{center} *** full hash index (sha1, sha256, ...) Some funny facts: - the GPL2 licence appears under more than 500 names + including /aa.css.txt/ and /FullSync.txt/ ~ :-) ** Merkle DAG *** Merkle structure :PROPERTIES: :CUSTOM_ID: merkle :END: **** Merkle trees :PROPERTIES: :CUSTOM_ID: merkletree :END: # R. C. Merkle, A digital signature based on a conventional encryption # function, Crypto '87 #+BEAMER: \vspace{-3mm} ***** Merkle tree (R. C. 
Merkle, Crypto 1979) :B_picblock: :PROPERTIES: :BEAMER_opt: pic=merkle, leftpic=true, width=.7\linewidth :BEAMER_env: picblock :BEAMER_act: :END: Combination of - tree - hash function #+BEAMER: \pause #+BEAMER: \footnotesize ***** Classical cryptographic construction - fast, parallel signature of large data structures - widely used (e.g., Git, blockchains, IPFS, ...) - built-in deduplication **** The archive in a few pictures :PROPERTIES: :CUSTOM_ID: merkledemo :END: ***** A giant (extended) Merkle DAG #+LATEX: \only<1>{\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/merkle_1.pdf}}} #+LATEX: \only<2>{\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/contents.pdf}}} #+LATEX: \only<3>{\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/merkle_2_contents.pdf}}} #+LATEX: \only<4>{\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/directories.pdf}}} #+LATEX: \only<5>{\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/merkle_3_directories.pdf}}} #+LATEX: \only<6>{\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/revisions.pdf}}} #+LATEX: \only<7>{\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/merkle_4_revisions.pdf}}} #+LATEX: \only<8>{\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/releases.pdf}}} #+LATEX: \only<9>{\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/merkle_5_releases.pdf}}} # #+LATEX: {\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/merkle_1.pdf}}} *** A revision node :PROPERTIES: :CUSTOM_ID: merklerevision :END: **** Example: a Software Heritage revision ***** #+BEAMER: \vspace{-.5cm}\centering\includegraphics[width=0.9\textwidth]{git-merkle/revisions} ***** Note: most object kinds currently have Git-compatible identifiers *** Giant DAG 
:PROPERTIES: :CUSTOM_ID: giantdag :END: **** The archive: a (giant) Merkle DAG # Using an empty frame because the image is difficult to read on swh bg. # Finding a way to override image bg for just this frame would be better. ***** #+BEAMER: \centering \includegraphics[width=\extblockscale{\textwidth}]{git-merkle/merkle_5_releases} *** Giant DAG (single slide) :PROPERTIES: :CUSTOM_ID: giantdag1slide :END: **** The Software Heritage archive: a gigantic Merkle DAG #+LATEX: \centering #+LATEX: \only<1>{\colorbox{white}{\includegraphics[width=.75\linewidth]{git-merkle/merkle_1}}} #+LATEX: \only<2>{\colorbox{white}{\includegraphics[width=.75\linewidth]{git-merkle/contents}}} #+LATEX: \only<3>{\colorbox{white}{\includegraphics[width=.75\linewidth]{git-merkle/merkle_2_contents}}} #+LATEX: \only<4>{\colorbox{white}{\includegraphics[width=.75\linewidth]{git-merkle/directories}}} #+LATEX: \only<5>{\colorbox{white}{\includegraphics[width=.75\linewidth]{git-merkle/merkle_3_directories}}} #+LATEX: \only<6>{\colorbox{white}{\includegraphics[width=.75\linewidth]{git-merkle/revisions}}} #+LATEX: \only<7>{\colorbox{white}{\includegraphics[width=.75\linewidth]{git-merkle/merkle_4_revisions}}} #+LATEX: \only<8>{\colorbox{white}{\includegraphics[width=.75\linewidth]{git-merkle/releases}}} #+LATEX: \only<9>{\colorbox{white}{\includegraphics[width=.75\linewidth]{git-merkle/merkle_5_releases}}} ** Technology :noexport: :PROPERTIES: :CUSTOM_ID: technology :END: *** Software stack **** 3rd party - Debian, Puppet - PostgreSQL for metadata storage, with barman & pglogical - Celery (RabbitMQ backend) for task scheduling - Python3 and psycopg2 for the backend - Flask and Bootstrap for Web stuff - Phabricator **** in house - /ad hoc/ object storage (to avoid imposing tech to mirrors) - data model implementation, listers, loaders, scheduler - - ~50 Git repositories (~20 Python packages, ~10 Puppet modules) + - ~60 Git repositories (~20 Python packages, ~30 Puppet modules) - ~30 kSLOC Python / 
~12 kSLOC SQL / ~4 kSLOC Puppet - licence choice: GPLv3 (backend) / AGPLv3 (frontend) *** Hardware stack **** in house - 2x hypervisors with ~20 VMs - 2x high density storage array (60 * 6TB => 300TB usable) **** on Azure - full object storage mirror + - full mirror of the database containing the graph - workers for content indexing + - workers for download bundle preparation +**** at the University of Bologna + - backend storage (60TB) for the bundles available for download *** Software architecture :noexport: **** Module dependencies (internal + external) :B_picblock: :PROPERTIES: :BEAMER_env: picblock :BEAMER_opt: pic=swh-modules-deps-all,width=\linewidth :END: **** let's zoom in: http://deb.li/swhdeps ** Software development :noexport: :PROPERTIES: :CUSTOM_ID: development :END: *** Software development **** classic FOSS development - language: English - development mailing list #+BEAMER: \\{\small \url{https://sympa.inria.fr/sympa/info/swh-devel}} - IRC #+BEAMER: \\ #swh-devel / FreeNode - Forge #+BEAMER: \\{\small \url{https://forge.softwareheritage.org}} - Git, tasks, code review, etc. **** for more information #+BEAMER: \scriptsize https://www.softwareheritage.org/community/developers/ ** Roadmap :PROPERTIES: :CUSTOM_ID: features :END: *** Features... - (done) *lookup* by content hash - *browsing*: "wayback machine" for archived code - - (preview) via Web API - - (stay tuned) via Web UI - - (done) *download*: =wget= / =git clone= from the archive - - (preview) *deposit* of source code bundles directly to the archive + - (done) via Web API + - (early access) via Web UI + - (early access) *deposit* of source code bundles directly to the archive + - (early access) *download*: =wget= / =git clone= from the archive - (todo) *provenance* lookup for all archived content - (todo) *full-text search* on all archived source code files #+BEAMER: \pause *** ... and much more than one could possibly imagine all the world's software development history at hand's reach! 
** Web API :noexport: :PROPERTIES: :CUSTOM_ID: api :END: *** Web API :PROPERTIES: :CUSTOM_ID: apiintro :END: **** RESTful API to programmatically access the Software Heritage archive \\ *\url{https://archive.softwareheritage.org/api/}* **** Features - pointwise *browsing* of the archive - … snapshots → revisions → directories → contents … - full access to the *metadata* of archived objects - *crawling* information - /when have you last visited this Git repository I care about?/ - /where were its branches/tags pointing to at the time?/ # - derived information about archived contents (WIP) # - MIME type, programming language, license, etc. **** Endpoint index \url{https://archive.softwareheritage.org/api/1/} *** A tour of the Web API --- origins & visits :PROPERTIES: :CUSTOM_ID: apitourvisits :END: #+BEAMER: \footnotesize #+BEGIN_SRC GET https://archive.softwareheritage.org/api/1/origin/ \ git/url/https://github.com/hylang/hy { "id": 1, "origin_visits_url": "/api/1/origin/1/visits/", "type": "git", "url": "https://github.com/hylang/hy" } #+END_SRC #+BEAMER: \vfill #+BEGIN_SRC GET https://archive.softwareheritage.org/api/1/origin/ \ 1/visits/ [ ..., { "date": "2016-09-14T11:04:26.769266+00:00", "origin": 1, "origin_visit_url": "/api/1/origin/1/visit/13/", "status": "full", "visit": 13 }, ... ] #+END_SRC *** A tour of the Web API --- snapshots :PROPERTIES: :CUSTOM_ID: apitoursnapshots :END: #+BEAMER: \footnotesize #+BEGIN_SRC GET https://archive.softwareheritage.org/api/1/origin/ \ 1/visit/13/ { ..., "occurrences": { ..., "refs/heads/master": { "target": "b94211251...", "target_type": "revision", "target_url": "/api/1/revision/b94211251.../" }, "refs/tags/0.10.0": { "target": "7045404f3...", "target_type": "release", "target_url": "/api/1/release/7045404f3.../" }, ... 
}, "origin": 1, "origin_url": "/api/1/origin/1/", "status": "full", "visit": 13 } #+END_SRC *** A tour of the Web API --- releases :noexport: :PROPERTIES: :CUSTOM_ID: apitourreleases :END: #+BEAMER: \footnotesize #+BEGIN_SRC GET https://archive.softwareheritage.org/api/1/release/ \ 7045404f3d1c54e6473c71bbb716529fbad4be24/ { "author": { "email": "tag@pault.ag", "fullname": "Paul Tagliamonte ", "id": 96, "name": "Paul Tagliamonte" }, "date": "2014-04-10T23:01:28-04:00", "message": "0.10: The Oh f*ck it's PyCon release", "name": "0.10.0", "synthetic": false, "target": "6072557b6...", "target_type": "revision", "target_url": "/api/1/revision/6072557b6.../", ... } #+END_SRC *** A tour of the Web API --- revisions :PROPERTIES: :CUSTOM_ID: apitourrevisions :END: #+BEAMER: \footnotesize #+BEGIN_SRC GET https://archive.softwareheritage.org/api/1/revision/ \ 6072557b6c10cd9a21145781e26ad1f978ed14b9/ { "author": { "email": "tag@pault.ag", "fullname": "Paul Tagliamonte ", "id": 96, "name": "Paul Tagliamonte" }, "committer": { ... }, "date": "2014-04-10T23:01:11-04:00", "committer_date": "2014-04-10T23:01:11-04:00", "directory": "2df4cd84e...", "directory_url": "/api/1/directory/2df4cd84e.../", "history_url": "/api/1/revision/6072557b6.../log/", "merge": false, "message": "0.10: The Oh f*ck it's PyCon release", "parents": [ { "id": "10149f66e...", "url": "/api/1/revision/10149f66e.../" } ], ... 
} #+END_SRC *** A tour of the Web API --- contents :PROPERTIES: :CUSTOM_ID: apitourcontents :END: #+BEAMER: \footnotesize #+BEGIN_SRC GET https://archive.softwareheritage.org/api/1/content/ \ adc83b19e793491b1c6ea0fd8b46cd9f32e592fc/ { "data_url": "/api/1/content/sha1:adc83b19e.../raw/", "filetype_url": "/api/1/content/sha1:.../filetype/", "language_url": "/api/1/content/sha1:.../language/", "length": 1, "license_url": "/api/1/content/sha1:.../license/", "sha1": "adc83b19e...", "sha1_git": "8b1378917...", "sha256": "01ba4719c...", "status": "visible" } #+END_SRC #+BEAMER: \normalsize \vfill \pause **** Caveats - rate limits apply throughout the API - blob download available for selected contents ** Some technical challenges :PROPERTIES: :CUSTOM_ID: techchallenges :END: *** Expanding the archive - discover and classify /all/ the software sources - importers for other VCSs (SVN, Hg, ...) \hfill /We need your help!/ *** Staying current get new repositories and commits ASAP\\ \hfill /We need reliable, standardised event feeds./ *** Handling the backlog ingesting all the pre-existing data\\ \hfill /Decades of software development are waiting!/ diff --git a/common/modules/support-compact.org b/common/modules/support-compact.org index a8eb241..68c6d59 100644 --- a/common/modules/support-compact.org +++ b/common/modules/support-compact.org @@ -1,35 +1,34 @@ #+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) #+INCLUDE: "prelude.org" :minlevel 1 * Support :PROPERTIES: :CUSTOM_ID: main :END: ** Growing Support :PROPERTIES: :CUSTOM_ID: support :END: -*** Raising awareness - April 3rd, 2017: landmark Inria Unesco agreement on source code access and preservation\\ +*** Raising awareness: landmark Inria Unesco agreement, April 3rd, 2017 #+BEGIN_EXPORT latex \includegraphics[width=\extblockscale{.25\linewidth}]{inria-logo-new} \hfill 
\includegraphics[width=\extblockscale{.35\linewidth}]{unesco-accord} \hfill \includegraphics[width=\extblockscale{.2\linewidth}]{unesco}\\[1em] #+END_EXPORT *** Sharing the vision :B_block: :PROPERTIES: :CUSTOM_ID: endorsement - :BEAMER_COL: .45 + :BEAMER_COL: .5 :BEAMER_env: block :END: - #+LATEX: \begin{center}\includegraphics[width=\extblockscale{\linewidth}]{support.pdf}\end{center} + #+LATEX: \begin{center}\includegraphics[width=\extblockscale{1.3\linewidth}]{support.pdf}\end{center} *** See more :noexport: \hfill\tiny\url{http://www.softwareheritage.org/support/testimonials} *** Sponsoring our work :B_block: :PROPERTIES: :CUSTOM_ID: sponsors - :BEAMER_COL: .45 + :BEAMER_COL: .5 :BEAMER_env: block :END: - #+LATEX: \begin{center}\includegraphics[width=\extblockscale{.3\linewidth}]{inria-logo-new}\end{center} - #+LATEX: \begin{center}\includegraphics[width=\extblockscale{.8\linewidth}]{sponsors.pdf}\end{center} + #+LATEX: \begin{center}\includegraphics[width=\extblockscale{.4\linewidth}]{inria-logo-new}\end{center} + #+LATEX: \begin{center}\includegraphics[width=\extblockscale{\linewidth}]{sponsors.pdf}\end{center} # - sponsoring / partnership :: \hfill \url{sponsorship.softwareheritage.org} diff --git a/common/modules/swh-goals-oneslide-vertical.org b/common/modules/swh-goals-oneslide-vertical.org index bf11051..173f1b9 100644 --- a/common/modules/swh-goals-oneslide-vertical.org +++ b/common/modules/swh-goals-oneslide-vertical.org @@ -1,55 +1,55 @@ #+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) # # Software is all around us # #+INCLUDE: "prelude.org" :minlevel 1 * How Software Heritage changes the world :PROPERTIES: :CUSTOM_ID: main :END: ** Software Heritage in a nutshell :PROPERTIES: :CUSTOM_ID: goals :END: #+latex: \begin{center} #+ATTR_LATEX: :width \extblockscale{.8\linewidth} file:SWH-logo+motto.pdf #+latex: \end{center} *** /Collect, preserve and share/ the 
/source code/ of /all the software/ \hfill Preserving our heritage, enabling better software and better science for all #+BEAMER: \pause *** Reference catalog :PROPERTIES: :BEAMER_env: block :BEAMER_COL: .3 :END: #+BEGIN_EXPORT latex \begin{center} \includegraphics[width=.6\linewidth]{myriadsources} \end{center} #+END_EXPORT find and reference *all* the source code #+BEAMER: \pause *** Universal archive :PROPERTIES: :BEAMER_env: block :BEAMER_COL: .3 :END: #+BEGIN_EXPORT latex \begin{center} \includegraphics[width=.6\linewidth]{fragilecloud} \end{center} #+END_EXPORT preserve *all* the source code #+BEAMER: \pause *** Research Infrastructure :B_block: :PROPERTIES: :BEAMER_COL: .3 :BEAMER_env: block :END: #+BEGIN_EXPORT latex \begin{center} \includegraphics[width=.7\linewidth]{atacama-telescope} \end{center} #+END_EXPORT - Enable analysis of *all* the source code + enable analysis of *all* the source code diff --git a/common/modules/urls-decay.org b/common/modules/urls-decay.org index 8056be6..c190e72 100644 --- a/common/modules/urls-decay.org +++ b/common/modules/urls-decay.org @@ -1,60 +1,66 @@ #+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) #+INCLUDE: "prelude.org" :minlevel 1 * Why URLs are not good references :PROPERTIES: :CUSTOM_ID: main :END: ** Disruption of the /web of reference/ + :PROPERTIES: + :CUSTOM_ID: rfc + :END: *** Web links \emph{are not} permanent (even \emph{permalinks}) :B_picblock: :PROPERTIES: :BEAMER_env: picblock :BEAMER_opt: pic=404 :END: \itshape there is no general guarantee that a URL... which at one time points to a given object continues to do so \\ \hfill T. Berners-Lee et al. Uniform Resource Locators. RFC 1738. #+BEAMER: \pause *** URLs used in articles /decay/! 
Analysis of /IEEE Computer/ (Computer), and the /Communications of the ACM/ (CACM): 1995-1999 - the /half-life/ of a referenced URL /is approximately 4 years/ from its publication date \hfill D. Spinellis. The Decay and Failures of URL References.\\ \hfill Communications of the ACM, 46(1):71-77, January 2003.\\ #+BEAMER: \pause *** Similar findings in Lawrence, S. et al. /Persistence of Web References in Scientific Research/, IEEE Computer, 34(2), pp. 26–31, 2001. ** Scholar roster of broken links + :PROPERTIES: + :CUSTOM_ID: examples + :END: *** An example from Astronomy :B_picblock: :PROPERTIES: :BEAMER_env: picblock :BEAMER_opt: pic=journal.pone.0104798.t001.png,leftpic=true,width=.99\linewidth :END: *** :B_ignoreheading: :PROPERTIES: :BEAMER_env: ignoreheading :END: /How Do Astronomers Share Data?/\\ Pepe, Goodman, Muench, Crosas, Erdmann \hfill /PLOS August 28, 2014/\\ dx.doi.org/10.1371/journal.pone.0104798 ** Cool URLs (should not) change *** What makes a cool URI? A cool URI is one which does not change.\\ What sorts of URI change?\\ URIs don't change: /people change them/.\\ \mbox{}\\ \hfill Tim Berners-Lee, 1998\\ \hfill https://www.w3.org/Provider/Style/URI *** Yes, /people/ change them...\\ \hfill sometimes behind your back! ** Disruption of the web of reference: Inria's own Gforge #+BEGIN_EXPORT latex \begin{center} \includegraphics[width=\extblockscale{.65\linewidth}]{gforge-changed-url} \end{center} #+END_EXPORT #+BEAMER: \pause *** Fixed, adding a redirection, by the Gforge team in /1 day/ this one was fixed!\\ #+BEAMER: \pause \hfill Not always that lucky, though ... 
diff --git a/talks-public/2017-12-07-ACM/2017-12-07-ACM.org b/talks-public/2017-12-07-ACM/2017-12-07-ACM.org index a326658..a02f20a 100644 --- a/talks-public/2017-12-07-ACM/2017-12-07-ACM.org +++ b/talks-public/2017-12-07-ACM/2017-12-07-ACM.org @@ -1,335 +1,291 @@ #+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) #+TITLE: Software Heritage #+SUBTITLE: A new essential infrastructure for Software Source Code #+BEAMER_HEADER: \title{Software Heritage} #+AUTHOR: Roberto Di Cosmo #+EMAIL: roberto@dicosmo.org #+BEAMER_HEADER: \date[December 7th, 2017]{December 7th, 2017\\[-1em]} #+BEAMER_HEADER: \title[www.softwareheritage.org]{Software Heritage} #+BEAMER_HEADER: \author[Roberto Di Cosmo \hspace{5em} www.dicosmo.org]{Roberto Di Cosmo\\[1em]% #+BEAMER_HEADER: Director, Software Heritage\\Computer Science full professor, Inria and IRIF\\[-1em]} # #+BEAMER_HEADER: \setbeameroption{show notes on second screen} #+BEAMER_HEADER: \setbeameroption{hide notes} #+KEYWORDS: software heritage legacy preservation knowledge mankind technology # # prelude.org contains all the information needed to export the main beamer latex source # use prelude-toc.org to get the table of contents # #+INCLUDE: "../../common/modules/prelude-toc.org" :minlevel 1 #+INCLUDE: "../../common/modules/169.org" # +LaTeX_CLASS_OPTIONS: [aspectratio=169,handout,xcolor=table] # # If you want to change the title logo it's here # # +BEAMER_HEADER: \titlegraphic{\includegraphics[width=0.7\textwidth]{SWH-logo}} # aspect ratio can be changed, but the slides need to be adapted # - compute a "resizing factor" for the images (macro for picblocks?) 
# # set the background image # # https://pacoup.com/2011/06/12/list-of-true-169-resolutions/ # #+BEAMER_HEADER: \pgfdeclareimage[height=90mm,width=160mm]{bgd}{swh-world-169.png} #+BEAMER_HEADER: \setbeamertemplate{background}{\pgfuseimage{bgd}} #+LATEX_HEADER: \usepackage{supertabular} #+LATEX_HEADER: \newcommand{\sponsor}[2]{{\bf #1}, #2} #+LATEX_HEADER: \newcommand{\teamster}[2]{{\textcolor{red}{#1}}, #2} * Introductions :noexport: ** Short Bio # +BEAMER: \raisebox{-.5\height}{\includegraphics[width=.28\linewidth]{rdc}} Roberto Di Cosmo\\ Computer Science professor in Paris\\ now working at INRIA\\ /20 years/ of Free and Open Source Software\\ \mbox{}\\ \begin{minipage}[c]{0.18\linewidth} \includegraphics[width=1.0\linewidth]{rdc} \end{minipage} \begin{minipage}[c]{0.8\linewidth} \begin{description} % \item[1998] \emph{Cybersnare} -- voice of French FOSS \item[1999] \emph{DemoLinux} -- first live GNU/Linux distro % \item[2004] \emph{EDOS} -- check package dependencies \item[2007] \emph{Free Software Thematic Group}\\ %\tiny{\url{http://www.systematic-paris-region.org/fr/logiciel-libre}}\\ ~150 members ~40 projects ~200Me % \item[2008] \emph{Mancoosi project} \url{www.mancoosi.org} \item[2010] \emph{IRILL} \url{www.irill.org} \item[2015] \emph{Software Heritage} at INRIA \end{description} \end{minipage} * Software is everywhere... 
:noexport: ** Software is everywhere :noexport: :PROPERTIES: :CUSTOM_ID: softwareispervasive :END: #+latex: \begin{center} #+ATTR_LATEX: :width .6\linewidth file:software-center.pdf #+latex: \end{center} #+BEAMER: \pause *** :PROPERTIES: :BEAMER_env: block :END: \hfill Software embodies our collective *Knowledge* and *Cultural Heritage* # why software source code is special (2 slides) # #+INCLUDE: "../../common/modules/source-code-different-long.org::#thesourcecode" :minlevel 2 ** Source code is essential :noexport: #+INCLUDE: "../../common/modules/source-code-different-long.org::#softwareisdifferent" :only-contents t :minlevel 3 ** 50 years of software source code #+INCLUDE: "../../common/modules/50years-source-code.org::#apollolinux" :only-contents t :minlevel 3 * Software Heritage # # One slide motivation + goals # #+INCLUDE: "../../common/modules/swh-goals-oneslide-vertical.org::#goals" :minlevel 2 # # * Building the network # Where we are today: endorsement # ** Our principles \hfill iPres 2017 - \url{http://bit.ly/swhpaper} # #+INCLUDE: "../../common/modules/principles-compact.org::#principlesstatus" :only-contents t :minlevel 3 ** Our principles \hfill iPres 2017 - \url{http://bit.ly/swhpaper} :PROPERTIES: :CUSTOM_ID: principlesstatus :END: #+latex: \begin{center} #+ATTR_LATEX: :width .6\linewidth file:SWH-as-foundation-slim.png #+latex: \end{center} #+latex: \footnotesize\vspace{-3mm} # # #+BEAMER: \pause #+BEAMER: \pause #+latex: \centering #+ATTR_LATEX: :width \extblockscale{.8\linewidth} file:growth.png #+BEAMER: \pause *** Open approach :B_block:BMCOL: :PROPERTIES: :BEAMER_col: 0.3 :BEAMER_env: block :END: - open source - transparency *** In for the long haul :B_block:BMCOL: :PROPERTIES: :BEAMER_col: 0.3 :BEAMER_env: block :END: - non profit, replication - *intrinsic* identifiers # *** Exhaustive :B_block:BMCOL: :PROPERTIES: :BEAMER_col: 0.3 :BEAMER_env: block :END: - *all* software - open to *all* communities *** :B_ignoreheading: :PROPERTIES: 
:BEAMER_env: ignoreheading :END: *** ** Growing Support #+INCLUDE: "../../common/modules/support-compact.org::#support" :only-contents t :minlevel 3 * Relevance for research software publishing ** Zoom on the collection phase # #+BEAMER: \begin{center}\includegraphics[width=\extblockscale{.8\textwidth}]{swh-dataflow.pdf}\end{center} ** Zoom on the collection phase *** Much more than an archive! :B_picblock: :PROPERTIES: :BEAMER_env: picblock :BEAMER_OPT: pic=swh-dataflow.pdf,width=.65\linewidth,leftpic=true :END: - GitHub - Debian, GNU - Gitorious, Google Code - Bitbucket (WIP), FusionForge (WIP) - /add your own plugins!/ #+BEAMER: \pause *** Important properties - mission: *exhaustive* and *up to date* collection of *source code*, /specifically/ - strategy: *automatic* harvesting + /deposit/ from /selected/ sources #+BEAMER: \pause *** \hfill The /richest/ source code archive already, ... and growing daily! -** The research software (deposit) use case -*** Beta testing with HAL \hfill \url{https://hal.archives-ouvertes.fr/} :B_picblock: - :PROPERTIES: - :BEAMER_env: picblock - :BEAMER_OPT: pic=deposit-communication.png,width=.63\linewidth,leftpic=true -# :BEAMER_OPT: pic=software_publication_state_diagram.png,width=.63\linewidth,leftpic=true - :END: -#+LATEX: \pause - *\hspace{1em}Generic mechanism:* - - SWORD based - - review process - - versioning - - /industry chimes in/ (details on demand) -#+BEAMER: \pause - *\hspace{1em}Variants:* - - just provide SWH hash and metadata - - just provide SWH hash, extract metadata - - ... -#+BEAMER: \pause -*** Feedback is welcome - \hfill drop me a line if you want to join the test group -** Coming soon: access using intrinsic IDs -*** Getting close to it ... \hfill click in the paper and view the source - "Our *Parmap.parmap* and *Parmap.parfold* functions may be used to seamlessly ..." 
-# \mbox{} \hfill https://doi.org/10.1016/j.procs.2012.04.202 -# replace OCaml List map and fold standard functions preserving their full -# functional semantics..." -*** :B_ignoreheading: - :PROPERTIES: - :BEAMER_env: ignoreheading - :END: - #+BEAMER: \begin{center}\includegraphics[width=\extblockscale{1.4\textwidth}]{Parmap-browse-contextless-path.png}\end{center} -** Selected unique benefits \hfill there are more! -*** All features of Software Heritage /for free/ - - *intrinsic IDs* (integrity, not just DIOs!), browse, download (now) - - metadata, licenses, provenance analysis (plagiarism detection), classification (wip) - - and many more (powerful connections with SE and Industry) -#+LATEX: \pause -*** Coverage and uniformity - - *one* archive for *all* domains (industry included) - - you can reference /any/ software, not just the deposited one\\ - \hfill /(thanks D. Katz for pointing this out)/ - - *git-compatible* identifiers greatly simplify workflows -#+LATEX: \pause -*** Sustainability \hfill ... doors are open! - \mbox{}\hfill /one/ infrastructure \hfill /independent/ non profit foundation \hfill /worldwide/ mirrors\hfill\mbox{} +#+INCLUDE: "../../common/modules/swh-scientific-publishing.org::#main" :only-contents t :minlevel 2 + # * Conclusion ** Come in, we're open! 
#+latex: \begin{center} #+ATTR_LATEX: :width .9\linewidth file:SWH-logo.pdf #+latex: \end{center} #+latex: \begin{center} #+latex: {\large \url{www.softwareheritage.org} \hspace{4em} \url{@swheritage}}\\ #+latex: \mbox{}\hfill Talks, slides: {\large \url{annex.softwareheritage.org/public/talks}} #+latex: \end{center} *** Get involved - sponsoring / partnership :: \hfill \url{sponsorship.softwareheritage.org} - working groups, leads :: \hfill \url{wiki.softwareheritage.org} - our own code :: \hfill \url{forge.softwareheritage.org} - metadata :: \hfill RDA source code IG * Appendix :B_appendix: :PROPERTIES: :BEAMER_env: appendix :END: * Intrinsic PID ** Our challenge in the PID arena *** Long term Identifiers must be there for the long term *** No middle man Identifiers must be meaningful even if resolvers go away *** Integrity, not just naming Identifier must ensure that the retrieved object is the intended one *** Uniqueness by design one name identifies a single object, and each object has only one name ** Exploring the PID landscape *** A lot of options out there... URL, URI, PURL, URN, ARK, DOI, ... *** ... some are widely used - articles - data - even software artefacts! #+BEAMER: \pause *** We can get no satisfaction \hfill of all the key criteria #+BEAMER: \pause *** \hfill we adopted something radically different \hfill ** Intrinsic identifiers in Software Heritage # R. C. Merkle, A digital signature based on a conventional encryption # function, Crypto '87 #+BEAMER: \vspace{-3mm} ***** Merkle tree (R. C. Merkle, Crypto 1979) :B_picblock: :PROPERTIES: :BEAMER_opt: pic=merkle, leftpic=true, width=.5\linewidth :BEAMER_env: picblock :BEAMER_act: :END: Combination of - tree - hash function ***** Classical cryptographic construction fast, parallel signature of large data structures, built-in deduplication #+BEAMER: \pause - satisfies all three criteria - widely used in industry (e.g., Git, nix, blockchains, IPFS, ...) ** Back to basics: DIOs vs. 
IDOs *** DIO (digital identifier of an object) - digital identifiers for traditional (non digital) objects - epistemic complications (manifestations, versions, locations, etc.) - significant governance issues, ... #+BEAMER: \pause *** IDO (identifier of a digital object) - (digital) identifier for digital objects - much simpler to build/handle - can (and must) be intrinsic #+BEAMER: \pause *** Separation of concerns - yes, we \alert{need both} DIOs and IDOs - no, we \alert{must not mistake} DIOs for IDOs (and viceversa) #+BEAMER: \pause ** Working together *** Example: links to /software source code/ in an article Leveraging the Software Heritage universal archive: - set of files :: \small\url{swh:1:tree:06741c8c37c5a384083082b99f4c5ad94cd0cd1f}\\ id of tree object listing all the files in a project (at a given time) - revision :: \url{swh:1:rev:7598fb94d59178d65bd8d2892c19356290f5d4e3}\\ id of commit object which a tree and (a pointer to) the history #+BEAMER: \pause - metadata :: this /will/ involve some form of DIO - and we get all the complications back * Our role in the publication workflow ** Our role : handle /all/ the /software source code/ *** At the end of the process Explicit deposit, coordinated with the publisher - store the /final/ source code (no garbage) - store only public source code - *N.B.:* no embargo or access control (yet) *** During the review Access to the largest available source code base + provenance, plagiarism detection (for new code) + metrics (for long standing projects) #+BEAMER: \pause *** Later on - Support embargo/access control * The Metadata challenge ** Collecting metadata for 60+ million projects *** Landscape of Software Ontologies #+latex: \begin{center} #+ATTR_LATEX: :width .75\linewidth file:metadata_landscape6.png #+latex: \end{center} *** It's the real world! reconcile metadata from different origins, handle conflicts, synthesise missing information, classify (automatically) the projects, etc. 
* Collection strategies ** All the source code #+BEAMER: \begin{center}\includegraphics[width=\extblockscale{\linewidth}]{swh-collect-axes}\end{center} ** All the source code, strategies #+BEAMER: \begin{center}\includegraphics[width=\extblockscale{\linewidth}]{swh-collect-strategies}\end{center} ** Online, open source code: automation overview #+BEAMER: \begin{center}\includegraphics[width=\extblockscale{\linewidth}]{swh-automation}\end{center} diff --git a/talks-public/2018-02-13-inria-saclay/2018-02-13-inria-saclay.org b/talks-public/2018-02-13-inria-saclay/2018-02-13-inria-saclay.org new file mode 100644 index 0000000..f15d681 --- /dev/null +++ b/talks-public/2018-02-13-inria-saclay/2018-02-13-inria-saclay.org @@ -0,0 +1,221 @@ +#+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) +#+TITLE: Software Heritage: Preserving the Free Software Commons +# does not allow short title, so we override it for beamer as follows : +#+BEAMER_HEADER: \title[Software Heritage]{Software Heritage\\Preserving the Free Software Commons} +#+BEAMER_HEADER: \author{Nicolas Dandrimont} +#+BEAMER_HEADER: \date[2018-02-13 Inria Saclay]{13 February 2018\\Demandez le Programme!
- Inria Saclay} +#+AUTHOR: Nicolas Dandrimont +#+DATE: 13 February 2018 +#+EMAIL: nicolas@dandrimont.eu +#+DESCRIPTION: Software Heritage: Preserving the Free Software Commons +#+KEYWORDS: software heritage legacy preservation knowledge mankind technology + +#+INCLUDE: "../../common/modules/prelude-toc.org" :minlevel 1 +#+INCLUDE: "../../common/modules/169.org" +#+BEAMER_HEADER: \institute[Software Heritage]{Software Engineer - Software Heritage\\\href{mailto:nicolas@dandrimont.eu}{\tt nicolas@dandrimont.eu}} + +#+LATEX_HEADER: \usepackage{bbding} +#+LATEX_HEADER: \DeclareUnicodeCharacter{66D}{\FiveStar} +* The Software Commons + #+INCLUDE: "../../common/modules/source-code-different-short.org::#softwareisdifferent" :minlevel 2 +** Our Software Commons + #+INCLUDE: "../../common/modules/foss-commons.org::#commonsdef" :only-contents t + #+BEAMER: \pause +*** Source code is /a precious part/ of our commons + \hfill are we taking care of it? + # #+INCLUDE: "../../common/modules/swh-motivations-foss.org::#main" :only-contents t :minlevel 2 + #+INCLUDE: "../../common/modules/swh-motivations-foss.org::#fragile" :minlevel 2 + #+INCLUDE: "../../common/modules/swh-motivations-foss.org::#research" :minlevel 2 +* Software Heritage + #+INCLUDE: "../../common/modules/swh-overview-sourcecode.org::#mission" :minlevel 2 +** Our principles + #+latex: \begin{center} + #+ATTR_LATEX: :width .9\linewidth + file:SWH-as-foundation-slim.png + #+latex: \end{center} +#+BEAMER: \pause +*** Open approach :B_block:BMCOL: + :PROPERTIES: + :BEAMER_col: 0.4 + :BEAMER_env: block + :END: + - 100% FOSS + - transparency +*** In for the long haul :B_block:BMCOL: + :PROPERTIES: + :BEAMER_col: 0.4 + :BEAMER_env: block + :END: + - replication + - non profit + +* Architecture + #+INCLUDE: "../../common/modules/status-extended.org::#archivinggoals" :minlevel 2 + #+INCLUDE: "../../common/modules/status-extended.org::#architecture" :only-contents t +# #+INCLUDE: 
"../../common/modules/status-extended.org::#merkletree" :minlevel 2 + #+INCLUDE: "../../common/modules/status-extended.org::#merklerevision" :only-contents t + #+INCLUDE: "../../common/modules/status-extended.org::#giantdag" :only-contents t + #+INCLUDE: "../../common/modules/status-extended.org::#archive" :minlevel 2 + #+INCLUDE: "../../common/modules/status-extended.org::#technology" :only-contents t + #+INCLUDE: "../../common/modules/status-extended.org::#development" :only-contents t + #+INCLUDE: "../../common/modules/status-extended.org::#features" :minlevel 2 + +* Gory details +** Technology: how do you store the SWH DAG? + +*** Problem statement +- How would you store and query a graph with 10 billion nodes and 60 billion edges? +- How would you store the contents of more than 3 billion files, 300TB of raw data? +- on a limited budget (100 000 € of hardware overall) + +#+BEAMER: \pause + +*** Our hardware stack +- two hypervisors with 512GB RAM, 20TB SSD each, sharing access to a storage array (60 x 6TB spinning rust) +- one backup server with 48GB RAM and another storage array + +*** Our software stack +- A RDBMS (PostgreSQL, what else?), for storage of the graph nodes and edges +- filesystems for storing the actual file contents + +** Technology: archive storage components + +*** Metadata storage +- Python module *swh.storage* +- thin Python API over a pile of PostgreSQL functions +- motivation: keeping relational integrity at the lowest layer + +*** Content ("object") storage +- Python module *swh.objstorage* +- very thin object storage abstraction layer (PUT, APPEND and GET) over regular storage technologies +- separate layer for asynchronous replication and integrity management (*swh.archiver*) +- motivation: stay as technology neutral as possible for future mirrors + +** Technology: object storage +*** Current primary deployment +- Storage on 16 sharded XFS filesystems; key = /sha1/ (content), value = /gzip/ (content) +- if sha1 = *abcdef01234...*, 
file path = / srv / storage / *a* / *ab* / *cd* / *ef* / *abcdef01234...* +- 3 directory levels deep, each level 256-wide = 16 777 216 directories (1 048 576 per partition) +*** Secondary deployment +- Storage on Azure blob storage +- 16 storage containers, objects stored in a flat structure there + +** Technology: object storage review + +*** Generic model is fine +The abstraction layer is fairly simple and generic, and the implementation of the upper layers (replication, integrity checking) was a breeze. + +*** Filesystem implementation is bad +Slow spinning storage + little RAM (48GB) + 16 million dentries = (very) bad performance + +** Technology: metadata storage +*** Current deployment +- PostgreSQL deployed in primary/replica mode, using pg\under{}logical for replication: different indexes on primary (tuned for writes) and replicas (tuned for reads). +- most logic done in SQL +- thin Pythonic API over the SQL functions + +*** end goals +- proper handling of relations between objects at the lowest level +- doing fast recursive queries on the graph (e.g. find the provenance info for a content, walking up the whole graph, in one single query) + +** Technology: metadata storage review + +*** Limited resources +PostgreSQL works really well +#+BEAMER: \pause +... until your indexes don't fit in RAM + +#+BEAMER: \pause +*** +Our recursive queries jump between different object types, and between evenly distributed hashes. Data locality doesn't exist. Caches break down. + +#+BEAMER: \pause +*** +Massive deduplication = efficient storage +#+BEAMER: \pause + +*but* Massive deduplication = exponential width for recursive queries + +#+BEAMER: \pause +*** Reality check + +Referential integrity? +#+BEAMER: \pause +Real repositories downloaded from the internet are all kinds of broken. + +** Technology: outlook + +*** Object storage + +Our azure prototype shows that using a scale-out "cloudy" technology for our +object storage works really well. 
Plain filesystems on spinning rust, not so +much. +#+BEAMER: \pause + +We have started working on a prototype Ceph infrastructure for our main copy +of the archive, as our budget ramps up. +#+BEAMER: \pause + +*** Metadata storage +Our initial assumption that we wanted referential integrity and built-in +recursive queries was wrong. +#+BEAMER: \pause + +We could probably migrate to "dumb" object storages for each type of object, +with another layer to check metadata integrity regularly. + +* Come in, we're open! +** You can help! +*** Coding + - \url{forge.softwareheritage.org} --- *our own code* + #+BEAMER: \vspace{-5mm} + | ٭٭٭ | listers for unsupported forges, distros, pkg. managers | + | ٭٭٭ | loaders for unsupported VCS, source package formats | + | ٭٭ | Web UI: eye candy wrapper around the Web API | + #+BEAMER: \pause +*** Community + | ٭٭ | spread the news, help us with long-term sustainability | + | ٭٭٭ | document endangered source code | + #+BEAMER: \vspace{-3mm} \scriptsize \centering + \url{wiki.softwareheritage.org/index.php?title=Suggestion_box} + +** The Software Heritage community +*** Core team + 10 people working on the project full-time, split across engineering, research, and fundraising/management topics. + #+BEAMER: \pause +*** Inria as initiator :B_picblock: + :PROPERTIES: + :BEAMER_env: picblock + :BEAMER_opt: pic=inria-logo-new,leftpic=true,width=\extblockscale{.2\linewidth} + :END: + - .fr national computer science research entity + - strong Free Software culture + # - creating a non profit, international organisation + #+BEAMER: \vspace{-2mm} + #+BEAMER: \pause +*** Early Sponsors and Supporters + *Société Générale, Microsoft, Huawei, Nokia, DANS, Univ. Bologna,* + #+latex: ~~ + ACM, Creative Commons, Eclipse, Engineering, FSF, Gandi, GitHub, IEEE, OIN, + OSI, OW2, Software Freedom Conservancy, SFLC, The Document Foundation, ... 
+ +* Conclusion + #+INCLUDE: "../../common/modules/swh-backmatter.org::#conclusion" :minlevel 2 +* FAQ :B_appendix: + :PROPERTIES: +# :BEAMER_env: appendix + :END: +** Q: do you archive /only/ Free Software? + - We only crawl origins /meant/ to host source code (e.g., forges) + - Most (~90%) of what we /actually/ retrieve is textual content + #+BEAMER: \vfill +*** Our goal + Archive *the entire Free Software Commons* + + #+BEAMER: \vfill +*** + - Large parts of what we retrieve are /already/ Free Software, today + - Most of the rest /will become/ Free Software in the long term + - e.g., at copyright expiration +** Q: how about SHA1 collisions? + #+BEAMER: \lstinputlisting[language=SQL,basicstyle=\small]{../../common/source/swh-content.sql} diff --git a/talks-public/2018-02-13-inria-saclay/Makefile b/talks-public/2018-02-13-inria-saclay/Makefile new file mode 100644 index 0000000..68fbee7 --- /dev/null +++ b/talks-public/2018-02-13-inria-saclay/Makefile @@ -0,0 +1 @@ +include ../Makefile.slides diff --git a/talks-public/2018-03-12-deposit-update/2018-03-12-deposit-update.org b/talks-public/2018-03-12-deposit-update/2018-03-12-deposit-update.org new file mode 100644 index 0000000..6140e99 --- /dev/null +++ b/talks-public/2018-03-12-deposit-update/2018-03-12-deposit-update.org @@ -0,0 +1,53 @@ +#+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) +#+TITLE: Status update for the Deposit feature +# does not allow short title, so we override it for beamer as follows : +#+BEAMER_HEADER: \title[Deposit feature]{Status update for the Deposit feature} +#+BEAMER_HEADER: \author{Antoine R. Dumont & Morane Gruenpeter} +#+BEAMER_HEADER: \date[2018-03-12]{12 mars 2018\\Team status update} +#+AUTHOR: Antoine R.
Dumont & Morane Gruenpeter +#+DATE: 12 Mars 2018 +#+EMAIL: ardumont@softwareheritage.org morane@softwareheritage.org +#+DESCRIPTION: Status update for the Deposit feature +#+KEYWORDS: software heritage preservation knowledge technology deposit + +#+INCLUDE: "../../common/modules/prelude.org" +# +#+INCLUDE: "../../common/modules/169.org" +#+BEAMER_HEADER: \institute[Software Heritage]{Software Engineers} + + + +* Achieved +** Achieved +*** Milestones +- 06/12/2017: in production on SWH server +- 22/01/2018: poster at JSO2018 +- 19/02/2018: in production on Hal-Inria (unannounced) + + +*** Actions done +- deposit feature & documentation via SWORD v2 +- deposit client & walk-through presentation for FOSDEM +- poster submitted to Liber2018 +- software deposit & moderation guides + + +* Remaining +** Remaining +*** Development +- elaborate error messages returning to Hal +- create a dedicated deposit view when browsing on the web-app +- sparse deposit + + +*** Communication +- poster at RDA's 11th plenary (21/03/2018) +- blog post about deposit (with events: JSO2018, RDA 11th plenary, Liber2018?) 
+- announcement with Inria and the CCSD when the service is ready + +* Blockers +** Blockers +*** On Hal side +- missing crontab mechanism (deposit sent to SWH only while in moderation) +- missing error logs on the moderator view + diff --git a/talks-public/2018-03-12-deposit-update/Makefile b/talks-public/2018-03-12-deposit-update/Makefile new file mode 100644 index 0000000..68fbee7 --- /dev/null +++ b/talks-public/2018-03-12-deposit-update/Makefile @@ -0,0 +1 @@ +include ../Makefile.slides diff --git a/talks-public/2018-03-12-team/2018-03-12-team.org b/talks-public/2018-03-12-team/2018-03-12-team.org new file mode 100644 index 0000000..9c45591 --- /dev/null +++ b/talks-public/2018-03-12-team/2018-03-12-team.org @@ -0,0 +1,851 @@ +#+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) +#+TITLE: Software Heritage +#+SUBTITLE: Vision and outlook +#+AUTHOR: Roberto Di Cosmo +#+DATE: 12/3/2018 +#+EMAIL: roberto@dicosmo.org +#+DESCRIPTION: Preserving the technological knowledge of mankind +#+KEYWORDS: software heritage legacy preservation knowledge mankind technology +#+BEAMER_HEADER: \title[Strategic team meeting]{Software Heritage: vision and outlook} +#+BEAMER_HEADER: \date[12/3/2018]{March 12th 2018\\ Paris} +#+LATEX_HEADER: \usepackage{color} +#+LATEX_HEADER: \usepackage{colortbl} +#+LATEX_HEADER: \usepackage[table]{xcolor}% http://ctan.org/pkg/xcolor +#+LATEX_HEADER: \usepackage{array} +#+LATEX_HEADER: \usepackage{supertabular} + +# +# prelude.org contains all the information needed to export the main beamer latex source +# use prelude-toc.org to get the table of contents +# + +#+INCLUDE: "../../common/modules/prelude-toc.org" :minlevel 1 + +#+INCLUDE: "../../common/modules/169.org" + +# +# Some context: where we come from +# +# +INCLUDE: "../../common/modules/mancoosi-background.org::#main" :minlevel 1 + +# +# Basic properties for software studies +# +# +INCLUDE: 
"../../common/modules/software-studies-stepback-properties.org::#main" :minlevel 2 :only-contents t + +* Context and motivations +** Software Heritage in a nutshell +#+INCLUDE: "../../common/modules/swh-goals-oneslide-vertical.org::#goals" :only-contents t :minlevel 3 +** Why now +*** Looking at the past + - a lot of old software misplaced, lost, or behind barriers, but... + - most founding fathers are still here, and willing to share + - \alert{urgent} to collect their knowledge + \hfill Only a few years left. +#+BEAMER: \pause +*** Looking at the future + - software development skyrockets + - \alert{essential} to provide a platform for the future + \hfill Every year that goes by makes the problem worse. +** Approach and principles \hfill \url{http://bit.ly/swhpaper} + #+latex: \begin{center} + #+ATTR_LATEX: :width 0.8\linewidth + file:SWH-as-foundation-slim.png + #+latex: \end{center} + #+BEAMER: \pause +*** Technology + :PROPERTIES: + :BEAMER_col: 0.34 + :BEAMER_env: block + :END: + - transparency and FOSS + - replication all around +*** Content + :PROPERTIES: + :BEAMER_col: 0.32 + :BEAMER_env: block + :END: + - intrinsic identifiers + - facts and provenance +*** Organization + :PROPERTIES: + :BEAMER_col: 0.33 + :BEAMER_env: block + :END: + - non-profit + - multi-stakeholder +** A great ambition... in a few taglines +*** Culture (catalog+archive) + \hfill The Library of Alexandria of Source Code +*** Science (pillar of Open Science) + \hfill The reference archive of research software +*** Science (research instrument) + \hfill The CERN of Computer Science +*** Industry (reference catalog) + \hfill The universal software knowledge base +* Key properties, and principles +** Three properties are key for Software Heritage's mission + :PROPERTIES: + :CUSTOM_ID: keyproperties + :END: +*** Availability + :PROPERTIES: + :BEAMER_act: +- + :END: + - /all/ the /history/ of /all/ the software + - no restrictions (technical, legal, ... 
) on /content/ or /metadata/ +*** Traceability + :PROPERTIES: + :BEAMER_act: +- + :END: + - know /what/ we get, /when/, from /where/ and /how/ + - [ ] /persistent/ and /intrinsic/ identifiers : no middle man, no dangling pointers! +*** Uniformity + :PROPERTIES: + :BEAMER_act: +- + :END: + - one /standard/ metadata structure, /irrespective of the origins/ + - /uniform/ naming /schema/ +** Software Heritage's approach + :PROPERTIES: + :CUSTOM_ID: keyproperties + :END: +*** Availability + :PROPERTIES: + :BEAMER_act: +- + :END: + - collect /all/ software from /all/ possible places + - /replicate/ the archive in a network of mirrors +*** Traceability + :PROPERTIES: + :BEAMER_act: +- + :END: + - keep /provenance/ information, systematically + + [ ] keep incoming sources until full testing succeeds (and more if possible) + - /unique/ identifiers : use /cryptographic hashes/, derived from the software itself + + [ ] *NEW*: accountability /for all changes/ (see [[https://pages.lip6.fr/Marc.Shapiro/papers/RR-7687.pdf][CRDT]] Shapiro et al., blockchains) +*** Uniformity + :PROPERTIES: + :BEAMER_act: +- + :END: + - version control data model designed to /represent all the others/ +* Yes, we really mean all the source code +** All the source code + #+BEAMER: \begin{center}\includegraphics[width=\extblockscale{\linewidth}]{swh-collect-axes}\end{center} + +** All the source code, strategies + #+BEAMER: \begin{center}\includegraphics[width=\extblockscale{\linewidth}]{swh-collect-strategies}\end{center} +** Strategy to collect all the source code +*** Different unit cost for each sector +#+BEGIN_EXPORT latex +\begin{center} +\tablefirsthead{} +\tablehead{} +\tabletail{} +\tablelasttail{} +\begin{supertabular}{|c|c|c|} + \cline{2-3} + %\rowcolor{blue!25} +\multicolumn{1}{c}{~} + & + \multicolumn{1}{|c|}{\cellcolor{yellow}Closed} & + \multicolumn{1}{c|}{\cellcolor{yellow}Open}\\\hline +\cellcolor{yellow} Online & + SWH: {\bf \$\$}, ~~~ extern: {\bf \$\$} & +\cellcolor{yellow} SWH: 
{\bf \$}, ~~~ extern: {\bf \$} + \\\hline +\cellcolor{yellow} Offline & + SWH:{\bf \$\$}, ~~~ extern: {\bf \$\$\$} & + SWH:{\bf \$}, ~~~ extern: {\bf \$\$} + \\\hline +\end{supertabular} +\end{center} +#+END_EXPORT +#+BEAMER: \pause +*** Different approaches for each sector :noexport: +#+BEGIN_EXPORT latex +\begin{center} +\tablefirsthead{} +\tablehead{} +\tabletail{} +\tablelasttail{} +\begin{supertabular}{|c|c|c|} + \cline{2-3} + %\rowcolor{blue!25} +\multicolumn{1}{c}{~} + & + \multicolumn{1}{|c|}{\cellcolor{yellow}Open} & + \multicolumn{1}{c|}{\cellcolor{yellow}Proprietary}\\\hline +\cellcolor{yellow} Current and future & + \cellcolor{yellow}{{\bf Automation}} & + {\bf Embargo} + \\\hline +\cellcolor{yellow} Legacy & + {\bf Crowdsourcing} & + {\bf Focused search} + \\\hline +\end{supertabular} +\end{center} +#+END_EXPORT +#+BEAMER: \pause +# IMPACTS +*** We started on the first quadrant, we need all four! + - [ ] *technical*: security, identification, authorization, access control + - *legal*: policies, contracts + - *community*: network, standards, endorsement +#+BEAMER: \pause +*** Important technical issues + - [ ] setup space for "/collections/" (staging area waiting for curation) + + make it simple for contributors to donate! 
+ - [ ] keep the embargo/takedown issue in mind +#+INCLUDE: "../../common/modules/swh-functional-architecture.org::#phases" :minlevel 2 +* Community is essential +# IMPACTS +** A daunting task: + - challenge :: extreme variability of sources and technologies + - opportunity :: highly parallelisable, /if we provide good abstractions/ + and welcome contributors + #+BEAMER: \pause +*** Collect entry points :B_block: + :PROPERTIES: + :BEAMER_COL: .43 + :BEAMER_env: block + :END: + - listers (see Avi's blog post) + - protocols (Adullact+FusionForge) + - [ ] VCS loaders (e.g.: Avi's work) + - [ ] Web crawlers (IA, Qwant) + - [ ] curation of the collections + #+BEAMER: \pause +*** Preserve entry points + :PROPERTIES: + :BEAMER_COL: .3 + :BEAMER_env: block + :END: + - [ ] mirrors + - [ ] storage and indexing backends + - [ ] event feeds + - [ ] data compression +*** Share entry points + :PROPERTIES: + :BEAMER_COL: .27 + :BEAMER_env: block + :END: +# application specific data representation + - [ ] data representation + - [ ] APIs + - [ ] WebHooks + - [ ] indexes +*** + \hfill tag tasks with Collect, Preserve, Share when possible +* Building for the long term +** Three pillars +*** Awareness, visibility, endorsement + - promote public and private policies + - attract users, unlock funds + - turn copycats into partners +#+BEAMER: \pause +*** Resources + - fund the long term effort: people, collaborators, organisation, infrastructure... +#+BEAMER: \pause +*** Science and technology + - build on sound basis: /we need external help/ + + [ ] be prepared to learn from others! + \hfill /"Seul on va plus vite, mais ensemble on va plus loin"/ +# Where we are today: endorsement +# +#+INCLUDE: "../../common/modules/endorsement.org::#endorsement" :minlevel 2 +** Political awareness +*** April 3rd, 2017: landmark Inria Unesco agreement... 
+#+BEGIN_EXPORT latex + \includegraphics[width=\extblockscale{.25\linewidth}]{inria-logo-new} \hfill + \includegraphics[width=\extblockscale{.35\linewidth}]{unesco-accord} \hfill + \includegraphics[width=\extblockscale{.2\linewidth}]{unesco}\\[1em] + \mbox{}\hfill + \includegraphics[width=\extblockscale{.2\linewidth}]{rdc-fh-ib} \hfill + \includegraphics[width=\extblockscale{.15\linewidth}]{SWH-logo_share} \hfill + \includegraphics[width=\extblockscale{.2\linewidth}]{swh-team-2017-04-03}\hfill +% \mbox{}\\ +% \url{https://www.softwareheritage.org/blog} +#+END_EXPORT +*** September 27-28: Mauritius Call + \hfill mentions the importance of software heritage +*** Sometime in 2018 + \hfill opening of the archive (we'll come back to this) +** Resources + #+INCLUDE: "../../common/modules/swh-sponsors.org::#sponsors" :only-contents t +#+BEAMER: \pause +*** Breaking news! :B_picblock: + :PROPERTIES: + :BEAMER_env: picblock + :BEAMER_opt: pic=Qwant_Logo,leftpic=true,width=\extblockscale{.2\linewidth} + :END: + \hfill contract awarded for building together the source code search engine +** Science +*** Communication + - CACM Viewpoint *accepted!!!* (thanks Moshe Vardi) + - RDA 2018 + - Keynote Devoxx (April), ICSE (May), and ASE (September) +*** Collaboration + - Qwant and Almanach (search/classification, AP+Zack+Roberto) + - Crossminer (MG) and Linked Data (MG and Roberto) + - RDFox (Zack and Roberto ), H2020 (Zack is on the deck) + - [ ] distributed storage, databases, graphs, crypto, blockchains, etc...
+#+BEAMER: \pause +*** Essential + - [ ] reliable interface with scientific community (human and technical) +* Roadmap for a sustainable organisation + :PROPERTIES: + :CUSTOM_ID: main + :END: +** Growing a sustainable common digital infrastructure :noexport: + :PROPERTIES: + :CUSTOM_ID: phases + :END: +*** Ignition (3 Y) \alert{\em Inria} :B_exampleblock: + :PROPERTIES: + :BEAMER_env: exampleblock + :BEAMER_COL: .3 + :BEAMER_ACT: +- + :END: + - Vision + - Team + - Core infrastructure + - Identity + + communication + + community + - Legitimacy + + awareness + + support +*** Scale up (5 Y) :B_block: + :PROPERTIES: + :BEAMER_env: block + :BEAMER_COL: .35 + :BEAMER_ACT: +- + :END: + - Core Infra (engineer) + - Collect (4 strategies) + - Preserve + + mirrors, multiple techs + - Share + + search, browse, APIs + - Connect + + community + - Organisation + + build the foundation +*** Stable Operation ($\infty$) :B_block: + :PROPERTIES: + :BEAMER_env: alertblock + :BEAMER_COL: .38 + :BEAMER_ACT: +- + :END: + - Maintain+Evolve + + archive, community + + bylaws, organisation + - Interact+Engage + + research + + industry + + education + + culture + - Sustainability + + /key/ \alert{infrastructure} + + /ecosystem/ \alert{diversity} + + /foundation/ \alert{endowment} +** Towards a sustainable common digital infrastructure + :PROPERTIES: + :CUSTOM_ID: phases + :END: +*** Launching (2015-2017) :B_exampleblock: + :PROPERTIES: + :BEAMER_env: exampleblock + :BEAMER_COL: .3 + :BEAMER_ACT: +- + :END: + - Vision + - Team + - Core infrastructure + - Identity + - Legitimacy +*** Building (2018-2022) :B_block: + :PROPERTIES: + :BEAMER_env: block + :BEAMER_COL: .35 + :BEAMER_ACT: +- + :END: + - Expand collection + - Support use cases + - Build community + - Grow mirror network + - Independent Foundation +*** Stable Operation (2023-$\infty$) :B_block: + :PROPERTIES: + :BEAMER_env: alertblock + :BEAMER_COL: .38 + :BEAMER_ACT: +- + :END: + - Maintain+Evolve + + archive, community + + bylaws, 
organisation + - Interact+Engage + + research and industry + + culture and education +*** Sustainability + :PROPERTIES: + :BEAMER_ACT: +- + :END: + + /key/ \alert{infrastructure} + + /ecosystem/ \alert{diversity} + + /foundation/ \alert{endowment} + +** Today: team +*** Management + - Roberto and Stefano (CEO/CTO) + - Jean-Fran\c{c}ois Abramatic (Head of Advisory Board) + - Magali Fitzgibbon (Legal, Contracts) +*** R and D, Ops + - 5 engineers (Morane thanks to Crossminer) + - 1 PhD + - 1 visiting scientist +*** Everything else + \hfill provided by Inria +** Today: funding +*** Baseline + Inria engagement (~ 500Ke/year) +*** Sponsoring + - 3 platinum sponsors (Microsoft, Intel, SocGen) + - 1 silver sponsor (Huawei), 4 bronze sponsors (DANS, Nokia, DISI, GitHub) +*** Partnerships + - HAL and Intel + - Crossminer + - Qwant + - ClearlyDefined +*** + \hfill a /huge/ part of my time +** Today: sponsor's view +*** Features +#+BEGIN_EXPORT latex + \begin{columns}[t] + \begin{column}{0.48\linewidth} + In production + \begin{itemize} + \item \emph{lookup} a content using its hash + \item \emph{navigation} of the archive with an API: \url{http://archive.softwareheritage.org/api} + \end{itemize} + \end{column}\pause + \begin{column}{0.48\linewidth} + Work in progress + \begin{itemize} + \item \emph{browsing}: "wayback machine" for archived code via Web UI (demo?) + \item \emph{download}: copy from the archive + \item \emph{deposit}: into the archive + \item \emph{reverse index}: map hashes to origins/commits + \item \emph{classification}: (very early stage) + \end{itemize} + \end{column} + \end{columns} +#+END_EXPORT + +* The transition has started +** Organisation +*** The Software Heritage Foundation + - legal :: contract ongoing + - funding :: will accept donations as soon as possible + + [ ] updated website (AL+RDC+Zack) + + [ ] /donate/ button (AL+RDC) + + from 1 euro to 1Me :-) +#+BEAMER: \pause +*** Foundation vs. 
Inria: separation of concerns (transitional) + - the Foundation collects funds for Software Heritage + - Inria operates Software Heritage +** Operations +*** Software Heritage is /no longer/ a "project" + - they *depend on us* + + HAL *now*, /mirrors/ and /Intel use case/ soon + + UNESCO event requires ~24/7 stable operation + + [ ] state of Azure clone? +#+BEAMER: \pause +*** Moving to ~24/7 + - think about a way of implementing /in production/ stable operation + - TODO send me (cc: Zack) /privately/ your ideas by *Friday, March 23rd* +** Mirror network +*** Terminology + - copy :: instance of the archive under SWH own control + - mirror :: instance of the archive outside SWH own control +*** How it works + - legal :: 5 documents + + [X] contract (RDC+MF), technical annex (RDC+ND), ethical charter (RDC), + + [ ] CLA, Code of conduct + - technical :: quite a lot of work to do (ND) +*** Status + - advanced :: Grenoble + - exploratory :: 2 more in France, 1 in Norway +** Technology +*** Evolutions ongoing + - move to more flexible in-house storage (Ceph, FT, ND) + - experiment data compression + - [ ] explore NoSQL solutions +#+BEAMER: \pause +*** Evolutions forthcoming + - [ ] blockchain + - [ ] embargo/escrow +#+BEAMER: \pause +*** Memento + - *modular* software stack: we need to enable + - other programming languages + - other backends/frontends +** Technology, cont'd (interfacing with the world) +**** Existing line of work + - APIs (must be maintained!) + - PURLs (must be carefully defined!) + + [ ] /cite me button/ + + [ ] /documentation/ and /rationale/ (part is ongoing, Morane+Zack+Roberto) + + [ ] "/software citation/" (we need Inria teams onboard!) 
+**** Forthcoming + - Journal / blockchain + + [ ] Mirrors feed, trust and accountability (blockchain) + - Web hooks + + [ ] allow others to build Software Heritage integrated services +** Team and Community +*** Expanding core team in 2018 + - 2 new hires (TBD) +*** Community + - [ ] we need to bring in contributors + + software collectors + + developers + + partner platforms + + curators +** The next 5 years +*** Collect + - *stable process* for adding new listers/loaders + - community of contributors +*** Preserve + - *stable process* for mirror network + - at least 10 mirrors worldwide +*** Share + - in production *browse/download/upload/search/index/automatic classification* + - support for research and industry use +*** Process + - continuous improvement (tech, community) +** The next 5 years, cont'd +*** Team + 30 full time people on SWH core\\ + management, dev/ops, fundraising, comm, product, liaison\hfill \alert{structured} +*** Funding + ~5 Me/year +*** Organisation + - Independent international foundation + - International network of peers +*** Community + - research, industry, culture, ... + - collectors/curators/scholars/museums ... +** Pause +*** Yes, it is + - a huge challenge + - an unprecedented effort + - much more than just technology + - high risk, high gain +#+BEAMER: \pause +*** + \hfill I believe we can make it! 
+** What we need to succeed +*** Operations + - stability, reliability, efficiency + #+BEAMER: \pause +*** Engineering + - modularity (platform/plugins, tech oecumenism) + - replicability (mirrors, contributors, \alert{docs}) + - evolvability (testing env, sandbox, exps) + #+BEAMER: \pause +*** Product vision + - "users" and "clients" are coming + #+BEAMER: \pause +*** Mindset + - make the principles guide the technology\\ + \hfill /not the other way around/ +* Conclusion +** Come in, we're open +*** Software Heritage is + - a /reference archive/ of /all available/ source code + - a fantastic new tool for /research/ software + - a unique /complement/ for /development platforms/ + - an international, open, nonprofit, /mutualized infrastructure/ + - at the service of our community, at the service of society +*** Questions :B_ignoreheading: + :PROPERTIES: + :BEAMER_env: ignoreheading + :END: +#+BEAMER: {\vfill\begin{center}\Huge{Questions ?}\end{center}\vfill} +* Team report +** Task priorities (established November 2017) +*** short term :B_block: + :PROPERTIES: + :BEAMER_env: block + :BEAMER_COL: .45 + :END: + - browse (lead AL) + - ideal ETA beta open Q4 2017 + + - deposit (lead AD+MG) + - ideal ETA + + state diagram/high level specs for [2017-12-05 Tue] + + working pipeline [2017-12-06 Wed 23:00 CET] + + - download (lead AP) + - ideal ETA working pipeline Q4 2017 +*** short/medium term :B_block: + :PROPERTIES: + :BEAMER_env: block + :BEAMER_COL: .45 + :END: + - mirrors (lead ND+MF) + - ideal ETA Q2 2018 + - preliminary work on legal+tech specs needed by Jan 16th 2018 + + - provenance (lead GR) + - ideal ETA production index Q2 2018 + - preliminary Azure experiment ETA Q4 2017 +* Appendix :B_appendix: + :PROPERTIES: + :BEAMER_env: appendix + :END: +# +# How we want to work, including core properties +# +* Zoom on science :noexport: +# +# Software Research +# +** Multiple facets +*** Scientists as users + - reproducibility via SWH (all) + - SWH as dataset (computer 
science) +*** Scientists as providers/partners + - research on SWH challenges +** A Universal Archive of Software Development + :PROPERTIES: + :CUSTOM_ID: main + :END: +#+LATEX: \includegraphics[width=\extblockscale{.15\linewidth}]{universal.png} +*** /Repeatable/ Software Studies + :PROPERTIES: + :BEAMER_act: +- + :END: + - vulnerability detection + - dependency analysis + - pattern elicitation + - study of the development graph + - ... the sky is the limit +*** Prerequisites + clean, evolvable data and metadata model + +** How we built our scientific knowledge +# +# Scientific method, reproducibility +# +#+INCLUDE: "../../common/modules/scientific-method.org::#short" :only-contents t + +# +# Connection with Open Access +# +#+INCLUDE: "../../common/modules/conservancy.org::#main" :minlevel 2 + +# +# URLS are not good tracers +# +#+INCLUDE: "../../common/modules/urls-decay.org::#main" :only-contents t :minlevel 2 + +# +# DOI is not a solution +# +#+INCLUDE: "../../common/modules/doi-analysis.org::#main" :only-contents t :minlevel 2 + +** What could the good links look like? +*** Links to /software source code/ in an article + Leverage Software Heritage as universal archive: + - set of files :: \small\url{swh:1:tree:06741c8c37c5a384083082b99f4c5ad94cd0cd1f}\\ + id of tree object listing all the files in a project (at a given time) + - revision :: \url{swh:1:rev:7598fb94d59178d65bd8d2892c19356290f5d4e3}\\ + id of commit object which contains a tree and (a pointer to) the history + - metadata :: this /may/ involve a DOI +*** + \hfill this is also of /industrial/ relevance!
+*** Links to /data/ in /software source code/ :noexport: + - external linking mechanisms /that guarantee integrity/ + + git lfs + + git annex + - need to extend them into a generic, VCS independent solution + +** The SWH - HAL connector +*** Strategic + First open access / open source archival process +*** Opportunity + - HAL is one of a kind + - ArXiv uses the same tech + +* Selected research challenges : building the archive :noexport: +** Data compression + Deduplication is performed at the file level /across all projects in the world/ +*** Pros + - very efficient to cope with file clones + - quite resilient to technology changes +*** Cons + - a minor edit creates two different files +#+BEAMER: \pause +*** Challenge: exploit file similarities + - adapt / improve variable size checksums / diff detection + - compression rates of up to 100 to 1 may arise +** Metadata alignment :noexport: +*** Many concepts related to source code + - project, archive, source, language, licence, bts, mailing list, ... + - developer, committer, author, architect, ... +*** Many existing ontologies + DOAP, FOAF, Appstream, schema.org, ADMS.SW, ... +*** Many disparate catalogs + :PROPERTIES: + :BEAMER_act: +- + :END: +# mostly manual + Freecode (40.000+), Plume (400+), Debian (25.000+), OpenHub (670.000+), ... +# FramaSoft (1500+), +# OpenHub is mostly automatic +# Wikipedia ? 
+*** Challenge : scale up metadata to millions of projects + :PROPERTIES: + :BEAMER_act: +- + :END: + - /reconcile/ existing ontologies + - /link/ and /check/ existing catalogs with Software Heritage + - handle /inconsistent data/ and /provenance information/ + - synthesise missing information (machine learning) + +** Software phylogenetics :noexport: +*** The Software Diaspora + :PROPERTIES: + :BEAMER_act: +- + :END: + - Code often /migrates/ across projects : forks, copy-paste + - Code gets /cloned/ : reuse, language limitations, code smells + - Projects /migrate/ across forges : fashion, functionality + - Projects get /cloned/ : mirrors, packages +*** Challenge: tracing software evolution across billions of files + :PROPERTIES: + :BEAMER_act: +- + :END: + - rebuild the history of software artefacts + - identify code origins + - spot code clones + - build project impact graphs +** Distributed infrastructure +*** The software graph + - files + - directories + - commits + - projects + all de-duplicated in Software Heritage +*** Challenge: design efficient architectures and algorithms + - replication and availability (CAP?) + - navigation + - query + - path analysis +* Selected research challenges : using the archive :noexport: +** Code search +*** A natural need + :PROPERTIES: +# :BEAMER_act: +- + :END: + - Find the definition of a function/class/procedure/type/structure + - Search examples of code usage in an archive of source code + - you name it... +*** Approaches + :PROPERTIES: +# :BEAMER_act: +- + :END: + - language specific /patterns/ + - working on /abstract syntax trees/ + Regular expressions are a nice /swiss-army knife/ approximation, can we build a specific tool that scales? +*** What about /all the source code/ in the world? + :PROPERTIES: + :BEAMER_act: +- + :END: + - /hundreds/ of billions of LOCs + We need new insight for handling this. 
+** Software as Big Data +*** Remember the numbers + - 60+ million repositories ingested + - 700+ million commits + - 3+ billion unique source files / 200 TB of raw source code + and growing by the day! +*** Challenge: what can machines learn here? + - programming patterns / trends + - developer skills + - vulnerabilities + - bugs and fixes +** Efficient data representation :noexport: +*** Remember the numbers + - 60+ million repositories ingested + - 700+ million commits + - 3+ billion unique source files / 200 TB of raw source code + and growing by the day! +*** Challenge: can we make this fit in memory? + - efficient graph representation + - fast non-local queries + - mitigate the size/speed tradeoff +* A glimpse of the archive :noexport: +#+INCLUDE: "../../common/modules/status-extended.org::#api" :only-contents t +* Bits from the drawing board :noexport: +#+INCLUDE: "../../common/modules/bits-drawing-board.org::#keyproperties" :minlevel 2 +#+INCLUDE: "../../common/modules/bits-drawing-board.org::#foss" :minlevel 2 +#+INCLUDE: "../../common/modules/bits-drawing-board.org::#intrinsicids" :minlevel 2 +#+INCLUDE: "../../common/modules/bits-drawing-board.org::#replication" :minlevel 2 +** Some planned working groups +#+INCLUDE: "../../common/modules/your-help-wg.org::#sodi" :minlevel 3 +#+INCLUDE: "../../common/modules/your-help-wg.org::#sapi" :minlevel 3 +#+INCLUDE: "../../common/modules/your-help-wg.org::#opad" :minlevel 3 +* Tech bits :noexport: +** More details on the internals +#+INCLUDE: "../../common/modules/status-extended.org::#architecture" :only-contents t +#+INCLUDE: "../../common/modules/status-extended.org::#merklerevision" :only-contents t +# +# Contributing to the great picture +# +** The team :noexport: + #+latex: \begin{center} + #+ATTR_LATEX: :width .35\linewidth +file:core-team-formal.png + #+latex: \end{center} + #+BEAMER: \pause +* Technical status :noexport: +# #+INCLUDE: "../../common/modules/status-extended.org::#people" :minlevel 2 
+#+INCLUDE: "../../common/modules/status-extended.org::#archive" :minlevel 2 +** Archiving goals + Targets: VCS repositories & source code releases (e.g., tarballs) +*** We DO archive + - file *content* (= blobs) + - *revisions* (= commits), with full metadata + - *releases* (= tags), ditto + - where (*origin*) & when (*visit*) we found any of the above + # - time-indexed repo *snapshots* (i.e., we never delete anything) + … in a VCS-/archive-agnostic *canonical data model* +*** We DON'T archive (for now) + # - diffs → derived data from related contents + - homepages, wikis + - BTS/issues/code reviews/etc. + - mailing lists + Long term vision: play our part in a /"semantic wikipedia of software"/ + +** Dataflow + #+BEAMER: \begin{center}\includegraphics[width=\extblockscale{.9\textwidth}]{swh-dataflow.pdf}\end{center} +# +# Key properties of the system +# +** Much more than an archive! + #+INCLUDE: "../../common/modules/status-extended.org::#merkletree" :only-contents t + #+INCLUDE: "../../common/modules/status-extended.org::#merkledemo" :minlevel 2 +# +INCLUDE: "../../common/modules/status.org::#datamodel" :minlevel 2 +# +INCLUDE: "../../common/modules/status-extended.org::#merkletree" :minlevel 2 +# +INCLUDE: "../../common/modules/status-extended.org::#merkledemo" :minlevel 2 +# +INCLUDE: "../../common/modules/status-extended.org::#architecture" :only-contents t +# +INCLUDE: "../../common/modules/status-extended.org::#merklerevision" :only-contents t +# +INCLUDE: "../../common/modules/status-extended.org::#giantdag" :only-contents t +# +INCLUDE: "../../common/modules/status-extended.org::#features" :minlevel 2 + diff --git a/talks-public/2018-03-12-team/Makefile b/talks-public/2018-03-12-team/Makefile new file mode 100644 index 0000000..68fbee7 --- /dev/null +++ b/talks-public/2018-03-12-team/Makefile @@ -0,0 +1 @@ +include ../Makefile.slides diff --git a/talks-public/2018-03-21-RDA-plenary-Poster/2018-03-21-RDA-Poster.tex 
b/talks-public/2018-03-21-RDA-plenary-Poster/2018-03-21-RDA-Poster.tex index 5170171..22b6d71 100644 --- a/talks-public/2018-03-21-RDA-plenary-Poster/2018-03-21-RDA-Poster.tex +++ b/talks-public/2018-03-21-RDA-plenary-Poster/2018-03-21-RDA-Poster.tex @@ -1,370 +1,370 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % MUW Poster % LaTeX Template % Version 1.0 (31/08/2016) % (Based on Version 1.0 (31/08/2015) of the Jacobs Portrait Poster % % License: % CC BY-NC-SA 3.0 (http://creativecommons.org/licenses/by-nc-sa/3.0/) % % Created by: % Nicolas Ballarini, CeMSIIS, Medical University of Vienna % nicoballarini@gmail.com % http://statistics.msi.meduniwien.ac.at/ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Software Heritage poster % % License: % CC BY-SA 4.0 (http://creativecommons.org/licenses/by-sa/4.0/) % Created by: % the Software Heritage team % contact: morane@softwareheritage.org %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \def\footer#1{\def\insertfooter{#1}} %-------------------------------------------------------------------------------------- % PACKAGES AND OTHER DOCUMENT CONFIGURATIONS %-------------------------------------------------------------------------------------- \documentclass[final]{beamer} \usepackage[scale=1.150]{beamerposter} % Use the beamerposter package \usetheme{MUWposter} % Use the MUWposter theme supplied with this template % Include a logo of your project if desired \logo{\pgfputat{\pgfxy(-10,102)}{ \pgfbox[center,base]{ \includegraphics[width=13cm]{../../common/logos/hal-inria-ccsd-swh-logo-en.png}}}} \usepackage{multicol} \usepackage{array} %The following two are column definitions for the aknowledgements section \newcolumntype{L}{>{\arraybackslash}m{22cm}} \newcolumntype{S}{>{\arraybackslash}m{5cm}} \usepackage{pgf} \usepackage{mathtools} \usepackage{amsmath, amsthm, amssymb, amsfonts} \usepackage{exscale} \usepackage{xcolor} \usepackage{ushort} \usepackage{setspace} \usepackage[square,numbers]{natbib} \usepackage{url} 
\bibliographystyle{abbrvnat} \renewcommand{\vec}[1]{\ushort{#1}} \renewcommand{\vec}[1]{\mathbf{#1}} \definecolor{greenMUW}{RGB}{60,191,174} \definecolor{blueMUW}{RGB}{17,29,79} \definecolor{skinMUW}{RGB}{254,228,217} \definecolor{hellblauMUW}{RGB}{95,180,229} \definecolor{swhred}{RGB}{252,43,20} \usepackage{xcolor} \usepackage{graphicx} \usepackage{wrapfig} %----------------------------------------------- % START Set the colors % Uncomment to apply colors you want to use. %----------------------------------------------- \colorlet{themecolor}{swhred} \usebackgroundtemplate{\includegraphics{background}} %\colorlet{themecolor}{skinMUW} %\colorlet{themecolor}{blueMUW} %\usebackgroundtemplate{\includegraphics{MUW_skin.pdf}} %%\colorlet{themecolor}{blueMUW} %\colorlet{themecolor}{hellblauMUW} %\usebackgroundtemplate{\includegraphics{MUW_hellblau.pdf}} %----------------------------------------------- % END Set the colors %----------------------------------------------- %----------------------------------------------- % START Set the width of the columns %----------------------------------------------- \setlength{\paperwidth}{33.1in} % A0 width: 46.8in \setlength{\paperheight}{46.8in} % A0 height: 33.1in \newlength{\sepmargin} \newlength{\sepwid} \newlength{\onecolwid} \newlength{\twocolwid} \newlength{\threecolwid} % The following measures are used for 2 columns %\setlength{\sepmargin}{0.055\paperwidth} % Separation width (white space) between columns %\setlength{\sepwid}{0.03\paperwidth} % Separation width (white space) between columns %\setlength{\onecolwid}{0.43\paperwidth} % Width of one column %\setlength{\twocolwid}{0.9\paperwidth} % Width of two columns %----------------------------------------------------------- % The following measures are used for 3 columns \setlength{\sepmargin}{0.06\paperwidth} % Separation width (white space) between columns \setlength{\sepwid}{0.02\paperwidth} % Separation width (white space) between columns 
\setlength{\onecolwid}{0.28\paperwidth} % Width of one column \setlength{\twocolwid}{0.58\paperwidth} % Width of two columns %\setlength{\threecolwid}{0.88\paperwidth} % Width of three columns \setlength{\columnsep}{30pt} %----------------------------------------------- % END Set the width of the columns %----------------------------------------------- %-------------------------------------------------------------------------------------- % TITLE SECTION %-------------------------------------------------------------------------------------- \setbeamertemplate{title}[center] \setbeamertemplate{frametitle}[default][center] \setmainfont{Alegreya-sans} \subtitle{The creation of a new type of scientific deposit:} \title{Software} % Poster title \author{CCSD¹, HAL-Inria², Software Heritage³} % Author(s) \institute{Y. Barborini¹, R. Di Cosmo³, A.R. Dumont³, M. Gruenpeter³, B. Marmol¹, A. Monteil², J. Sadowska², S. Zacchiroli³} % Institution(s) %-------------------------------------------------------------------------------------- \begin{document} \addtobeamertemplate{block end}{}{\vspace*{1ex}} % White space under blocks \addtobeamertemplate{block alerted end}{}{\vspace*{0ex}} % White space under highlighted (alert) blocks \setlength{\belowcaptionskip}{2ex} % White space under figures \setlength\belowdisplayshortskip{1ex} % White space under equations \begin{frame}[t] % The whole poster is enclosed in one beamer frame \begin{columns}[t] % The whole poster consists of two major columns \begin{column}{\sepmargin}\end{column} \begin{column}{\twocolwid} % The first column \begin{block}{Software preservation: a scientific challenge} - Software has become an indisociable support of + Software has become an indissociable support of \textbf{technical and scientific knowledge}. The preservation of this universal body of knowledge has become as essential as preserving research articles and data sets. Software preservation is a pillar of \textbf{reproducibility}. 
\end{block} % space between cycle and intro (- up, + down) \vspace*{-1cm} \begin{figure} \raggedleft % making cycle more central (- to left, + to right) \hspace*{24cm} \includegraphics[width=.52\linewidth]{../../common/images/software_life_cycle_en.png} \end{figure} \end{column} \begin{column}{\sepmargin}\end{column} \begin{column}{\onecolwid} \begin{block}{} %\begin{multicols}{2} \end{block} \end{column} \begin{column}{\sepmargin}\end{column} \end{columns} % space between two columns and intro (- up, + down) \vspace*{-10cm} \begin{columns} \begin{column}{\sepmargin}\end{column} \begin{column}{\onecolwid} % space between one column and intro (- up, + down) \vspace*{-13 cm} In the quest for making scientific results reproducible, and pass the knowledge over to future -generations, the three main pillars are: \textbf{scientfic articles}, +generations, the three main pillars are: \textbf{scientific articles}, that describe the results, the \textbf{data sets} used or produced, and the \textbf{software} that embodies the logic of the data transformation[1]. \begin{figure} \vspace*{0cm} \includegraphics[width=.82\linewidth]{../../common/images/preservation_triangle_color.png} \caption{The pillars of knowledge preservation} \end{figure} % space between figure and text (- up, + down) \vspace*{-4cm} \begin{block} \end{block} \begin{block}{Software deposit} The collaboration between \textbf{Software Heritage (SWH), Hal-Inria and the CCSD} has resulted with a new type of scientific deposit in the national open archive.\\ Researchers have now the possibility to deposit \\ \textit{software} source code on Hal-Inria. 
\vspace*{-1cm} \begin{figure} \includegraphics[width=1.1\linewidth]{../../common/images/HAL_form.png} \caption{The form dedicated to software deposits} \end{figure} The steps for a software deposit: \begin{itemize} \item deposit a source code archive (.zip) \item choose deposit type: \textit{software} \item add associated metadata \item add the software authors \item accept the archival of the deposit on SWH \end{itemize} %\begin{flushright} %\footnotesize{* ouverture printemps 2018} %\end{flushright} \end{block} \end{column} \begin{column}{\sepmargin} \end{column} \begin{column}{\onecolwid} % space between cycle and caption (- up, + down) \vspace*{9cm} \begin{figure} \caption{The life cycle of research software} \end{figure} % space between caption and metadata block (- up, + down) \vspace*{-4cm} \begin{block} \textbf{The descriptive metadata} \\ To ensure an accurate description of the software, different metadata are available on the deposit form and are preserved with the software in the SWH archive. An example: \begin{table}[] \centering \small \begin{tabular}{llll} \textbf{Provided by the system:} & & \textbf{ \textit{MUST:}} & \\ - Hal identifier & & - title & \\ - publication date & & - description & \\ - swh-id & & - authors & \\ & & &\\ \textbf{\textit{SHOULD:} } & & \textbf{\textit{MAY:}} & \\ - license & & - dependencies & \\ - keywords & & - platform/OS & \\ - repository & & - funding & \\ \end{tabular} \end{table} \textbf{The intrinsic and persistent identifier} \\ To be able to reproduce an experiment, knowing the exact version of the software used is essential. Software Heritage will provide the \textit{swh-id}, intrinsically bound to software components, ensuring persistent traceability across future development and organizational changes. -The \textit{swh-id}, like a finger print of the Software is specific, +The \textit{swh-id}, like a fingerprint of the Software is specific, persistent and unique. It does not depend on an ID resolver. 
\end{block} \begin{figure} \includegraphics[width=1\linewidth]{../../common/images/HAL_deposit.png} \caption{The deposit on Hal-Inria} \end{figure} \end{column} \begin{column}{\sepmargin} \end{column} \begin{column}{\onecolwid} \end{column} \begin{column}{\sepmargin} \end{column} \end{columns} \begin{columns} \begin{column}{\sepmargin} \end{column} \begin{column}{\twocolwid} % space between deposit and actors (- up, + down) \vspace*{-4cm} \begin{block}{The actors} \textbf{Software Heritage} took the challenge to collect, preserve and share all software that is publicly available in source code form. \textbf{Hal-Inria} is the open archive of Inria- The French Institute for Research in Computer Science and Automation. -Hal-Inria provides, since 2005, access to the Hal platform, developped by the +Hal-Inria provides, since 2005, access to the Hal platform, developed by the \textbf{ CCSD}- The Center for Direct Scientific Communication. Its main mission is to provide tools, in the respect of open access principles, for archiving and dissemination of scientific publications and data. \end{block} \end{column} \begin{column}{\sepmargin} \end{column} \begin{column}{\onecolwid} % space before column start (- up, + down) \vspace*{-80.5cm} \begin{block}{Transfer deposit to SWH} Once the deposit is validated, it is pushed to SWH using SWORD protocol. SWH will proceed with the injection of the source code into \textbf{Alexandria's Library of Software} and will generate the intrinsic identifier - the \textit{swh-id}. Hal retrieves the \textit{swh-id} to use in the citation format. 
\vspace*{1cm} \begin{figure} \centering \includegraphics[width=.9\linewidth]{../../common/images/deposit_communication_en.png} \caption{The software deposit workflow} \end{figure} \vspace*{-1cm} \begin{figure} \includegraphics[width=.9\linewidth]{../../common/images/SWH_deposit.png} \caption{Browse the deposit on Software Heritage} \end{figure} \end{block} \vspace*{-1.3cm} \begin{block}{Software citation} Following the software citation principles[2] and thus considering that software is a legitimate and citable product of research, we have proposed a citation format containing metadata submitted with the software. \begin{figure} \centering \includegraphics[width=.9\linewidth]{../../common/images/citation_format_en.png} \caption{Software citation format[3]} \end{figure} \vspace*{-.5cm} Citation is essential for promoting the recognition of software as a valuable research output, and ensuring that the authors have their contributions recognised and rewarded[4]. \vspace*{-.5cm} \end{block} \begin{block}{\large Références} \small 1.Roberto Di Cosmo, Stefano Zacchiroli (2017) Software Heritage: Why and How to Preserve Software Source Code. iPRES 2017. https://hal.archives-ouvertes.fr/hal-01590958\\ 2.Smith et al. (2016), Software citation principles. PeerJ Comput. Sci. 2:e86; DOI 10.7717/peerj-cs.862.\\ 3.Yolanda Gil (2015) Documenting Software through Metadata. Geosoft.\\ 4.Mike Jackson (2014) How to cite and describe software. 
The Software Sustainability Institute https://www.software.ac.uk/how-cite-and-describe-software \end{block} \end{column} \begin{column}{\sepmargin} \end{column} \end{columns} \vspace*{1cm} \includegraphics[width=.18\linewidth]{../../common/logos/CC_by_sa.png} \end{frame} \end{document} diff --git a/talks-public/2018-03-22-RDA-IG/2018-03-22-RDA-IG-outlook.org b/talks-public/2018-03-22-RDA-IG/2018-03-22-RDA-IG-outlook.org new file mode 100644 index 0000000..f089d40 --- /dev/null +++ b/talks-public/2018-03-22-RDA-IG/2018-03-22-RDA-IG-outlook.org @@ -0,0 +1,85 @@ +#+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) +#+TITLE: General Outlook and Call for Action +#+SUBTITLE: Metadata, identifiers and reproducibility +# does not allow short title, so we override it for beamer as follows : +#+BEAMER_HEADER: \author[Roberto Di Cosmo]{Roberto Di Cosmo (Software Heritage, INRIA)} +# -*- org-image-actual-width: nil; -*- +#+DATE: March 22nd, 2018 +#+EMAIL: roberto@dicosmo.org +#+DESCRIPTION: General Outlook +#+KEYWORDS: software heritage legacy preservation knowledge mankind technology outlook policy legal action +# + +# +# Prelude contains all the information needed to export the main beamer latex source +# + +#+INCLUDE: "../../common/modules/prelude-toc.org" :minlevel 1 +# +#+INCLUDE: "../../common/modules/169.org" + +* Software artefacts for (some sectors of) Science +** Pressure to make the source code available is rising +*** Why + Necessary to + - /reproduce/, + - /modify/ and /evolve/, *building new experiments* from old ones +#+BEAMER: \pause +*** When and where + - debate started end of first 2000 decade (biology, statistics, medicine, etc.)
+ - growing in Computer Science since the ESEC/FSE 2011 Artifact Evaluation context (winner: Vouillon and Di Cosmo); + see http://www.artifact-eval.org/ +** ACM take on Reproducibility, Replicability and Source code + ACM policies: [[https://www.acm.org/publications/policies/artifact-review-badging][Artifact Review and Badging]] +*** Terminology (not consensual yet!) + :PROPERTIES: + :BEAMER_col: 0.5 + :BEAMER_env: block + :END: + - *Repeatability* \\ same team, same experimental setup + - *Replicability* \\ different team, same experimental setup + - *Reproducibility* \\ different team, different experimental setup +#+BEAMER: \pause +*** Badging software artefacts + :PROPERTIES: + :BEAMER_col: 0.4 + :BEAMER_env: block + :END: +#+latex: \begin{center} + #+ATTR_LATEX: :width 0.6\linewidth +# file:file:metadata_landscape_final.png +file:acm_badges.png +#+latex: \end{center} +#+BEAMER: \pause +** Why it is important +*** Metadata + \hfill Evaluated software artifacts must be properly *described* +*** Archival + \hfill Evaluated software artifacts must be properly *archived* +*** Identification + \hfill Evaluated software artifacts must be properly *referenced* +*** Citation + \hfill Evaluated software artifacts must be properly *cited* /(not the same as referenced)/ +#+BEAMER: \pause +*** +#+latex: \centerline{This is happening \emph{outside} of RDA} +* Software ignored and endangered +** EU Copyright reform +*** The points of attention +#+latex: {\small \url{http://eur-lex.europa.eu/legal-content/EN/TXT/?uri=COM:2016:593:FIN}} + - Art. 3 (TDM) :: Limitations to text and data mining +# - Art. 11 (Snippets) :: Limitations to short citation/snippets + - Art. 13 (Upload filtering) :: threatens Open Access/Data/Source/Science as a whole! 
+#+BEAMER: \pause +*** Calendar and actions + - vote of EU parliament expected end of April, *please act now* + + Research http://sparceurope.org/copyrightreform + + Free Software https://fsfe.org/news/2017/news-20170908-01.en.html + + C4C https://copyright4creativity.eu + + VoxScientia https://voxscientia.eu/ + \hfill /More info offline/ +** OECD +#+latex: \vfill\centerline{\Large Off the record}\vfill +* Conclusion +** Conclusion +#+latex: \vfill\centerline{\Large We have a lot to do!}\vfill diff --git a/talks-public/2018-03-22-RDA-IG/2018-03-22-RDA-IG.org b/talks-public/2018-03-22-RDA-IG/2018-03-22-RDA-IG.org new file mode 100644 index 0000000..d1e3e4e --- /dev/null +++ b/talks-public/2018-03-22-RDA-IG/2018-03-22-RDA-IG.org @@ -0,0 +1,287 @@ +#+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) +#+TITLE: Metadata, use cases and identifiers +#+SUBTITLE: Metadata, use cases and identifiers +# does not allow short title, so we override it for beamer as follows : +# +BEAMER_HEADER: \title[Availability and traceability]{Preserving Software and Data} +#+BEAMER_HEADER: \author[Roberto Di Cosmo, Morane Gruenpeter]{Roberto Di Cosmo (SWH, INRIA)\\ Morane Gruenpeter (SWH, Crossminer)} + + +#+AUTHOR:Roberto Di Cosmo (SWH, INRIA)\\ Morane Gruenpeter (SWH, Crossminer) +#+DATE: March 22nd, 2018 +#+EMAIL: morane@softwareheritage.org +#+DESCRIPTION: Intrinsic identifiers for digital objects +#+KEYWORDS: software heritage legacy preservation knowledge mankind technology +# + +# +# Prelude contains all the information needed to export the main beamer latex source +# + +#+INCLUDE: "../../common/modules/prelude-toc.org" :minlevel 1 +# +#+INCLUDE: "../../common/modules/169.org" + +* Explore the metadata landscape +** The metadata challenge +#+latex: \begin{center} \huge{What is software ?} \end{center} +#+BEAMER: \pause +*** Software as a concept :B_block:BMCOL: + :PROPERTIES: + :BEAMER_col: 0.5
+ :BEAMER_env: block + :END: + - software project / entity +#+BEAMER: \pause + - the creators and the community around it +#+BEAMER: \pause + +*** Software artifact :B_block:BMCOL: + :PROPERTIES: + :BEAMER_col: 0.5 + :BEAMER_env: block + :END: + - the binaries for different environments +#+BEAMER: \pause + - the *software source code* for each version + + +** The metadata landscape +#+latex: \begin{center} + #+ATTR_LATEX: :width \linewidth +# file:file:metadata_landscape_final.png +file:metadata_landscape_RDA11_1.png +#+latex: \end{center} + +** The metadata landscape +#+latex: \begin{center} + #+ATTR_LATEX: :width \linewidth +# file:file:metadata_landscape_final.png +file:metadata_landscape_RDA11_2.png +#+latex: \end{center} + +** The metadata landscape +#+latex: \begin{center} + #+ATTR_LATEX: :width \linewidth +# file:file:metadata_landscape_final.png +file:metadata_landscape_RDA11_3.png +#+latex: \end{center} + +** The metadata landscape +#+latex: \begin{center} + #+ATTR_LATEX: :width \linewidth +# file:file:metadata_landscape_final.png +file:metadata_landscape_RDA11_4.png +#+latex: \end{center} +** The metadata landscape +#+latex: \begin{center} + #+ATTR_LATEX: :width \linewidth +# file:file:metadata_landscape_final.png +file:metadata_landscape_RDA11_5.png +#+latex: \end{center} + + +** The metadata landscape +#+latex: \begin{center} + #+ATTR_LATEX: :width \linewidth +# file:file:metadata_landscape_final.png +file:metadata_landscape_RDA11.png +#+latex: \end{center} +* 10th RDA plenary Software Source Code IG results +** 10th RDA plenary Software Source Code IG results +*** Subjects discussed with questionnaire + - interest in /Software Source Code/ + - use cases + - ontology/vocabularies used + - properties needed for Software Source Code + - advantages for structured data + +** Interest in IG +*** Research topics :B_block:BMCOL: + :PROPERTIES: + :BEAMER_col: 0.5 + :BEAMER_env: block + :END: + - software accompany data + - promote software as a first class 
research product + - software citation + - research software + - improve publication +#+BEAMER: \pause + +*** General topics :B_block:BMCOL: + :PROPERTIES: + :BEAMER_col: 0.5 + :BEAMER_env: block + :END: + - PID for software + - managing code : incorporate better practices for software + - discover and recover software + - reuse + - preserving software source code + +** Identified use cases +*** Research use cases :B_block:BMCOL: + :PROPERTIES: + :BEAMER_col: 0.5 + :BEAMER_env: block + :END: + - publish / deposit source code with metadata + - credit attribution and authorship + - reproducibility + - what test data are available + - research software source code + +#+BEAMER: \pause +*** General use cases :B_block:BMCOL: + :PROPERTIES: + :BEAMER_col: 0.5 + :BEAMER_env: block + :END: + - archive software + - expose metadata to indexes + - link to people, data, funding + - discovery (semantic search) + - conditions/restrictions for use + - build software (what compiler is required) + - integrate into workflow + +** Metadata terms +*** identify :B_block:BMCOL: + :PROPERTIES: + :BEAMER_col: 0.2 + :BEAMER_env: block + :END: + - identifier + - title + - authors + - version + - type + - origin source +#+BEAMER: \pause +*** execute :B_block:BMCOL: + :PROPERTIES: + :BEAMER_opt: + :BEAMER_env: block + :BEAMER_col: 0.2 + :END: + - link to a compiled version + - repository + - compiler + - environment + - examples +#+BEAMER: \pause +*** classify :B_block:BMCOL: + :PROPERTIES: + :BEAMER_col: 0.2 + :BEAMER_env: block + :END: + - description + - keywords + - in/out data + - references + - algorithms + - docs url +#+BEAMER: \pause + +*** administrate :B_block:BMCOL: + :PROPERTIES: + :BEAMER_col: 0.2 + :BEAMER_env: block + :END: + - contact + - authorship + - funders + - license + - editor (publisher) + - dates + - status +#+BEAMER: \pause +*** /Group activity/: review, comments and questions + +* Updates from, and links to Software Heritage +** The Software Heritage Project \hfill 
www.softwareheritage.org + :PROPERTIES: + :CUSTOM_ID: mission + :END: +#+latex: \begin{center} +#+ATTR_LATEX: :width .8\linewidth +# file:SWH-logo+motto.pdf +file:SWH-logo.pdf +#+latex: \end{center} +*** Our mission + *Collect*, *preserve* and *share* the /source code/ of /all the software/\\ +\mbox{}\\ + \hfill /Preserving/ the past, /enhancing/ the present, /preparing/ the future +*** Going global :noexport: + \hfill building an /open, multistakeholder, nonprofit/ organisation + + +** Archive and observatory, serving the needs of society as a whole +#+latex: \begin{center} +#+ATTR_LATEX: :width .6\linewidth +file:SWH-as-foundation-slim.png +#+latex: \end{center} +#+BEGIN_EXPORT latex +\note{On top of Software Heritage one can imagine a myriad applications, for education, +research, industry, cultural heritage, and society as a whole.\\[1em] +But building the universal archive of source code geared towards the long term is a grand challenge +on its own: so we follow the Unix philosophy, and focus on doing one thing, and doing it well, +building this essential infrastructure for software.\\[1em] +Our principles are simple: all our code is and will be open source, our organisation is transparent;\\[1em] +and we focus on the long term: we will grow an international network of mirrors and partners +and create a non profit foundation to coordinate it for the benefit of society as a whole.} +#+END_EXPORT + +\begin{center} + \includegraphics[width=.7\linewidth]{growth.png} +\end{center} +*** + \hfill largest collection of software source code in the world + +** Our challenge in the PID arena +*** Our requirements + - Long term :: identifiers must be there for the long term + - Free :: one cannot /buy/ billions of identifiers + - No middle man :: identifiers must be meaningful even if resolvers go away + - Integrity, not just naming :: identifier must ensure that the retrieved object is the intended one + - Uniqueness by design :: only one name for each object, each
object has only one name +#+BEAMER: \pause +*** We can find no satisfaction... + - Ark, PURLs, DOIs, Handle, ... all miss a part of it + - we use cryptographic hashes instead (Merkle trees, circa 1979) +** Back to basics: DIOs vs. IDOs +*** DIO (digital identifier of an object) :B_block:BMCOL: + :PROPERTIES: + :BEAMER_col: 0.5 + :BEAMER_env: block + :END: + - digital identifiers for traditional (non digital) objects + - epistemic complications and significant governance issues, ... +#+BEAMER: \pause + \hfill The \alert{software concept/project} needs a DIO +#+BEAMER: \pause +*** IDO (identifier of a digital object) :B_block:BMCOL: + :PROPERTIES: + :BEAMER_col: 0.5 + :BEAMER_env: block + :END: + - (digital) identifier for digital objects + - simpler to build/handle and can be intrinsic +#+BEAMER: \pause + \hfill The \alert{software source code} needs an IDO for each version or state +#+BEAMER: \pause +*** Separation of concerns + - yes, we \alert{need both} DIOs and IDOs + - no, we \alert{must not mistake} DIOs for IDOs (and vice versa) + +* Conclusion +** Request for comment +*** Feedback on our PID schemas + see https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html +*** Test the archive navigation in Software Heritage + https://archive.softwareheritage.org/ (user: *rda* passwd: *2018*) +*** Thoughts on the DIO / IDO conceptualization + contact Roberto and Morane +** + +#+latex: \begin{center} \huge{Questions ?} \end{center} diff --git a/talks-public/2018-03-22-RDA-IG/Makefile b/talks-public/2018-03-22-RDA-IG/Makefile new file mode 100644 index 0000000..68fbee7 --- /dev/null +++ b/talks-public/2018-03-22-RDA-IG/Makefile @@ -0,0 +1 @@ +include ../Makefile.slides