diff --git a/common/images/2022-09-14-archive-growth.png b/common/images/2022-09-14-archive-growth.png new file mode 100644 index 0000000..b41a701 Binary files /dev/null and b/common/images/2022-09-14-archive-growth.png differ diff --git a/common/images/git-merkle/merkle-vertical-1.pdf b/common/images/git-merkle/merkle-vertical-1.pdf new file mode 100644 index 0000000..1b63a27 Binary files /dev/null and b/common/images/git-merkle/merkle-vertical-1.pdf differ diff --git a/common/images/git-merkle/merkle-vertical-2-contents.pdf b/common/images/git-merkle/merkle-vertical-2-contents.pdf new file mode 100644 index 0000000..144803c Binary files /dev/null and b/common/images/git-merkle/merkle-vertical-2-contents.pdf differ diff --git a/common/images/git-merkle/merkle-vertical-3-directories.pdf b/common/images/git-merkle/merkle-vertical-3-directories.pdf new file mode 100644 index 0000000..ae07b03 Binary files /dev/null and b/common/images/git-merkle/merkle-vertical-3-directories.pdf differ diff --git a/common/images/git-merkle/merkle-vertical-4-revisions.pdf b/common/images/git-merkle/merkle-vertical-4-revisions.pdf new file mode 100644 index 0000000..d0a7e55 Binary files /dev/null and b/common/images/git-merkle/merkle-vertical-4-revisions.pdf differ diff --git a/common/images/git-merkle/merkle-vertical-5-releases.pdf b/common/images/git-merkle/merkle-vertical-5-releases.pdf new file mode 100644 index 0000000..be51434 Binary files /dev/null and b/common/images/git-merkle/merkle-vertical-5-releases.pdf differ diff --git a/common/images/ngyro-com-pog-reports-guix-coverage-2022-09-14.png b/common/images/ngyro-com-pog-reports-guix-coverage-2022-09-14.png new file mode 100644 index 0000000..b7aa7ca Binary files /dev/null and b/common/images/ngyro-com-pog-reports-guix-coverage-2022-09-14.png differ diff --git a/common/images/ngyro-com-pog-reports-guix-coverage-2022-09-14.svg b/common/images/ngyro-com-pog-reports-guix-coverage-2022-09-14.svg new file mode 100644 index 
0000000..fd1984c --- /dev/null +++ b/common/images/ngyro-com-pog-reports-guix-coverage-2022-09-14.svg @@ -0,0 +1,2105 @@ + + + +Gnuplot +Produced by GNUPLOT 5.4 patchlevel 2 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0 + + + + + 2000 + + + + + 4000 + + + + + 6000 + + + + + 8000 + + + + + 10000 + + + + + 12000 + + + + + 14000 + + + + + 16000 + + + + + 18000 + + + + + 20000 + + + + + 2019-05-05 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 2019-07-03 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 2019-09-01 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 2019-11-10 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 2020-01-05 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 2020-03-01 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 2020-05-03 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 2020-07-05 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 2020-09-06 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 2020-11-01 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 2021-01-03 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 2021-03-01 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 2021-05-02 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 2021-07-04 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 2021-09-05 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 2021-11-07 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 2022-01-02 + + + + + + + + + + + + + + 
Stored + + + Stored + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Missing + + + Missing + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Unknown + + + Unknown + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/talks-public/2022-09-16-Guix/2022-09-16.org b/talks-public/2022-09-16-Guix/2022-09-16.org new file mode 100644 index 0000000..9b7eaba --- /dev/null +++ b/talks-public/2022-09-16-Guix/2022-09-16.org @@ -0,0 +1,405 @@ +#+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) +#+KEYWORDS: software heritage reproducibility guix +#+TITLE: Software Heritage and Guix +#+SUBTITLE: Software Heritage to the rescue of reproducible Science +#+AUTHOR: vlorentz, ardumont +#+EMAIL: vlorentz@softwareheritage.org, ardumont@softwareheritage.org +#+DATE: 16 Sep 2022 +#+BEAMER_HEADER: \date[16/09/2022]{16/09/2022\\Event 10 years of Guix, Paris 2022} +#+BEAMER_HEADER: \author{Valentin Lorentz (@vlorentz) / Antoine R. 
Dumont (@ardumont)} +#+BEAMER_HEADER: \institute[Software Heritage]{Software Engineers, Software Heritage\\Inria} + +#+BEAMER_HEADER: \setbeameroption{hide notes} +#+LATEX_HEADER: \usepackage{tcolorbox} +#+LATEX_HEADER: \definecolor{links}{HTML}{2A1B81} +#+LATEX_HEADER: \hypersetup{colorlinks,linkcolor=,urlcolor=links} + +# Syntax highlighting setup +#+LATEX_HEADER_EXTRA: \usepackage{minted} +#+LaTeX_HEADER_EXTRA: \usemintedstyle{emacs} +#+name: setup-minted +#+begin_src emacs-lisp :exports results :results silent + (setq org-latex-listings 'minted) + (setq org-latex-to-pdf-process + '("pdflatex -shell-escape -interaction nonstopmode -output-directory %o %f" + "pdflatex -shell-escape -interaction nonstopmode -output-directory %o %f" + "pdflatex -shell-escape -interaction nonstopmode -output-directory %o %f")) + (add-to-list 'org-latex-minted-langs '("emacs-lisp" "common-lisp")) +#+end_src +# End syntax highlighting setup + +# +# prelude.org contains all the information needed to export the main beamer latex source +# use prelude-toc.org to get the table of contents +# + +#+INCLUDE: "../../common/modules/prelude-toc.org" :minlevel 1 + +#+INCLUDE: "../../common/modules/169.org" + +# +LaTeX_CLASS_OPTIONS: [aspectratio=169,handout,xcolor=table] + +#+LATEX_HEADER: \usepackage{bbding} +#+LATEX_HEADER: \DeclareUnicodeCharacter{66D}{\FiveStar} + +# +# If you want to change the title logo it's here +# +# +BEAMER_HEADER: \titlegraphic{\includegraphics[width=0.5\textwidth]{SWH-logo}} + +# aspect ratio can be changed, but the slides need to be adapted +# - compute a "resizing factor" for the images (macro for picblocks?) 
+# +# set the background image +# +# https://pacoup.com/2011/06/12/list-of-true-169-resolutions/ +# +#+BEAMER_HEADER: \pgfdeclareimage[height=90mm,width=160mm]{bgd}{swh-world-169.png} +#+BEAMER_HEADER: \setbeamertemplate{background}{\pgfuseimage{bgd}} +#+LATEX: \addtocounter{framenumber}{-1} + +* Introduction: the Software Heritage project + +** What is SoftwareHeritage? + :PROPERTIES: + :CUSTOM_ID: spread + :END: + +#+latex: \begin{center} +#+ATTR_LATEX: :width \extblockscale{.6\linewidth} +file:SWH-logo+motto.pdf +#+latex: \end{center} + + The Universal Source Code Archive + +** Why an archive? Software is spread all around + :PROPERTIES: + :CUSTOM_ID: spread + :END: + #+latex: \begin{flushleft} + #+ATTR_LATEX: :width \extblockscale{.5\linewidth} + file:myriadsources.png + #+latex: \end{flushleft} + +*** Fashion victims + - many development platforms (popular forges: Guix, PyPI, npm, ...) + - various distribution places (standalone forges: gitlab, heptapod, cgit, gitea...) + - projects tend to migrate from one place to another over time + +#+BEAMER: \pause + +*** One place... :B_block: + :PROPERTIES: + :BEAMER_env: block + :END: + \hfill ... where can we find, track, search and recover /all/ source code? + +** Why an archive? Software is fragile + :PROPERTIES: + :CUSTOM_ID: fragile + :END: + #+latex: \begin{flushleft} + #+ATTR_LATEX: :width \extblockscale{.5\linewidth} + file:fragilecloud.png + #+latex: \end{flushleft} + +*** Like all digital information, FOSS is fragile +# - inconsiderate and/or malicious code loss (e.g., Code Spaces) + - link rot: projects are created, moved around, removed + - data rot: physical media with legacy software decay + - business-driven code loss (e.g. Gitorious, Google Code, Bitbucket, ...) + +#+BEAMER: \pause + +*** If a website disappears you go to the Internet Archive... :B_block: + :PROPERTIES: + :BEAMER_env: block + :END: + \hfill where do you go if (a repository on) GitHub or GitLab goes away? 
+ +** Software Heritage in a Nutshell + +#+latex: \begin{center} +#+ATTR_LATEX: :width \extblockscale{.6\linewidth} +file:SWH-logo+motto.pdf +#+latex: \end{center} + +*** Main Objectives +- *Collect*, *Preserve* and *Share* + +** Collect / Preserve + +*** Reference catalog + :PROPERTIES: + :BEAMER_env: block + :BEAMER_COL: .3 + :END: +#+BEGIN_EXPORT latex +\begin{center} +\includegraphics[width=.4\linewidth]{myriadsources} +\end{center} +#+END_EXPORT + *find* and *reference* all software source code + +#+BEAMER: \pause + +*** Universal archive + :PROPERTIES: + :BEAMER_env: block + :BEAMER_COL: .3 + :END: +#+BEGIN_EXPORT latex +\begin{center} +\includegraphics[width=.4\linewidth]{fragilecloud} +\end{center} +#+END_EXPORT + *preserve* *forever* archived software source code + +** Share + +*** Research infrastructure :B_block: + :PROPERTIES: + :BEAMER_COL: .3 + :BEAMER_env: block + :END: +#+BEGIN_EXPORT latex +\begin{center} +\includegraphics[width=.4\linewidth]{atacama-telescope} +\end{center} +#+END_EXPORT + - *enable analysis* of software source code + - make every piece *identifiable* + - and freely *available*... + +#+BEAMER: \pause + +*** Reproducibility :B_block: + :PROPERTIES: + :BEAMER_COL: .3 + :BEAMER_env: block + :END: +#+BEGIN_EXPORT latex +\begin{center} +\includegraphics[width=.4\linewidth]{atacama-telescope} +\end{center} +#+END_EXPORT + - ... 
*exactly* as it was when archived (as much as possible) + - *for all* research software artefacts + +** Our principles + :PROPERTIES: + :CUSTOM_ID: principlesstatus + :END: +#+latex: \begin{center} +#+ATTR_LATEX: :width .6\linewidth +file:SWH-as-foundation-slim.png +#+latex: \end{center} +#+latex: \footnotesize\vspace{-3mm} + +** Under the hood: Automation and storage + :PROPERTIES: + :CUSTOM_ID: automation + :END: + #+BEAMER: \begin{center} + #+BEAMER: \only<1>{\includegraphics[width=\extblockscale{\textwidth}]{swh-dataflow-merkle.pdf}} + #+BEAMER: \end{center} + +** Coverage status + + /Global development history/ *permanently archived* in a *uniform data model* + - over *12 billion* unique source files from over *180 million* software projects + - *~900 TB* (uncompressed) blobs, *~25 B* nodes, *~300 B* edges + + #+LATEX: \centering + #+ATTR_LATEX: :width \extblockscale{.8\linewidth} + [[file:2022-09-14-archive-growth.png]] + +* Reference archived code with SWHIDs +** Uniform Data Model + + #+LATEX: \centering + #+LATEX: \only<1>{\colorbox{white}{\includegraphics[width=160px]{git-merkle/merkle-vertical-1.pdf}}} + #+LATEX: \forcebeamerend + +** Data Model: A worked example + #+LATEX: \centering\forcebeamerstart + #+LATEX: \only<1>{\colorbox{white}{\includegraphics[width=\extblockscale{\linewidth}]{git-merkle/contents.pdf}}} + #+LATEX: \only<2>{\colorbox{white}{\includegraphics[width=160px]{git-merkle/merkle-vertical-2-contents.pdf}}} + #+LATEX: \only<3>{\colorbox{white}{\includegraphics[width=\extblockscale{\linewidth}]{git-merkle/directories.pdf}}} + #+LATEX: \only<4>{\colorbox{white}{\includegraphics[width=160px]{git-merkle/merkle-vertical-3-directories.pdf}}} + #+LATEX: \only<5>{\colorbox{white}{\includegraphics[width=\extblockscale{\linewidth}]{git-merkle/revisions.pdf}}} + #+LATEX: \only<6>{\colorbox{white}{\includegraphics[width=160px]{git-merkle/merkle-vertical-4-revisions.pdf}}} + #+LATEX: 
\only<7>{\colorbox{white}{\includegraphics[width=\extblockscale{\linewidth}]{git-merkle/releases.pdf}}} + #+LATEX: \only<8>{\colorbox{white}{\includegraphics[width=160px]{git-merkle/merkle-vertical-5-releases.pdf}}} + #+LATEX: \only<9>{\colorbox{white}{\includegraphics[width=\extblockscale{\linewidth}]{git-merkle/snapshots.pdf}}} + #+LATEX: \forcebeamerend + +** Meet the SWHID intrinsic identifiers + + :PROPERTIES: + :CUSTOM_ID: oneslide + :END: + #+LATEX: \centering + #+LATEX: \only<1>{\includegraphics[width=\linewidth]{SWHID-v1.4_3.png}} + #+LATEX: \forcebeamerend \vspace{-6mm} + +* Collaboration Guix / SWH + +** How does this relate to Guix? + +- Nothing is eternal, source code (in all forms) disappears +- Hopefully, SWH keeps a copy of everything +- [[https://www.softwareheritage.org/2019/04/18/software-heritage-and-gnu-guix-join-forces-to-enable-long-term-reproducibility/][Guix ensures source code is archived in SWH when building]] ("Save Code Now") +- After source code actually disappears, falls back to SWH when rebuilding ("Software Heritage Vault") + +** History of Guix / SWH integration + +- 2018: Guix / SWH to ensure source code artifacts are pushed to SWH +- 2019: TWEAG / Guix / SWH: Work on a new loader to regularly ingest new artifacts +- 2022: ongoing work to refactor current loader into a standard lister/loader + +** Reproducibility is of the essence! + +*** Current situation + +- Persistent intrinsic identifiers (SWHID) are not (yet?) package manager standard +- Guix (and other) package managers reference tarball hashes, instead of hashes of the content + +#+begin_src emacs-lisp +(define-public ... + (package + ... + (source (origin (method url-fetch) + (uri (string-append + "https://..." version ".tar.gz")) + (sha256 (base32 "03mwi1l3354x52nar...")))) + ... +#+end_src + +#+BEAMER: \pause + +*** Solutions + +- make (non-specific swh) SWHID standard or rebuild original bit-for-bit tarball + +** How to rebuild original tarballs? 
+ +*** pristine-tar + +- https://manpages.debian.org/bullseye/pristine-tar/pristine-tar.1.en.html +- ~xdelta~: binary diffs of tar headers' content and order +- ~zgz~: guessing compression parameters + +*** Limitations + +- it is brittle (relies on reference Tar implementation) +- it produces large diffs + +* Enters... Disarchive + +** How it started + +*** Discussions + - "gforge.inria.fr to be taken off-line in Dec. 2020" https://issues.guix.gnu.org/42162 + - "lookup ingested tarballs by container checksum" https://forge.softwareheritage.org/T2430 + +*** New software + - Disarchive by Timothy Sample https://git.ngyro.com/disarchive/ + +** How it works: + +*** Principles + + - Manifest of tarball fields: entry order, PAX headers, ... + - References the root directory in SWH + - WIP: guessing compression parameters/implementations (using ~zgz~) + - -> rebuild original ~.tar~, then original ~.tar.{gz,xz}~ + +** Example manifest (1/2) + +*** Example manifest (1/2) + +#+begin_src emacs-lisp +(disarchive + (version 0) + (tarball + (name "test-archive.tar") + (digest (sha256 "0da9fa3e7b360533678338871d9dd36f3...")) + (default-header + (chksum (trailer " ")) + (magic "ustar ") + (version " \x00") + (devmajor 0 (source "" (trailer ""))) + (devminor 0 (source "" (trailer ""))) + (data-padding "")) +... +#+end_src + +** Example manifest (2/2) + +*** Example manifest (2/2) + +#+begin_src emacs-lisp +(disarchive +... 
+ (headers + ("test-archive/" (mode 493) (chksum 4291) (typeflag 53)) + ("test-archive/file-a" (size 15) (chksum 4849)) + ("test-archive/file-b" (size 15) (chksum 4850))) + (padding 6656) + (input (directory-ref + (version 0) + (name "test-archive") + (addresses + (swhid "swh:1:dir:902b1e94f0f5efdde6...")) + (digest (sha256 "277decb2666f4832ef64a..."))))) +#+end_src + +** Planned integration of SWH with Disarchive + +*** Currently + + - SWH does not store Disarchive manifests yet + +*** Plan + + - Run Disarchive every time SWH loads a tarball + - Add the manifest to the Archive + - When someone requests ~tarball-hash~, rebuild from the manifest + +* Current Work in Progress + +** Nix/Guix manifests coverage in SWH + +*** Goal: 100% coverage + - Currently missing sources due to technical limitations: bare files, directories, patches + - Redesign in progress to deal with such limitations + + #+latex: \centering + #+ATTR_LATEX: :width \extblockscale{.8\linewidth} + file:ngyro-com-pog-reports-guix-coverage-2022-09-14.png + +** Disarchive + +*** Integration + - Code dump at https://git.ngyro.com/swh/ + - Needs to be reviewed and merged + +* The End + +** The End + +*** Links + +- [[https://www.softwareheritage.org][Software Heritage website]] +- [[https://archive.softwareheritage.org][SWH's Archive]] +- [[https://forge.softwareheritage.org][Development forge]] +- [[https://guix.gnu.org/manual/en/html_node/Invoking-guix-lint.html#Invoking-guix-lint][guix lint]] + +*** We are hiring + +[[https://www.softwareheritage.org/jobs/][We are hiring devs and sysadmins]] + +*** Questions? + +And thanks for your time!
diff --git a/talks-public/2022-09-16-Guix/2022-09-16_transcript.md b/talks-public/2022-09-16-Guix/2022-09-16_transcript.md new file mode 100644 index 0000000..c8b4dc1 --- /dev/null +++ b/talks-public/2022-09-16-Guix/2022-09-16_transcript.md @@ -0,0 +1,323 @@ +# Software Heritage and Guix + +Software Heritage (SWH) to the rescue of reproducible Science + +# Introduction: the Software Heritage project + +Welcome to our Software Heritage / Guix presentation. We'll be talking about how SWH +helps reproducible science. + +First a bit of information about us. We are Valentin Lorentz and Antoine Dumont, +Software engineers @ SWH. + +With the other members of the technical team (around 8 devs and sysadmins), we are in +charge of developing, reviewing, and maintaining SWH services & tools, and keeping them in good shape. + +## What is Software Heritage? + +It's the universal source code Archive, with an emphasis on **archiving all** software +source code available. + +## Why an archive? Software is spread all around + +### Fashion victims + +Well, as of today, software is everywhere. There exists a multitude of development +platforms, and various software distribution places. And projects tend to move around, +migrating from one place to another... + +### One place... + +So we need at least one place where we can find, track, search and recover all source +code. + +## Why an archive? Software is fragile + +Because, among other things, those various places are subject to + +### Like all digital information, FOSS is fragile + +link rot, data rot, or are simply taken out of service because they are no longer economically +viable at some point. + +### If a website disappears you go to the Internet Archive... + +'s wayback machine. + +But where do you go when a repository you depend upon disappears? Hopefully, SWH. + +## Software Heritage in a Nutshell + +### Main Objectives + +Collect / Preserve / Share + +## Collect / Preserve + +### Reference catalog + +to find and reference all software source code.
+ +### Universal archive + +to preserve forever archived software source code. + +## Share + +With the end goal of sharing it. + +### Research infrastructure + +It enables analysis of software source code, to make every piece identifiable and freely +available... + +### Reproducibility + +and from a reproducibility standpoint, exactly as it was when archived, for all research +software artefacts. + +## Our principles + +SWH aims to be the core foundation that helps the other actors of our societies that rely +on software: cultural heritage, industry, research and public administration. + +## Under the hood: Automation and storage + +Regarding coverage, our stack is mainly composed of listers, which are in charge of +discovering new origins by talking to multiple development platforms (PyPI, NPM, +OPAM, GitHub, Bitbucket, ...). Those origins are then scheduled for ingestion by +loaders. Loaders are in charge of dealing with the various technologies (git, hg, bzr, +cvs, svn, tarballs, pypi, npm, ...) to ingest the various objects into the SWH archive +(Merkle dag). + +## Coverage status + +As of today, we have permanently archived in our uniform data model ~12 billion +unique source files from ~200 million software projects. + +This amounts to around 900TB of uncompressed blobs and a graph of 25B nodes and 300B edges. + +# Reference archived code with SWHIDs + +## Uniform Data Model + +As we mentioned, it's a Merkle dag. This means that every leaf is labelled with a cryptographic hash +of its raw content, and every non-leaf element is labelled with a hash computed from its children's hashes. + +From bottom to top in order, we have: +- contents, which are the raw files +- directories (~folders) holding files or other folders +- revisions (~commits) targeting directories +- releases targeting either directories or revisions +- and finally snapshots, which are the top-level representation of the origin at a given + point in time. Each branch of a snapshot can target either a revision or a release.
+ +## Data Model: A worked example + +### Content + +Out of the file's content and length, we compute a bunch of hashes (sha1, sha2, ...) + +### Directory + +List the nodes of the folder as a manifest, and hash it to compute its id. + +### Revision + +Out of the metadata present in the commit (author, committer, date, ...), compute a +manifest and hash it to have its id. + +### Release + +Out of the metadata present in the tag/release (version, date, ...), compute a manifest +and hash it to have its id. + +### Snapshot + +Finally, create a manifest similar to the directory's, and hash that manifest to obtain +its id. + +## Meet the SWHID intrinsic identifiers + +Each of those elements in the dag has the unique intrinsic identifier we talked about, +which is what we call a SWHID. + +It's intrinsic because it can be computed from the object's raw content and/or its children's +content (as per the Merkle dag definition). + +# Collaboration Guix / SWH + +## How does this relate to Guix? + +Now what does this have to do with Guix? + +Well, as we saw, nothing is eternal, and source code in particular often disappears. +Guix depends on some of this source code, so we need first to archive it, then have a way to retrieve it.
+ +And this is exactly what we are doing (and by "we", I mean both Software Heritage and Guix developers): + +- Since November 2018, [Guix ensures source code is archived in SWH when building](https://www.softwareheritage.org/2019/04/18/software-heritage-and-gnu-guix-join-forces-to-enable-long-term-reproducibility/), via an API we call "Save Code Now" (it is also [available as a GUI](https://archive.softwareheritage.org/save/)) +- After source code actually disappears, Guix falls back to SWH when rebuilding (["Software Heritage Vault"](https://docs.softwareheritage.org/devel/swh-vault/getting-started.html)) + +The Vault is a component that accepts requests to rebuild specific artefacts, then fetches their content recursively in the large graph of the Software Heritage Archive, and reproduces the original artifact: git repository, tarball, ... + +This takes some time: typically a few minutes for a tarball, but up to a day or two to regenerate a git repository for large repositories like Linux. (Of course, resulting tarballs are cached.) + +## History of Guix / SWH integration + +- 2018: Guix / SWH to ensure source code artifacts are **pushed** to Software Heritage via the "Save Code Now" API +- 2019: TWEAG / Guix / SWH: Work on a new loader to regularly ingest new artifacts, by **pulling** from the Guix and Nix package repositories daily +- 2022: ongoing work to refactor current loader into a standard lister/loader, to actually support all source code referenced by Guix and Nix, as we are currently missing support for single files and patches + +## Reproducibility is of the essence! + +### Report + +Another specific issue we have is with tarballs. + +To load a tarball, the first thing Software Heritage does is decompress and unpack it; then it reads the files and directories it contains. + +On request, the Vault reads these files and directories from the archive, then packs them into tarballs and compresses them.
+ +Unfortunately, this loses some information from tarballs, because decompressing and unpacking a tarball is not injective: + +* order of entries +* unused fields in entry headers +* padding within headers and between entries +* etc. + +Not to mention compression, as the same file is compressed differently by different implementations, or even different versions of the same implementation. And then there are compression levels... + +And this is an issue, because not reproducing tarballs bit-for-bit fails Guix's integrity checks, like this one: + +``` +(define-public ... + (package + ... + (source (origin (method url-fetch) + (uri (string-append + "https://..." version ".tar.gz")) + (sha256 (base32 "03mwi1l3354x52nar...")))) + ... +``` + +because it checks the hash of the compressed tarball, instead of its content. + +### Conclusion + +So how do we reconcile Software Heritage's tarballs with Guix's integrity checks? + +The long term solution is to change Guix's integrity checks to use intrinsic identifiers (such as SWHIDs) to identify content. And we also need to do this for Nix, Debian, ... + +And it requires changing signature schemes too, to sign SWHIDs instead of signing tarballs themselves. + +In the meantime, we focused our efforts on making Software Heritage reproduce tarballs bit-for-bit; and the rest of this talk will be about how we are solving this problem. + +## How to rebuild original tarballs? + +### pristine-tar + +First, this is not a new problem. Debian has been doing something similar for a decade as part of their packaging workflow, using a tool called [pristine-tar](https://manpages.debian.org/bullseye/pristine-tar/pristine-tar.1.en.html) + +Here is how pristine-tar works: + +1. it takes a compressed tarball as source, decompresses then unpacks it +2. using a reference Tar implementation, it repacks it +3. then the original decompressed tarball, and the tarball repacked with the reference implementation go through a binary diff, called `xdelta`. 
It stores the resulting diff. +4. Finally, it bruteforces recompression parameters to find how to reproduce the original file from the decompressed tarball. These parameters include: implementation (there are plenty of gzip implementations...), version, and compression level. The tool they use for this is called `zgz`. + +pristine-tar then takes the `xdelta` output and the parameters guessed by `zgz`, and commits them in a git repository alongside the unpacked content. + +Yes, it's pretty ugly, but it works most of the time! + +Now the issue is that this is pretty brittle, because it relies on the reference Tar implementation being available forever, in order for the `xdelta` manifests to be usable. + +While this works fine for Debian's workflow, we are aiming for more than a decade, so it is not satisfactory for us. + +Additionally, it tends to produce rather large diffs, which can be an issue when archiving so many tarballs. + +# Enters... Disarchive + +## How it started + +["gforge.inria.fr to be taken off-line in Dec. 2020"](https://issues.guix.gnu.org/42162) is the ticket on the Guix bugtracker that started it. + +Inria's forge hosted quite a number of tarballs used by Guix, and this prompted this issue of reproducing tarballs. + +This renewed a recent discussion on Software Heritage's bugtracker: ["lookup ingested tarballs by container checksum"](https://forge.softwareheritage.org/T2430); where we evaluated pristine-tar and Timothy Sample announced Disarchive. + +## How it works: + +Instead of storing binary diffs, Disarchive produces a complete manifest of the tarball: entry order, PAX headers, ignored fields, header padding, padding between entries, ... + +However, it does not contain the content itself (otherwise we might as well store the tarball itself); and relies on the rest of the Software Heritage archive (or any filesystem-like archival system) to store it.
+ +Here is what such a manifest looks like: + +``` +(disarchive + (version 0) + (tarball + (name "test-archive.tar") + (digest (sha256 "0da9fa3e7b360533678338871d9dd36f3...")) + (default-header + (chksum (trailer " ")) + (magic "ustar ") + (version " \x00") + (devmajor 0 (source "" (trailer ""))) + (devminor 0 (source "" (trailer ""))) + (data-padding "")) + (headers + ("test-archive/" (mode 493) (chksum 4291) (typeflag 53)) + ("test-archive/file-a" (size 15) (chksum 4849)) + ("test-archive/file-b" (size 15) (chksum 4850))) + (padding 6656) + (input (directory-ref + (version 0) + (name "test-archive") + (addresses + (swhid "swh:1:dir:902b1e94f0f5efdde6...")) + (digest (sha256 "277decb2666f4832ef64a..."))))) +``` + +First it preserves the original tarball name, as well as its hash to allow double-checking when rebuilding. + +Then it stores the default value of various fields in entry headers, to avoid duplicating them. + +Then the bulk of the tarball: the list of entries, along with the specific values for fields in their headers. + +Finally, it references the root object, using its SWHID, so it can be retrieved from Software Heritage. + +## Planned integration of SWH with Disarchive + +SWH does not store Disarchive manifests yet. + +The plan is to: + +* Run Disarchive every time we load a new tarball, and store its manifest alongside the Archive. +* Run it on all tarballs we already loaded that haven't disappeared yet +* When someone requests a tarball with a specific hash, look up its manifest, and use it to rebuild the tarball + +# Current Work in Progress + +## Nix/Guix manifests coverage in SWH + +Coverage has been slowly increasing: https://ngyro.com/pog-reports/latest/ + +We are currently missing sources due to technical limitations: bare files, directories, patches. +This is due to the initial design of the Nix/Guix loader, and we are rewriting it from scratch to address it. + +We hope to get to 100% after this redesign.
+ +## Disarchive + +Timothy Sample (the author of Disarchive) recently sent a code dump to integrate Disarchive into Software Heritage (at https://git.ngyro.com/swh/ ). + +It looks great, but still needs to be reviewed and merged. +We are hoping to merge and deploy the changes before the end of the year. + +# The End + +- [Software Heritage website](https://www.softwareheritage.org) +- [SWH's Archive](https://archive.softwareheritage.org) +- [Development forge](https://forge.softwareheritage.org) diff --git a/talks-public/2022-09-16-Guix/Makefile b/talks-public/2022-09-16-Guix/Makefile new file mode 100644 index 0000000..68fbee7 --- /dev/null +++ b/talks-public/2022-09-16-Guix/Makefile @@ -0,0 +1 @@ +include ../Makefile.slides