diff --git a/common/images/coverage/Makefile b/common/images/coverage/Makefile
new file mode 100644
index 0000000..215b0a7
--- /dev/null
+++ b/common/images/coverage/Makefile
@@ -0,0 +1,22 @@
+# Convert every SVG logo in this directory to a PDF with Inkscape.
+#
+# Usage: `make` (or `make all`) regenerates each <name>.pdf whose <name>.svg
+# is newer than it.
+
+# Remove a partially written PDF if its recipe fails, so that a broken
+# file is never mistaken for an up-to-date target.
+.DELETE_ON_ERROR:
+
+# `:=` expands the glob once at parse time instead of on every reference.
+SVGs := $(wildcard *.svg)
+PDFs := $(patsubst %.svg,%.pdf,$(SVGs))
+
+# `all` is a command name, not a file; declare it phony so that a stray file
+# called "all" cannot mask it.
+.PHONY: all
+all: $(PDFs)
+
+# NOTE(review): --export-pdf is the Inkscape 0.9x spelling; Inkscape 1.x
+# replaced it with --export-filename. Confirm which version is targeted.
+%.pdf: %.svg
+	inkscape --export-area-drawing --export-pdf $@ $<
diff --git a/common/images/coverage/debian.pdf b/common/images/coverage/debian.pdf
new file mode 100644
index 0000000..d9e782c
Binary files /dev/null and b/common/images/coverage/debian.pdf differ
diff --git a/common/images/coverage/debian.svg b/common/images/coverage/debian.svg
new file mode 100644
index 0000000..47626c8
--- /dev/null
+++ b/common/images/coverage/debian.svg
@@ -0,0 +1,152 @@
+
+
diff --git a/common/images/coverage/github.pdf b/common/images/coverage/github.pdf
new file mode 100644
index 0000000..24bb3e8
Binary files /dev/null and b/common/images/coverage/github.pdf differ
diff --git a/common/images/coverage/github.png b/common/images/coverage/github.png
new file mode 100644
index 0000000..d14e49f
Binary files /dev/null and b/common/images/coverage/github.png differ
diff --git a/common/images/coverage/github.svg b/common/images/coverage/github.svg
new file mode 100644
index 0000000..68fd205
--- /dev/null
+++ b/common/images/coverage/github.svg
@@ -0,0 +1,31 @@
+
+
+
+
diff --git a/common/images/coverage/gitlab.pdf b/common/images/coverage/gitlab.pdf
new file mode 100644
index 0000000..dcf0f8d
Binary files /dev/null and b/common/images/coverage/gitlab.pdf differ
diff --git a/common/images/coverage/gitlab.svg b/common/images/coverage/gitlab.svg
new file mode 100644
index 0000000..7450597
--- /dev/null
+++ b/common/images/coverage/gitlab.svg
@@ -0,0 +1,32 @@
+
+
\ No newline at end of file
diff --git a/common/images/coverage/gitorious.png b/common/images/coverage/gitorious.png
new file mode 100644
index 0000000..a45ba81
Binary files /dev/null and b/common/images/coverage/gitorious.png differ
diff --git a/common/images/coverage/gnu.png b/common/images/coverage/gnu.png
new file mode 100644
index 0000000..60917ad
Binary files /dev/null and b/common/images/coverage/gnu.png differ
diff --git a/common/images/coverage/googlecode.png b/common/images/coverage/googlecode.png
new file mode 100644
index 0000000..23140d7
Binary files /dev/null and b/common/images/coverage/googlecode.png differ
diff --git a/common/images/coverage/hal.png b/common/images/coverage/hal.png
new file mode 100644
index 0000000..7803f0e
Binary files /dev/null and b/common/images/coverage/hal.png differ
diff --git a/common/images/coverage/inria.png b/common/images/coverage/inria.png
new file mode 100644
index 0000000..02fdf5a
Binary files /dev/null and b/common/images/coverage/inria.png differ
diff --git a/common/images/coverage/pypi.pdf b/common/images/coverage/pypi.pdf
new file mode 100644
index 0000000..98daf12
Binary files /dev/null and b/common/images/coverage/pypi.pdf differ
diff --git a/common/images/coverage/pypi.svg b/common/images/coverage/pypi.svg
new file mode 100644
index 0000000..e53853c
--- /dev/null
+++ b/common/images/coverage/pypi.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/common/modules/status-extended.org b/common/modules/status-extended.org
index 26f0446..387980c 100644
--- a/common/modules/status-extended.org
+++ b/common/modules/status-extended.org
@@ -1,433 +1,437 @@
#+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt)
#+INCLUDE: "prelude.org" :minlevel 1
# not to be included as a whole, just pick individual slides as you see fit
* Status
:PROPERTIES:
:CUSTOM_ID: main
:END:
** The people
:PROPERTIES:
:CUSTOM_ID: people
:END:
*** The core team :B_picblock:
:PROPERTIES:
:CUSTOM_ID: core-team-formal
:BEAMER_env: picblock
:BEAMER_opt: pic=team,width=.4\linewidth
:END:
- Roberto Di Cosmo
- Stefano Zacchiroli
- Nicolas Dandrimont (Engineer)
- Antoine Dumont (Engineer)
# - and /Jordi, Quentin and Guillaume/
*** Scientific advisors
- Serge Abiteboul (French Science Academy)
- Jean-François Abramatic (former W3C director)
- Gerard Berry (CNRS Gold Medal, French Science Academy)
- Julia Lawall (Coccinelle, Linux Kernel, Outreachy)
** Archive coverage --- archive.softwareheritage.org
:PROPERTIES:
:CUSTOM_ID: archive
:END:
- #+BEAMER: \vspace{-2mm}
+ #+BEAMER: \vspace{-1mm}
#+BEAMER: \begin{center}\includegraphics[width=\extblockscale{1.1\linewidth}]{2018-10-archive-growth.png}\end{center}
#+BEAMER: \vspace{-2mm}
-*** Current sources
- - live: GitHub, Debian, GitLab.com, PyPI
- - one-off: Gitorious, Google Code, GNU
- - WIP: Bitbucket
- #+BEAMER: \pause
***
- 200 TB (compressed) blobs, 6 TB database (as a graph: 10 B nodes + 100 B edges)
+ #+BEAMER: \includegraphics[width=0.19\linewidth]{coverage/github} \hfill
+ #+BEAMER: \includegraphics[width=0.2\linewidth]{coverage/debian} \hfill
+ #+BEAMER: \includegraphics[width=0.2\linewidth]{coverage/gitlab} \hfill
+ #+BEAMER: \includegraphics[width=0.2\linewidth]{coverage/googlecode} \\
+ #+BEAMER: \includegraphics[width=0.2\linewidth]{coverage/gitorious} \hfill
+ #+BEAMER: \includegraphics[width=0.15\linewidth]{coverage/gnu} \hfill
+ #+BEAMER: \includegraphics[width=0.13\linewidth]{coverage/hal} \hfill
+ #+BEAMER: \includegraphics[width=0.16\linewidth]{coverage/inria} \hfill
+ #+BEAMER: \includegraphics[width=0.13\linewidth]{coverage/pypi}
#+BEAMER: \pause
***
- \hfill The /richest/ public source code archive, ... and growing daily!
+ - 200 TB (compressed) blobs, 6 TB database (as a graph: 10 B nodes + 100 B edges)
+ - The /richest/ public source code archive, ... and growing daily!
** The structure of the archive :noexport:
*** On-disk storage
- flat file storage for contents
- postgres database for the metadata
*** Data model: /one/ big Merkle DAG, inspired by the git model
- Origins (= repositories)
- Occurrences (= branches)
- Releases (= tags)
- Revisions (= commits)
- Directories (= trees)
- Contents (= blobs)
** Archiving goals
:PROPERTIES:
:CUSTOM_ID: archivinggoals
:END:
Targets: VCS repositories & source code releases (e.g., tarballs)
*** We DO archive
- file *content* (= blobs)
- *revisions* (= commits), with full metadata
- *releases* (= tags), ditto
- where (*origin*) & when (*visit*) we found any of the above
# - time-indexed repo *snapshots* (i.e., we never delete anything)
… in a VCS-/archive-agnostic *canonical data model*
*** We DON'T archive
# - diffs → derived data from related contents
- homepages, wikis
- BTS/issues/code reviews/etc.
- mailing lists
Long term vision: play our part in a /"semantic wikipedia of software"/
** Architecture
:PROPERTIES:
:CUSTOM_ID: architecture
:END:
*** Data flow
:PROPERTIES:
:CUSTOM_ID: dataflow
:END:
#
#+BEAMER: \begin{center}\includegraphics[width=\extblockscale{1.2\textwidth}]{swh-dataflow.pdf}\end{center}
** Data model :noexport:
*** General schema
- VCS-independent
- fully deduplicated
+ files, directories and commits are /shared/
- biggest git-like /graph/ in the world
***
\begin{center}
\url{http://deb.li/swhdm}
\end{center}
*** full hash index (sha1, sha256, ...)
Some funny facts:
- the GPL2 licence appears under more than 500 names
+ including /aa.css.txt/ and /FullSync.txt/ ~ :-)
** Merkle DAG
*** Merkle structure
:PROPERTIES:
:CUSTOM_ID: merkle
:END:
**** Merkle trees
:PROPERTIES:
:CUSTOM_ID: merkletree
:END:
# R. C. Merkle, A digital signature based on a conventional encryption
# function, Crypto '87
#+BEAMER: \vspace{-3mm}
***** Merkle tree (R. C. Merkle, Crypto 1979) :B_picblock:
:PROPERTIES:
:BEAMER_opt: pic=merkle, leftpic=true, width=.7\linewidth
:BEAMER_env: picblock
:BEAMER_act:
:END:
Combination of
- tree
- hash function
#+BEAMER: \pause
#+BEAMER: \footnotesize
***** Classical cryptographic construction
- fast, parallel signature of large data structures
- widely used (e.g., Git, blockchains, IPFS, ...)
- built-in deduplication
**** The archive in a few pictures
:PROPERTIES:
:CUSTOM_ID: merkledemo
:END:
***** A giant (extended) Merkle DAG
#+LATEX: \only<1>{\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/merkle_1.pdf}}}
#+LATEX: \only<2>{\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/contents.pdf}}}
#+LATEX: \only<3>{\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/merkle_2_contents.pdf}}}
#+LATEX: \only<4>{\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/directories.pdf}}}
#+LATEX: \only<5>{\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/merkle_3_directories.pdf}}}
#+LATEX: \only<6>{\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/revisions.pdf}}}
#+LATEX: \only<7>{\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/merkle_4_revisions.pdf}}}
#+LATEX: \only<8>{\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/releases.pdf}}}
#+LATEX: \only<9>{\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/merkle_5_releases.pdf}}}
# #+LATEX: {\colorbox{white}{\includegraphics[width=\extblockscale{.9\linewidth}]{git-merkle/merkle_1.pdf}}}
*** A revision node
:PROPERTIES:
:CUSTOM_ID: merklerevision
:END:
**** Example: a Software Heritage revision
*****
#+BEAMER: \vspace{-.5cm}\centering\includegraphics[width=0.9\textwidth]{git-merkle/revisions}
*****
Note: most object kinds currently have Git-compatible identifiers
*** Giant DAG
:PROPERTIES:
:CUSTOM_ID: giantdag
:END:
**** The archive: a (giant) Merkle DAG
# Using an empty frame because the image is difficult to read on swh bg.
# Finding a way to override image bg for just this frame would be better.
*****
#+BEAMER: \centering \includegraphics[width=\extblockscale{\textwidth}]{git-merkle/merkle_5_releases}
*** Giant DAG (single slide)
:PROPERTIES:
:CUSTOM_ID: giantdag1slide
:END:
**** The Software Heritage archive: a gigantic Merkle DAG
#+LATEX: \centering\forcebeamerstart{}
#+LATEX: \only<1>{\colorbox{white}{\includegraphics[width=.75\linewidth]{git-merkle/merkle_1}}}
#+LATEX: \only<2>{\colorbox{white}{\includegraphics[width=.75\linewidth]{git-merkle/contents}}}
#+LATEX: \only<3>{\colorbox{white}{\includegraphics[width=.75\linewidth]{git-merkle/merkle_2_contents}}}
#+LATEX: \only<4>{\colorbox{white}{\includegraphics[width=.75\linewidth]{git-merkle/directories}}}
#+LATEX: \only<5>{\colorbox{white}{\includegraphics[width=.75\linewidth]{git-merkle/merkle_3_directories}}}
#+LATEX: \only<6>{\colorbox{white}{\includegraphics[width=.75\linewidth]{git-merkle/revisions}}}
#+LATEX: \only<7>{\colorbox{white}{\includegraphics[width=.75\linewidth]{git-merkle/merkle_4_revisions}}}
#+LATEX: \only<8>{\colorbox{white}{\includegraphics[width=.75\linewidth]{git-merkle/releases}}}
#+LATEX: \only<9>{\colorbox{white}{\includegraphics[width=.75\linewidth]{git-merkle/merkle_5_releases}}}
#+LATEX: \forcebeamerend{}
*** Giant DAG (detailed)
:PROPERTIES:
:CUSTOM_ID: dagdetail
:END:
**** The archive: a (giant) Merkle DAG
#+BEAMER: \vspace{-3mm}
#+BEAMER: \centering \includegraphics[width=\textwidth]{swh-merkle-dag-wide}
** Technology :noexport:
:PROPERTIES:
:CUSTOM_ID: technology
:END:
*** Software stack
**** 3rd party
- Debian, Puppet, Ceph
- PostgreSQL for metadata storage, with barman & pglogical
- Celery (RabbitMQ backend) for task scheduling
- Python3 and psycopg2 for the backend
- Django, Bootstrap, D3.js for Web stuff
**** in house
- /ad hoc/ object storage (to avoid imposing tech to mirrors)
- data model implementation, listers, loaders, scheduler
- ~60 Git repositories (~20 Python packages, ~30 Puppet modules)
- ~30 kSLOC Python / ~12 kSLOC SQL / ~4 kSLOC Puppet
- licence choice: GPLv3 (backend) / AGPLv3 (frontend)
*** Hardware stack
**** in house
- 2x hypervisors with ~20 VMs
- 2x high density storage array (60 * 6TB => 300TB usable each)
- Prototype: ceph storage cluster for blobs
**** on Azure
- full object storage mirror
- full mirror of the database containing the graph
- workers for content indexing
- workers for download bundle preparation
**** at the University of Bologna
- backend storage (60TB) for the bundles available for download
*** Software architecture :noexport:
**** Module dependencies (internal + external) :B_picblock:
:PROPERTIES:
:BEAMER_env: picblock
:BEAMER_opt: pic=swh-modules-deps-all,width=\linewidth
:END:
****
let's zoom in: http://deb.li/swhdeps
** Technology :noexport:
:PROPERTIES:
:CUSTOM_ID: technology-short
:END:
*** Deployment and resource usage
**** Software
- around 30k SLOC of custom Python code, running on Debian Stable
- PostgreSQL database for the metadata storage
**** Hardware
- 3 hypervisors with mass storage and a backup server at Inria
- Work in progress: in-house Ceph deployment for object storage
- In-kind sponsorship of cloud and storage resources (Microsoft, University of Bologna)
** Software development :noexport:
:PROPERTIES:
:CUSTOM_ID: development
:END:
*** Software development
**** classic FOSS development
- language: English
- development mailing list
#+BEAMER: \\{\small \url{https://sympa.inria.fr/sympa/info/swh-devel}}
- IRC
#+BEAMER: \\
#swh-devel / FreeNode
- Forge
#+BEAMER: \\{\small \url{https://forge.softwareheritage.org}}
- Git, tasks, code review, etc.
**** for more information
#+BEAMER: \scriptsize
https://www.softwareheritage.org/community/developers/
** Roadmap
:PROPERTIES:
:CUSTOM_ID: features
:END:
*** Features...
- (done) *lookup* by content hash
- (done) *browsing*: "wayback machine" for source code (API + UI)
- (early access) *deposit* of source code bundles directly to the archive
- (early access) *save code now*, on-demand archive
- (done) *download*: =wget= / =git clone= from the archive
- (todo) *provenance* lookup for all archived content
- (todo) *full-text search* on all archived source code files
#+BEAMER: \pause
*** ... and much more than one could possibly imagine
all the world's software development history at hand's reach!
** Web API :noexport:
:PROPERTIES:
:CUSTOM_ID: api
:END:
*** Web API
:PROPERTIES:
:CUSTOM_ID: apiintro
:END:
****
RESTful API to programmatically access the Software Heritage archive \\
*\url{https://archive.softwareheritage.org/api/}*
**** Features
- pointwise *browsing* of the archive
- … snapshots → revisions → directories → contents …
- full access to the *metadata* of archived objects
- *crawling* information
- /when have you last visited this Git repository I care about?/
- /where were its branches/tags pointing to at the time?/
# - derived information about archived contents (WIP)
# - MIME type, programming language, license, etc.
**** Endpoint index
\url{https://archive.softwareheritage.org/api/1/}
*** A tour of the Web API --- origins & visits
:PROPERTIES:
:CUSTOM_ID: apitourvisits
:END:
#+BEAMER: \footnotesize
#+BEGIN_SRC
GET https://archive.softwareheritage.org/api/1/origin/ \
git/url/https://github.com/hylang/hy
{ "id": 1,
"origin_visits_url": "/api/1/origin/1/visits/",
"type": "git",
"url": "https://github.com/hylang/hy"
}
#+END_SRC
#+BEAMER: \vfill
#+BEGIN_SRC
GET https://archive.softwareheritage.org/api/1/origin/ \
1/visits/
[ ...,
{ "date": "2016-09-14T11:04:26.769266+00:00",
"origin": 1,
"origin_visit_url": "/api/1/origin/1/visit/13/",
"status": "full",
"visit": 13
}, ...
]
#+END_SRC
*** A tour of the Web API --- snapshots
:PROPERTIES:
:CUSTOM_ID: apitoursnapshots
:END:
#+BEAMER: \footnotesize
#+BEGIN_SRC
GET https://archive.softwareheritage.org/api/1/origin/ \
1/visit/13/
{ ...,
"occurrences": { ...,
"refs/heads/master": {
"target": "b94211251...",
"target_type": "revision",
"target_url": "/api/1/revision/b94211251.../"
},
"refs/tags/0.10.0": {
"target": "7045404f3...",
"target_type": "release",
"target_url": "/api/1/release/7045404f3.../"
}, ...
},
"origin": 1,
"origin_url": "/api/1/origin/1/",
"status": "full",
"visit": 13
}
#+END_SRC
*** A tour of the Web API --- releases :noexport:
:PROPERTIES:
:CUSTOM_ID: apitourreleases
:END:
#+BEAMER: \footnotesize
#+BEGIN_SRC
GET https://archive.softwareheritage.org/api/1/release/ \
7045404f3d1c54e6473c71bbb716529fbad4be24/
{
"author": {
"email": "tag@pault.ag",
"fullname": "Paul Tagliamonte ",
"id": 96,
"name": "Paul Tagliamonte"
},
"date": "2014-04-10T23:01:28-04:00",
"message": "0.10: The Oh f*ck it's PyCon release",
"name": "0.10.0",
"synthetic": false,
"target": "6072557b6...",
"target_type": "revision",
"target_url": "/api/1/revision/6072557b6.../",
...
}
#+END_SRC
*** A tour of the Web API --- revisions
:PROPERTIES:
:CUSTOM_ID: apitourrevisions
:END:
#+BEAMER: \footnotesize
#+BEGIN_SRC
GET https://archive.softwareheritage.org/api/1/revision/ \
6072557b6c10cd9a21145781e26ad1f978ed14b9/
{
"author": {
"email": "tag@pault.ag",
"fullname": "Paul Tagliamonte ",
"id": 96,
"name": "Paul Tagliamonte"
},
"committer": { ... },
"date": "2014-04-10T23:01:11-04:00",
"committer_date": "2014-04-10T23:01:11-04:00",
"directory": "2df4cd84e...",
"directory_url": "/api/1/directory/2df4cd84e.../",
"history_url": "/api/1/revision/6072557b6.../log/",
"merge": false,
"message": "0.10: The Oh f*ck it's PyCon release",
"parents": [ {
"id": "10149f66e...",
"url": "/api/1/revision/10149f66e.../"
} ],
...
}
#+END_SRC
*** A tour of the Web API --- contents
:PROPERTIES:
:CUSTOM_ID: apitourcontents
:END:
#+BEAMER: \footnotesize
#+BEGIN_SRC
GET https://archive.softwareheritage.org/api/1/content/ \
adc83b19e793491b1c6ea0fd8b46cd9f32e592fc/
{
"data_url": "/api/1/content/sha1:adc83b19e.../raw/",
"filetype_url": "/api/1/content/sha1:.../filetype/",
"language_url": "/api/1/content/sha1:.../language/",
"length": 1,
"license_url": "/api/1/content/sha1:.../license/",
"sha1": "adc83b19e...",
"sha1_git": "8b1378917...",
"sha256": "01ba4719c...",
"status": "visible"
}
#+END_SRC
#+BEAMER: \normalsize \vfill \pause
**** Caveats
- rate limits apply throughout the API
- raw download available for textual contents
** Some technical challenges
:PROPERTIES:
:CUSTOM_ID: techchallenges
:END:
*** Expanding the archive
- discover and classify /all/ the software sources
- importers for other VCSs (SVN, Hg, ...)
\hfill /We need your help!/
*** Staying current
get new repositories and commits ASAP\\
\hfill /We need reliable, standardised event feeds./
*** Handling the backlog
ingesting all the pre-existing data\\
\hfill /Decades of software development are waiting!/