diff --git a/common/images/compression/compression-steps.dot b/common/images/compression/compression-steps.dot
new file mode 100644
index 0000000..dd0d48e
--- /dev/null
+++ b/common/images/compression/compression-steps.dot
@@ -0,0 +1,28 @@
+digraph "Compression steps" {
+    // Horizontal graph
+    rankdir=LR;
+
+    mph [label="MPH", shape=box];                  // MPH = minimal perfect hash
+
+    bv_compress [label="BV compress", shape=box];  // BV = Boldi-Vigna (WebGraph)
+
+    bfs [label="BFS", shape=box];
+
+    permute [label="Permute", shape=box];          // renumber nodes in BFS order
+    comp_fwd [label="Compressed\ngraph\n(forward)", shape=none];
+    comp_bwd [label="Compressed\ngraph\n(backward)", shape=none];
+
+    transpose [label="Transpose", shape=box];
+
+    merkle_dag [label="Merkle\nDAG", shape=none];
+
+    merkle_dag -> mph;
+    merkle_dag -> bv_compress;
+    mph -> bv_compress;
+    bv_compress -> bfs;
+    bv_compress -> permute;
+    bfs -> permute;
+    permute -> comp_fwd;
+    comp_fwd -> transpose;
+    transpose -> comp_bwd;
+}
diff --git a/common/images/compression/compression-steps.pdf b/common/images/compression/compression-steps.pdf
new file mode 100644
index 0000000..7b1ee32
Binary files /dev/null and b/common/images/compression/compression-steps.pdf differ
diff --git a/talks-public/2019-11-14-zurich-uzh/2019-11-14-zurich-uzh.org b/talks-public/2019-11-14-zurich-uzh/2019-11-14-zurich-uzh.org
new file mode 100644
index 0000000..2c1943f
--- /dev/null
+++ b/talks-public/2019-11-14-zurich-uzh/2019-11-14-zurich-uzh.org
@@ -0,0 +1,145 @@
+#+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt)
+#+TITLE: Software Heritage
+#+SUBTITLE: Source Code Archival and Analysis at the Scale of the World
+#+BEAMER_HEADER: \date[14 Nov 2019, UZH]{14 November 2019\\University of Zurich --- Zurich, Switzerland}
+#+AUTHOR: Stefano Zacchiroli
+#+DATE: 14 November 2019
+#+EMAIL: zack@upsilon.cc
+
+#+INCLUDE: "../../common/modules/prelude-toc.org" :minlevel 1
+#+INCLUDE: "../../common/modules/169.org"
+#+BEAMER_HEADER: \institute[UPD \& Inria]{Univ. Paris Diderot \& Inria --- {\tt zack@upsilon.cc, @zacchiro}}
+#+BEAMER_HEADER: \author{Stefano Zacchiroli}
+
+* Source code in science
+** Software source code: a pillar of Open Science
+*** Software is everywhere in modern research :B_picblock:
+    :PROPERTIES:
+    :BEAMER_opt: pic=papermountain, leftpic=true, width=.3\linewidth
+    :BEAMER_env: picblock
+    :BEAMER_COL: .6
+    :END:
+#+BEGIN_QUOTE
+[...] software [...] essential in their fields.
+
+\mbox{}\hfill Top 100 papers (Nature, 2014)
+#+END_QUOTE
+#+BEGIN_QUOTE
+Sometimes, if you don't have the software, you don't have the data.
+
+\mbox{}\hfill Christine Borgman, Paris, 2018
+#+END_QUOTE
+# http://www.nature.com/news/the-top-100-papers-1.16224
+#+BEAMER: \pause
+*** Open Science: three pillars :B_block:
+    :PROPERTIES:
+    :BEAMER_COL: .45
+    :BEAMER_env: block
+    :END:
+#+latex: \begin{center}
+#+ATTR_LATEX: :width \extblockscale{\linewidth}
+file:PreservationTriangle.png
+#+latex: \end{center}
+#+BEAMER: \pause
+*** :B_ignoreheading:
+    :PROPERTIES:
+    :BEAMER_env: ignoreheading
+    :END:
+*** Nota bene
+    \hfill The links in the picture are *essential*
+** Source code is /special/
+#+INCLUDE: "../../common/modules/source-code-different-short.org::#softwareisdifferent" :only-contents t :minlevel 3
+** The state of the art is not ideal
+#+INCLUDE: "../../common/modules/reprod-bad-sota.org::#collbergmethod" :only-contents t :minlevel 3
+** The state of the art is not ideal... (cont.)
+#+INCLUDE: "../../common/modules/reprod-bad-sota.org::#collbergfindings" :only-contents t :minlevel 3 +#+BEAMER: \pause +*** The main reason + \hfill source code (/or the right version of it/) cannot be found +** Motivations :noexport: +#+INCLUDE: "../../common/modules/swh-motivations-foss.org::#fragile" :minlevel 2 +#+INCLUDE: "../../common/modules/swh-motivations-foss.org::#spread" :minlevel 2 +#+INCLUDE: "../../common/modules/swh-motivations-foss.org::#research" :minlevel 2 + +* Software Heritage + #+INCLUDE: "../../common/modules/swh-overview-sourcecode.org::#mission" :minlevel 2 + #+INCLUDE: "../../common/modules/principles-short.org::#principles" :minlevel 2 + #+INCLUDE: "../../common/modules/status-extended.org::#archivinggoals" :minlevel 2 + #+INCLUDE: "../../common/modules/status-extended.org::#architecture" :minlevel 2 :only-contents t + #+INCLUDE: "../../common/modules/status-extended.org::#datamodel" :only-contents t + #+INCLUDE: "../../common/modules/status-extended.org::#dagdetailsmall" :only-contents t + #+INCLUDE: "../../common/modules/status-extended.org::#archive" :minlevel 2 + # #+INCLUDE: "../../common/modules/status-extended.org::#api" :only-contents t + # #+INCLUDE: "../../common/modules/status-extended.org::#apiintro" :minlevel 2 + # #+INCLUDE: "../../common/modules/vault.org::#overview" :minlevel 2 + # #+INCLUDE: "../../common/modules/webui.org::#intro" :minlevel 2 + +* Research challenges & roadmap + #+INCLUDE: "this/research-roadmap.org::#main" :only-contents t +* Conclusion +** MSR 2020 Mining Challenge = Software Heritage Graph Dataset + #+BEGIN_EXPORT latex + \includegraphics[width=\textwidth]{this/msr-2020} + #+END_EXPORT + + https://2020.msrconf.org/track/msr-2020-mining-challenge + +*** + #+BEGIN_EXPORT latex + \begin{thebibliography}{Foo Bar, 1969} + \bibitem{Pietri2019} Antoine Pietri, Diomidis Spinellis, Stefano + Zacchiroli\newblock The Software Heritage graph dataset: public software + development under one roof\newblock MSR 2019: Mining Software Repositories, + IEEE + \end{thebibliography} + #+END_EXPORT + +** We're hiring! (a postdoc) +*** Paris-based postdoc on software provenance + - large-scale, big data *graph analysis* + - tracking the *provenance of source code* artifacts + - … at the *scale of the world* (what else?) + - in the context of *industrial partnerships* on open source license + compliance + - supervision: Stefano Zacchiroli, Roberto Di Cosmo + +*** Learn more and apply + - https://www.softwareheritage.org/jobs/ + - ask me! zack@upsilon.cc + +** Wrapping up + #+latex: \vspace{-2mm} +*** + - Software Heritage archives all software source code with its development + history. + - It is a major endeavor that benefits society, science, and industry. + - For computer scientists, it is a gold mine of research opportunities. + Wanna join? 
+   #+latex: \vspace{-2mm}
+*** References
+    #+latex: \vspace{-1mm}
+    #+BEGIN_EXPORT latex
+    \begin{thebibliography}{Foo Bar, 1969}
+    \scriptsize
+
+    \bibitem{Pietri2019} Antoine Pietri, Diomidis Spinellis, Stefano Zacchiroli\newblock
+    The Software Heritage graph dataset: public software development under one roof\newblock
+    MSR 2019: Mining Software Repositories, IEEE
+
+    \bibitem{Abramatic2018} Jean-François Abramatic, Roberto Di Cosmo, Stefano Zacchiroli\newblock
+    Building the Universal Archive of Source Code\newblock
+    Communications of the ACM, October 2018
+
+    \bibitem{DiCosmo2017} Roberto Di Cosmo, Stefano Zacchiroli\newblock
+    Software Heritage: Why and How to Preserve Software Source Code\newblock
+    iPRES 2017: Intl. Conf. on Digital Preservation
+
+    \end{thebibliography}
+    #+END_EXPORT
+*** Contacts
+    Stefano Zacchiroli / zack@upsilon.cc / @zacchiro
+* Appendix :B_appendix:
+  :PROPERTIES:
+  :BEAMER_env: appendix
+  :END:
+  #+INCLUDE: "../../common/modules/dataset.org::#main" :only-contents t
diff --git a/talks-public/2019-11-14-zurich-uzh/Makefile b/talks-public/2019-11-14-zurich-uzh/Makefile
new file mode 100644
index 0000000..68fbee7
--- /dev/null
+++ b/talks-public/2019-11-14-zurich-uzh/Makefile
@@ -0,0 +1 @@
+include ../Makefile.slides
diff --git a/talks-public/2019-11-14-zurich-uzh/this/msr-2020.png b/talks-public/2019-11-14-zurich-uzh/this/msr-2020.png
new file mode 100644
index 0000000..0c87433
Binary files /dev/null and b/talks-public/2019-11-14-zurich-uzh/this/msr-2020.png differ
diff --git a/talks-public/2019-11-14-zurich-uzh/this/research-roadmap.org b/talks-public/2019-11-14-zurich-uzh/this/research-roadmap.org
new file mode 100644
index 0000000..10ddf2f
--- /dev/null
+++ b/talks-public/2019-11-14-zurich-uzh/this/research-roadmap.org
@@ -0,0 +1,273 @@
+#+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt)
+
+# A research roadmap to realize the "big code" analysis vision in practice
+
+#+INCLUDE: "prelude.org" :minlevel 1
+
+* Research roadmap
+  :PROPERTIES:
+  :CUSTOM_ID: main
+  :END:
+** Realizing the "large telescope of source code" in practice
+*** Requirements
+    - *Availability*: Software Heritage mirror, relatively up-to-date
+    - *Efficiency*: massive computing resources with fast access to the mirror
+    - *Sustainability*: pay-per-use or bring-your-own-computing
+*** Challenges
+    - mirroring
+    - compression
+    - efficient processing
+    - experiment description language
+    - big code analysis (i.e., ML on source code)
+*** :B_ignoreheading:
+    :PROPERTIES:
+    :BEAMER_env: ignoreheading
+    :END:
+    ... at the scale of the world!
+
+** Challenge #1: Mirroring
+*** Thomas Jefferson, February 18, 1791
+    #+latex: \begin{quote}
+    Let us save what remains: not by vaults and locks which fence them from
+    the public eye and use in consigning them to the waste of time, but by
+    such a *multiplication of copies*, as shall place them *beyond the reach
+    of accident*.
+    #+latex: \end{quote}
+    #+beamer: \pause
+*** Mirroring: the good
+    - big, but not /that/ big
+    - append-only archive (in theory), easy to journal and incrementally
+      update
+** Challenge #1: Mirroring (cont.)
+*** Mirroring: the bad --- Merkle DAGs with "holes"
+    - the world sucks: corrupted repositories, takedown notices, data losses
+    - nodes can go missing at archival time or disappear later on
+    - top-level hash(es) no longer capture the full state of the archive
+      (the toy sketch below illustrates why)
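+*** Merkle "holes", illustrated
+    A minimal sketch, with toy node naming and hashing rather than the actual
+    Software Heritage data model, of how a single missing node makes every
+    ancestor hash, up to and including the top-level one, unverifiable:
+    #+BEGIN_SRC python
+    import hashlib
+
+    def merkle_hash(node, children, store):
+        """Hash a node over its children; None marks an unverifiable hash."""
+        h = hashlib.sha1(node.encode())
+        for child in children.get(node, []):
+            if child not in store:       # hole: never archived, or lost later
+                return None
+            child_hash = merkle_hash(child, children, store)
+            if child_hash is None:       # holes propagate to all ancestors
+                return None
+            h.update(child_hash)
+        return h.digest()
+
+    children = {"root": ["dir1", "dir2"], "dir1": ["file1"]}
+    store = {"root", "dir1", "dir2"}     # "file1" is a hole
+    print(merkle_hash("root", children, store))  # None: state not captured
+    #+END_SRC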
+*** Open questions
+    - how do you capture that full state, then?
+    - by extension: how do you /timestamp/ the archive?
+    - how do you efficiently check if something is to be re-archived?
+    - ultimately, what's your notion of having "fully archived" something?
+
+** Challenge #2: Compression --- file contents
+*** Storage figures
+    - file contents: ~400 TB (raw), ~200 TB compressed (1-by-1 content
+      compression)
+    - median compressed size: 3 KB → *a lot (~6 B) of very small files*
+*** :B_ignoreheading:
+    :PROPERTIES:
+    :BEAMER_env: ignoreheading
+    :END:
+    Practical problem: scale-out object storages are not designed for this
+    workload.
+    #+beamer: \pause
+*** Content compression
+    - low compression ratio (2x) with 1-by-1 compression
+    - typical Git/VCS packing heuristics do not work here, because contents
+      occur in many different contexts
+    - early experiments with Rabin-style compression & co. were unsatisfactory
+    - increased deduplication granularity (e.g., SLOCs) will likely suffer
+      from the same problem
+    - research lead: *object packing*, using heuristics that maximize the
+      chances of compressibility (e.g., having /a/ file name in common)
+
+** Challenge #2: Compression --- graph
+*** Storage figures
+    - Merkle DAG: ~20 B nodes, ~280 B edges
+    - breakdown by node type: ~40% contents, ~40% directories, ~10% commits
+*** :B_ignoreheading:
+    :PROPERTIES:
+    :BEAMER_env: ignoreheading
+    :END:
+    Still outside the limits of what (cheaply & trivially) fits in RAM.
+    #+beamer: \pause
+*** Graph compression
+    - related: Web graph compression techniques in the style of, e.g.:
+      #+BEGIN_EXPORT latex
+      \begin{thebibliography}{Foo Bar, 1969}
+      \bibitem{Vigna2004} Paolo Boldi, Sebastiano Vigna
+      \newblock The WebGraph framework I: compression techniques
+      \newblock WWW 2004
+      \end{thebibliography}
+      #+END_EXPORT
+    - challenges: no canonical identifiers with good locality properties,
+      different node types/subgraphs, live updates
+    - research lead: graph topology characterization
+    (the toy sketch on the next slide illustrates the gap-encoding idea
+    these techniques build on)
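+** Graph compression --- the core encoding idea, sketched
+   A toy sketch of the gap + variable-length integer encoding behind
+   WebGraph-style successor lists (illustrative only, not the actual library
+   format): node ids with good locality, which is what the BFS and Permute
+   steps of the pipeline aim for, yield small gaps and thus few bits per edge.
+   #+BEGIN_SRC python
+   def vbyte(n):
+       """Variable-length byte encoding of a non-negative integer."""
+       out = bytearray()
+       while True:
+           out.append((n & 0x7F) | (0x80 if n > 0x7F else 0))
+           n >>= 7
+           if n == 0:
+               return bytes(out)
+
+   def encode_successors(successors):
+       """Encode a successor list as: length, first id, then gaps."""
+       succ = sorted(successors)
+       out = bytearray(vbyte(len(succ))) + vbyte(succ[0])
+       for prev, s in zip(succ, succ[1:]):
+           out += vbyte(s - prev - 1)   # strictly increasing => gap - 1
+       return bytes(out)
+
+   # 4 edges in 6 bytes here; the real WebGraph format adds reference
+   # copying, intervals, etc., to go well below one byte per edge
+   print(len(encode_successors([1001, 1002, 1005, 1040])))
+   #+END_SRC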
+** Graph compression --- preliminary results
+   #+BEGIN_EXPORT latex
+   \vspace{-5mm}
+   \includegraphics[width=\linewidth]{../../common/images/compression/compression-steps}
+   #+END_EXPORT
+   #+BEAMER: \vspace{-1cm}
+*** Dataset
+    - archive snapshot 2018-09-25
+    - size: 12 B nodes, 165 B edges
+    #+BEAMER: \pause
+*** Compression efficiency (space)
+    #+BEAMER: \vspace{-3mm}\begin{columns}\begin{column}{0.45\textwidth}
+    | *forward graph*   |        |
+    |-------------------+--------|
+    | total size        | 91 GiB |
+    | bits per edge     | 4.91   |
+    | compression ratio | 15.8%  |
+    #+BEAMER: \end{column}\begin{column}{0.45\textwidth}
+    | *backward graph*  |        |
+    |-------------------+--------|
+    | total size        | 83 GiB |
+    | bits per edge     | 4.49   |
+    | compression ratio | 14.4%  |
+    #+BEAMER: \end{column}\end{columns}
+***
+    :PROPERTIES:
+    :BEAMER_env: ignoreheading
+    :END:
+    The structure of a full bidirectional archive graph fits in less than
+    200 GiB of RAM, for a hardware cost of ~300 USD.
+** Graph compression --- preliminary results (cont.)
+*** Analysis efficiency (time) --- Full BFS visit
+    #+BEAMER: \begin{columns}\begin{column}{0.45\textwidth}
+    | *forward graph* |                |
+    |-----------------+----------------|
+    | wall time       | 1h48m          |
+    | throughput      | 1.81 M nodes/s |
+    |                 | (553 ns/node)  |
+    #+BEAMER: \end{column}\begin{column}{0.45\textwidth}
+    | *backward graph* |                |
+    |------------------+----------------|
+    | wall time        | 3h17m          |
+    | throughput       | 988 K nodes/s  |
+    |                  | (1.01 µs/node) |
+    #+BEAMER: \end{column}\end{columns}
+*** Analysis efficiency (time) --- Edge lookup
+    random sample: 1 B nodes (8.3% of the entire graph)
+    #+BEAMER: \begin{columns}\begin{column}{0.45\textwidth}
+    | *forward graph* |                |
+    |-----------------+----------------|
+    | visited edges   | 13.6 B         |
+    | throughput      | 12.0 M edges/s |
+    |                 | (83 ns/edge)   |
+    #+BEAMER: \end{column}\begin{column}{0.45\textwidth}
+    | *backward graph* |                |
+    |------------------+----------------|
+    | visited edges    | 13.6 B         |
+    | throughput       | 9.45 M edges/s |
+    |                  | (106 ns/edge)  |
+    #+BEAMER: \end{column}\end{columns}
+***
+    :PROPERTIES:
+    :BEAMER_env: ignoreheading
+    :END:
+    Note how edge lookup time is close to DRAM random access time (50-60 ns).
+** Challenge #3: Efficient processing --- graph
+   Big graphs call for scale-out processing systems, e.g.:
+   #+BEGIN_EXPORT latex
+   \begin{thebibliography}{Foo Bar, 1969} \footnotesize
+   \bibitem{Pregel} Malewicz, Grzegorz, et al.
+   \newblock Pregel: a system for large-scale graph processing
+   \newblock ACM SIGMOD 2010
+   \bibitem{Chaos} Roy, Amitabha, et al.
+   \newblock Chaos: Scale-out graph processing from secondary storage
+   \newblock ACM SOSP 2015
+   \end{thebibliography}
+   #+END_EXPORT
+   which support very large graphs by exploiting topological characteristics
+   (e.g., small world)
+*** \vspace :B_ignoreheading:
+    :PROPERTIES:
+    :BEAMER_env: ignoreheading
+    :END:
+    #+beamer: \vspace{-1mm}
+*** Software Heritage graph characteristics
+    - scale-free, but not small world (?)
+    - connected component size distribution still unclear
+*** \vspace :B_ignoreheading:
+    :PROPERTIES:
+    :BEAMER_env: ignoreheading
+    :END:
+    #+beamer: \vspace{-1mm}
+*** Approach: topological characterization
+    - if amenable to scale-out → distribution
+    - if not → scale-up using graph compression
+
+** Challenge #3: Efficient processing --- file contents
+*** Code search: a natural need
+    Find code snippets for reuse, find reverse dependencies for maintenance,
+    impact analysis, etc.
+*** Approaches vary with their level of code "understanding"
+    - full-text search: treat source code as text
+    - symbol extraction (e.g., ctags)
+    - AST search (language-specific)
+** Challenge #3: Efficient processing --- file contents (cont.)
+   #+BEGIN_EXPORT latex
+   \vspace{-2mm}
+   \begin{thebibliography}{Foo Bar, 1969} \footnotesize
+   \bibitem{CodeSearch} Russ Cox
+   \newblock Regular Expression Matching with a Trigram Index or How Google Code Search Worked
+   \newblock 2012, https://swtch.com/~rsc/regexp/regexp4.html
+   \end{thebibliography}
+   \vspace{-1mm}
+   #+END_EXPORT
+*** Use regexes, Luke!
+    - build an inverted index of trigrams for all source code files
+    - compile each regex into a trigram query, to find /potential/ matches
+    - =grep= each potential match using map-reduce (a toy sketch follows)
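+*** Trigram search, sketched
+    A toy sketch of the trigram-index idea, simplified from Russ Cox's
+    write-up; the file names and contents are made up, and this is not
+    Debsources' actual implementation:
+    #+BEGIN_SRC python
+    from collections import defaultdict
+
+    def trigrams(text):
+        return {text[i:i + 3] for i in range(len(text) - 2)}
+
+    # 1. index: map each trigram to the set of files containing it
+    files = {"a.c": "int main(void)", "b.py": "def main():"}
+    index = defaultdict(set)
+    for name, content in files.items():
+        for t in trigrams(content):
+            index[t].add(name)
+
+    # 2. query: any match for the literal "main(" must contain all of its
+    #    trigrams; intersecting posting lists yields *candidates* only
+    query = "main("
+    candidates = set.intersection(*(index[t] for t in trigrams(query)))
+
+    # 3. verify: run the real match on each candidate (a substring test
+    #    here; grep fanned out via map-reduce at archive scale)
+    print(sorted(f for f in candidates if query in files[f]))
+    #+END_SRC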
+*** \vspace :B_ignoreheading:
+    :PROPERTIES:
+    :BEAMER_env: ignoreheading
+    :END:
+    #+beamer: \vspace{-1mm}
+*** Applications
+    - Google Code Search (now gone)
+    - Debsources (https://sources.debian.org)
+    - scales up to 1 B SLOCs (Debian development branch)
+*** \vspace :B_ignoreheading:
+    :PROPERTIES:
+    :BEAMER_env: ignoreheading
+    :END:
+    #+beamer: \vspace{-1mm}
+*** Challenge
+    Scale it up to Software Heritage, several orders of magnitude later.
+
+** Challenge #4: Experiment definition
+*** Problem
+    How would researchers define their experiments on the dataset?
+    - corpus selection (code? history? sampling? etc.)
+    - how will they run their tools in a fully-deduplicated world?
+    - job scheduling (a well-known HPC problem)
+*** Related work (at a smaller scale)
+    #+BEGIN_EXPORT latex
+    \vspace{-2mm}
+    \begin{thebibliography}{Foo Bar, 1969}
+    \bibitem{Dyer2015} Robert Dyer et al.
+    \newblock Boa: Ultra-large-scale software repository and source-code mining
+    \newblock ACM TOSEM 25.1 (2015): 7
+    \end{thebibliography}
+    #+END_EXPORT
+*** Approach
+    formal languages, DSLs, tool adapters/porting
+
+** Challenge #5: Machine Learning on code
+*** Big Code = Big Data + Source Code
+    Popular research trends: NLP on code / SBSE using ML, e.g.:
+    #+BEGIN_EXPORT latex
+    \begin{thebibliography}{Foo Bar, 1969} \footnotesize
+    \bibitem{Allamanis2018} Miltiadis Allamanis et al.
+    \newblock A survey of machine learning for big code and naturalness
+    \newblock ACM Computing Surveys
+    \bibitem{Gu2018} Xiaodong Gu, Hongyu Zhang, Sunghun Kim
+    \newblock Deep Code Search
+    \newblock ICSE 2018
+    \end{thebibliography}
+    #+END_EXPORT
+*** :B_ignoreheading:
+    :PROPERTIES:
+    :BEAMER_env: ignoreheading
+    :END:
+    Common assumptions: AST availability, "small" datasets (e.g., random
+    GitHub samples)
+
+    #+latex: \begin{example}[Universal programming language detection]
+    - programming language detection at the scale of Software Heritage
+    - no AST, only bytes (not characters!)
+    - supervised or unsupervised (the latter would be best for studying
+      language evolution)
+    (a minimal sketch of the supervised, byte-level variant follows)
+    #+latex: \end{example}
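+** Challenge #5: Machine Learning on code (cont.)
+*** Byte-level language detection, sketched
+    A minimal sketch of the supervised variant, assuming scikit-learn and a
+    tiny made-up training set (illustrative only; no archive-scale results
+    are implied):
+    #+BEGIN_SRC python
+    from sklearn.feature_extraction.text import HashingVectorizer
+    from sklearn.linear_model import SGDClassifier
+    from sklearn.pipeline import make_pipeline
+
+    train = [b"def main():\n    pass\n", b"int main(void) { return 0; }\n"]
+    labels = ["Python", "C"]
+
+    # Decoding as latin-1 maps every byte to exactly one character, so the
+    # char n-grams below are really byte n-grams: no AST, no real charset.
+    clf = make_pipeline(
+        HashingVectorizer(analyzer="char", ngram_range=(1, 3)),
+        SGDClassifier(),
+    )
+    clf.fit([b.decode("latin-1") for b in train], labels)
+    print(clf.predict(['printf("hi");'.encode().decode("latin-1")]))
+    #+END_SRC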