diff --git a/talks-public/2017-07-18-team/2017-07-18-team.org b/talks-public/2017-07-18-team/2017-07-18-team.org new file mode 100644 index 0000000..ddc26fc --- /dev/null +++ b/talks-public/2017-07-18-team/2017-07-18-team.org @@ -0,0 +1,717 @@ +#+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) +#+TITLE: Software Heritage +#+SUBTITLE: Vision and outlook +#+AUTHOR: Roberto Di Cosmo +#+DATE: 18/7/2017 +#+EMAIL: roberto@dicosmo.org +#+DESCRIPTION: Preserving the technological knowledge of mankind +#+KEYWORDS: software heritage legacy preservation knowledge mankind technology +#+BEAMER_HEADER: \title[Strategic team meeting]{Software Heritage: vision and outlook} +#+BEAMER_HEADER: \date[18/7/2017]{July 18th 2017\\ Paris} +#+LATEX_HEADER: \usepackage{color} +#+LATEX_HEADER: \usepackage{colortbl} +#+LATEX_HEADER: \usepackage[table]{xcolor}% http://ctan.org/pkg/xcolor +#+LATEX_HEADER: \usepackage{array} +#+LATEX_HEADER: \usepackage{supertabular} + +# +# prelude.org contains all the information needed to export the main beamer latex source +# use prelude-toc.org to get the table of contents +# + +#+INCLUDE: "../../common/modules/prelude-toc.org" :minlevel 1 + +#+INCLUDE: "../../common/modules/169.org" + +# +# Some context: where we come from +# +# +INCLUDE: "../../common/modules/mancoosi-background.org::#main" :minlevel 1 + +# +# Basic properties for software studies +# +# +INCLUDE: "../../common/modules/software-studies-stepback-properties.org::#main" :minlevel 2 :only-contents t + +* Context and motivations +** Software is everywhere + :PROPERTIES: + :CUSTOM_ID: softwareispervasive + :END: +*** At the heart of our society :B_picblock: + :PROPERTIES: + :BEAMER_opt: pic=software-center.pdf, leftpic=true, width=.4\linewidth + :BEAMER_env: picblock + :END: + - communication, entertainment + - administration, finance + - health, energy, transportation + - education, research, politics + 
- ... +*** Knowledge enabler + - /Key mediator/ for accessing /all/ information + - /Essential component/ of modern scientific research + #+BEAMER: \pause +*** + :PROPERTIES: + :BEAMER_env: block + :END: + Software embodies \\ + \hfill our collective *Knowledge* and *Cultural Heritage* + +# * Source code is essential... + +# why software source code is special (2 slides) +# +#+INCLUDE: "../../common/modules/source-code-different-long.org::#thesourcecode" :minlevel 2 +#+INCLUDE: "../../common/modules/source-code-different-long.org::#softwareisdifferent" :minlevel 2 + +# * ... but it is not in good shape! +#+INCLUDE: "../../common/modules/swh-motivations-foss.org::#spread" :minlevel 2 +#+INCLUDE: "../../common/modules/swh-motivations-foss.org::#fragile" :minlevel 2 +#+INCLUDE: "../../common/modules/swh-motivations-foss.org::#research" :minlevel 2 + +# +# Negative presentation, what we are missing today +# +# +INCLUDE: "../../common/modules/swh-motivations-foss.org::#main" :only-contents t :minlevel 2 + +** We are at a turning point +*** Looking at the past + - a lot of old software misplaced, lost, or behind barriers, but... + - most founding fathers are still here, and willing to share + - \alert{urgent} to collect their knowledge + \hfill Only a few years left. +#+BEAMER: \pause +*** Looking at the future + - software development skyrockets + - \alert{essential} to provide a platform for the future + \hfill Every year that goes by makes the problem worse. +#+BEAMER: \pause +*** + \hfill it is \alert{urgent} to take action! 
+ +* The Software Heritage initiative +# +# what SWH does (mission, collect protect share, basic infrastructure schema) +# +# +# emphasis on the source code +# +#+INCLUDE: "../../common/modules/swh-overview-sourcecode.org::#mission" :minlevel 2 + +# Better society, better industry, better science +# +#+INCLUDE: "../../common/modules/vision.org::#foundations" :minlevel 2 +#+INCLUDE: "../../common/modules/vision.org::#heritage" :minlevel 2 +#+INCLUDE: "../../common/modules/vision.org::#industry" :minlevel 2 +#+INCLUDE: "../../common/modules/vision.org::#research" :minlevel 2 + +* Key properties, and principles +** Three properties are key for Software Heritage's mission + :PROPERTIES: + :CUSTOM_ID: keyproperties + :END: +*** Availability + :PROPERTIES: + :BEAMER_act: +- + :END: + - /all/ the /history/ of /all/ the software + - no restrictions (technical, legal, ... ) on /content/ or /metadata/ +*** Traceability + :PROPERTIES: + :BEAMER_act: +- + :END: + - know /what/ we get, /when/, from /where/ and /how/ + - /unique/ identifiers : /one/ name for each object + - /persistent/ and /intrinsic/ identifiers : no middle man, no dangling pointers! 
+*** Uniformity + :PROPERTIES: + :BEAMER_act: +- + :END: + - one /standard/ metadata structure, /irrespective of the origins/ + - /uniform/ naming /schema/ +** Software Heritage's approach + :PROPERTIES: + :CUSTOM_ID: keyproperties + :END: +*** Availability + :PROPERTIES: + :BEAMER_act: +- + :END: + - collect /all/ software from /all/ possible places + - /replicate/ the archive in a network of mirrors +*** Traceability + :PROPERTIES: + :BEAMER_act: +- + :END: + - keep /provenance/ information, systematically + - /unique/ identifiers : use /cryptographic hashes/, derived from the software itself +*** Uniformity + :PROPERTIES: + :BEAMER_act: +- + :END: + - version control data model designed to /represent all the others/ + +** Our principles +#+latex: \begin{center} +#+ATTR_LATEX: :width .7\linewidth +file:SWH-as-foundation-slim.png +#+latex: \end{center} +#+BEAMER: \pause +*** Open approach :B_block:BMCOL: + :PROPERTIES: + :BEAMER_col: 0.3 + :BEAMER_env: block + :END: + open source, transparency +*** Unix philosophy :B_block:BMCOL:noexport:noexport: + :PROPERTIES: + :BEAMER_opt: + :BEAMER_env: block + :BEAMER_col: 0.3 + :END: + - do /one/ thing + - do it /well/ +*** In for the long haul :B_block:BMCOL: + :PROPERTIES: + :BEAMER_col: 0.3 + :BEAMER_env: block + :END: + non profit, replication + +#+BEAMER: \pause +*** Thomas Jefferson, February 18, 1791 + :PROPERTIES: + :BEAMER_act: +- + :END: +# #+latex: \begin{quote} + ...let us save what remains: not by vaults and locks which fence them + from the public eye and use in consigning them to the waste of time, + but by such a multiplication of copies, as shall place them beyond + the reach of accident. 
+# #+latex: \end{quote} +** Summing up + - Transparency and Free Software + - Multi-stakeholder and not-for-profit + - Replication all over the place + - Intrinsic identifiers + - Facts and provenance + - Minimalism (Unix philosophy) +* Yes, we really mean all the source code +** Classifying what is out there +*** Time dimension + - current, and future + - old, legacy +*** Openness dimension + - publicly available, and open source + - non publicly available, closed source +#+BEAMER: \pause +*** + \hfill there are many implications +** Strategy to collect all the source code +*** Different unit cost for each sector +#+BEGIN_EXPORT latex +\begin{center} +\tablefirsthead{} +\tablehead{} +\tabletail{} +\tablelasttail{} +\begin{supertabular}{|c|c|c|} + \cline{2-3} + %\rowcolor{blue!25} +\multicolumn{1}{c}{~} + & + \multicolumn{1}{|c|}{\cellcolor{yellow}Open} & + \multicolumn{1}{c|}{\cellcolor{yellow}Proprietary}\\\hline +\cellcolor{yellow} Current and future & +\cellcolor{yellow} SWH: {\bf \$}, ~~~ extern: {\bf \$} & + SWH: {\bf \$\$}, ~~~ extern: {\bf \$\$} + \\\hline +\cellcolor{yellow} Legacy & + SWH:{\bf \$}, ~~~ extern: {\bf \$\$} & + SWH:{\bf \$\$}, ~~~ extern: {\bf \$\$\$} + \\\hline +\end{supertabular} +\end{center} +#+END_EXPORT +#+BEAMER: \pause +*** Different approaches for each sector +#+BEGIN_EXPORT latex +\begin{center} +\tablefirsthead{} +\tablehead{} +\tabletail{} +\tablelasttail{} +\begin{supertabular}{|c|c|c|} + \cline{2-3} + %\rowcolor{blue!25} +\multicolumn{1}{c}{~} + & + \multicolumn{1}{|c|}{\cellcolor{yellow}Open} & + \multicolumn{1}{c|}{\cellcolor{yellow}Proprietary}\\\hline +\cellcolor{yellow} Current and future & + \cellcolor{yellow}{{\bf Automation}} & + {\bf Embargo} + \\\hline +\cellcolor{yellow} Legacy & + {\bf Crowdsourcing} & + {\bf Focused search} + \\\hline +\end{supertabular} +\end{center} +#+END_EXPORT +#+BEAMER: \pause +# IMPACTS +*** We started on the first quadrant, we need all four! 
+ - tech: access control, security, identification, authorization + - legal: policies, contracts + - community: network, standards, endorsement + +#+INCLUDE: "../../common/modules/swh-functional-architecture.org::#phases" :minlevel 2 + +** Community is essential +# IMPACTS +*** Daunting task + - challenge: extreme variability of sources and technologies + - opportunity: highly parallelisable, /if we provide good abstractions/ +#+BEAMER: \pause +*** Collect phase entry points + - forge listers (e.g.: Avi's and Sushant's work) + - forge protocol extensions (e.g.: Adullact's work on FusionForge) + - VCS loaders (e.g.: Avi's work) + - Web crawler connection (e.g.: Internet Archive discussions) +#+BEAMER: \pause +*** Archive phase entry points + - storage and indexing backends + - application specific data representations + +* Building for the long term +** Three pillars +*** Awareness, visibility, endorsement + - promote public and private policies + - attract users + - unlock funds + - turn copycats into partners +#+BEAMER: \pause +*** Resources + \hfill fund the effort +#+BEAMER: \pause +*** Science and technology + \hfill build on sound basis + # Where we are today: endorsement + # + #+INCLUDE: "../../common/modules/endorsement.org::#endorsement" :minlevel 2 +** Political awareness +*** April 3rd, 2017: landmark Inria Unesco agreement... 
+#+BEGIN_EXPORT latex + \includegraphics[width=\extblockscale{.25\linewidth}]{inria-logo-new} \hfill + \includegraphics[width=\extblockscale{.35\linewidth}]{unesco-accord} \hfill + \includegraphics[width=\extblockscale{.2\linewidth}]{unesco}\\[1em] + \mbox{}\hfill + \includegraphics[width=\extblockscale{.2\linewidth}]{rdc-fh-ib} \hfill + \includegraphics[width=\extblockscale{.15\linewidth}]{SWH-logo_share} \hfill + \includegraphics[width=\extblockscale{.2\linewidth}]{swh-team-2017-04-03}\hfill + \mbox{}\\ + \url{https://www.softwareheritage.org/blog} +#+END_EXPORT +** Resources + #+INCLUDE: "../../common/modules/swh-sponsors.org::#sponsors" :only-contents t +** Science +*** Communication + - iPres 2017 paper accepted + - CACM Viewpoint in the works (thanks Moshe Vardi) + - RDA session September 2017 +*** Collaboration + - point to point contacts around the Big Data / ML challenge + - multiple undercover contacts ongoing (MSR, ICSE, EU, MENESR) +#+BEAMER: \pause +*** Challenge for us + - provide reliable interfaces with the scientific community + (human and technical) +* Zoom on science +# +# Software Research +# +** Multiple facets +*** Scientists as users + - SWH as dataset (computer science) + - reproducibility via SWH (all) +*** Scientists as providers/partners + - research on SWH challenges +** A Universal Archive of Software Development + :PROPERTIES: + :CUSTOM_ID: main + :END: +#+LATEX: \includegraphics[width=\extblockscale{.15\linewidth}]{universal.png} +*** /Repeatable/ Software Studies + :PROPERTIES: + :BEAMER_act: +- + :END: + - vulnerability detection + - dependency analysis + - pattern elicitation + - study of the development graph + - ... 
the sky is the limit +*** Prerequisites + clean, evolvable data and metadata model + +** How we built our scientific knowledge +# +# Scientific method, reproducibility +# +#+INCLUDE: "../../common/modules/scientific-method.org::#short" :only-contents t + +# +# Connection with Open Access +# +#+INCLUDE: "../../common/modules/conservancy.org::#main" :minlevel 2 + +# +# URLS are not good tracers +# +#+INCLUDE: "../../common/modules/urls-decay.org::#main" :only-contents t :minlevel 2 + +# +# DOI is not a solution +# +#+INCLUDE: "../../common/modules/doi-analysis.org::#main" :only-contents t :minlevel 2 + +** What could the good links look like? +*** Links to /software source code/ in an article + Leverage Software Heritage as universal archive: + - set of files :: \small\url{swh:1:tree:06741c8c37c5a384083082b99f4c5ad94cd0cd1f}\\ + id of tree object listing all the files in a project (at a given time) + - revision :: \url{swh:1:rev:7598fb94d59178d65bd8d2892c19356290f5d4e3}\\ + id of commit object which has a tree and (a pointer to) the history + - metadata :: this /may/ involve a DOI +*** + \hfill this is also of /industrial/ relevance! 
+*** Links to /data/ in /software source code/ :noexport: + - external linking mechanisms /that guarantee integrity/ + + git lfs + + git annex + - need to extend them into a generic, VCS independent solution + +** The SWH - HAL connector +*** Strategic + First open access / open source archival process +*** Opportunity + - HAL is one of a kind + - ArXiv uses the same tech +* Roadmap for a sustainable organisation + :PROPERTIES: + :CUSTOM_ID: main + :END: +** Growing a sustainable common digital infrastructure + :PROPERTIES: + :CUSTOM_ID: phases + :END: +*** Ignition (3 Y) \alert{\em Inria} :B_exampleblock: + :PROPERTIES: + :BEAMER_env: exampleblock + :BEAMER_COL: .3 + :BEAMER_ACT: +- + :END: + - Vision + - Team + - Core infrastructure + - Identity + + communication + + community + - Legitimacy + + awareness + + support +*** Scale up (5 Y) :B_block: + :PROPERTIES: + :BEAMER_env: block + :BEAMER_COL: .35 + :BEAMER_ACT: +- + :END: + - Core Infra (engineer) + - Collect (4 strategies) + - Preserve + + mirrors, multiple techs + - Share + + search, browse, APIs + - Connect + + community + - Organisation + + build the foundation +*** Stable Operation ($\infty$) :B_block: + :PROPERTIES: + :BEAMER_env: alertblock + :BEAMER_COL: .38 + :BEAMER_ACT: +- + :END: + - Maintain+Evolve + + archive, community + + bylaws, organisation + - Interact+Engage + + research + + industry + + education + + culture + - Sustainability + + /key/ \alert{infrastructure} + + /ecosystem/ \alert{diversity} + + /foundation/ \alert{endowment} +** Today: team +*** Management + - Roberto and Stefano (CEO/CTO) + - Jean-Fran\c{c}ois Abramatic (Head of Advisory Board) + - Magali Fitzgibbon (Legal, Contracts) +*** R and D, Ops + - 3 engineers (+2 soon) + - 1 PhD, 1 intern + - 1 visiting scientist +*** Everything else + provided by Inria +** Today: funding +*** Baseline + Inria engagement (~ 500Ke/year) +*** Sponsoring + - 3 platinum sponsors (Microsoft, Intel, SocGen) + - 1 silver sponsor (Huawei) + - 4 bronze 
sponsors (DANS, Nokia, DISI, GitHub) +*** Partnerships + - Crossminer +*** + \hfill a /huge/ part of my time +** The next 5 years +*** Collect + - stable process for adding new listers/loaders + - community of contributors +*** Preserve + - stable process for mirror network + - at least 10 mirrors worldwide +*** Share + - browse/search/navigate/download + - automatic classification + - support for research and industry use +*** Process + - continuous improvement (tech, community) +** The next 5 years, cont'd +*** Team + 30 full time people on SWH core\\ + management, dev/ops, fundraising, comm, product, liaison +*** Funding + 4 or 5 Me/year +*** Organisation + - Independent international foundation + - International network of peers +*** Community + research, industry, culture, ... +** Pause +*** Yes, it is + - huge challenge + - unprecedented effort + - much more than just technology + - high risk, high gain +#+BEAMER: \pause +*** + \hfill I believe we can make it! +** What we need to succeed +*** Operations + - stability, reliability, efficiency + #+BEAMER: \pause +*** Engineering + - modularity (platform/plugins, tech oecumenism) + - replicability (mirrors, contributors) + - evolvability + #+BEAMER: \pause +*** Product vision + - "users" and "clients" are coming + #+BEAMER: \pause +*** Mindset + - make the principles guide the technology\\ + /not the other way around/ +* Conclusion +** Come in, we're open +*** Software Heritage is + - a /reference archive/ of /all available/ source code + - a fantastic new tool for /research/ software + - a unique /complement/ for /development platforms/ + - an international, open, nonprofit, /mutualized infrastructure/ + - at the service of our community, at the service of society +*** Questions :B_ignoreheading: + :PROPERTIES: + :BEAMER_env: ignoreheading + :END: +#+BEAMER: {\vfill\begin{center}\Huge{Questions ?}\end{center}\vfill} + +* Appendix :B_appendix: + :PROPERTIES: + :BEAMER_env: appendix + :END: +# +# How we want to 
work, including core properties +# + +* Selected research challenges : building the archive +** Data compression + Deduplication is performed at the file level /across all projects in the world/ +*** Pros + - very efficient to cope with file clones + - quite resilient to technology changes +*** Cons + - a minor edit creates two different files +#+BEAMER: \pause +*** Challenge: exploit file similarities + - adapt / improve variable size checksums / diff detection + - compression rates of up to 100 to 1 may arise +** Metadata alignment :noexport: +*** Many concepts related to source code + - project, archive, source, language, licence, bts, mailing list, ... + - developer, committer, author, architect, ... +*** Many existing ontologies + DOAP, FOAF, Appstream, schema.org, ADMS.SW, ... +*** Many disparate catalogs + :PROPERTIES: + :BEAMER_act: +- + :END: +# mostly manual + Freecode (40.000+), Plume (400+), Debian (25.000+), OpenHub (670.000+), ... +# FramaSoft (1500+), +# OpenHub is mostly automatic +# Wikipedia ? 
+*** Challenge : scale up metadata to millions of projects + :PROPERTIES: + :BEAMER_act: +- + :END: + - /reconcile/ existing ontologies + - /link/ and /check/ existing catalogs with Software Heritage + - handle /inconsistent data/ and /provenance information/ + - synthesise missing information (machine learning) + +** Software phylogenetics :noexport: +*** The Software Diaspora + :PROPERTIES: + :BEAMER_act: +- + :END: + - Code often /migrates/ across projects : forks, copy-paste + - Code gets /cloned/ : reuse, language limitations, code smells + - Projects /migrate/ across forges : fashion, functionality + - Projects get /cloned/ : mirrors, packages +*** Challenge: tracing software evolution across billions of files + :PROPERTIES: + :BEAMER_act: +- + :END: + - rebuild the history of software artefacts + - identify code origins + - spot code clones + - build project impact graphs +** Distributed infrastructure +*** The software graph + - files + - directories + - commits + - projects + all de-duplicated in Software Heritage +*** Challenge: design efficient architectures and algorithms + - replication and availability (CAP?) + - navigation + - query + - path analysis +* Selected research challenges : using the archive +** Code search +*** A natural need + :PROPERTIES: +# :BEAMER_act: +- + :END: + - Find the definition of a function/class/procedure/type/structure + - Search examples of code usage in an archive of source code + - you name it... +*** Approaches + :PROPERTIES: +# :BEAMER_act: +- + :END: + - language specific /patterns/ + - working on /abstract syntax trees/ + Regular expressions are a nice /swiss-army knife/ approximation, can we build a specific tool that scales? +*** What about /all the source code/ in the world? + :PROPERTIES: + :BEAMER_act: +- + :END: + - /hundreds/ of billions of LOCs + We need new insight for handling this. 
+** Software as Big Data +*** Remember the numbers + - 60+ million repositories ingested + - 700+ million commits + - 3+ billion unique source files / 200 TB of raw source code + and growing by the day! +*** Challenge: what can machines learn here? + - programming patterns / trends + - developer skills + - vulnerabilities + - bugs and fixes +** Efficient data representation :noexport: +*** Remember the numbers + - 60+ million repositories ingested + - 700+ million commits + - 3+ billion unique source files / 200 TB of raw source code + and growing by the day! +*** Challenge: can we make this fit in memory? + - efficient graph representation + - fast non-local queries + - mitigate the size/speed tradeoff +* A glimpse of the archive +#+INCLUDE: "../../common/modules/status-extended.org::#api" :only-contents t +* Bits from the drawing board +#+INCLUDE: "../../common/modules/bits-drawing-board.org::#keyproperties" :minlevel 2 +#+INCLUDE: "../../common/modules/bits-drawing-board.org::#foss" :minlevel 2 +#+INCLUDE: "../../common/modules/bits-drawing-board.org::#intrinsicids" :minlevel 2 +#+INCLUDE: "../../common/modules/bits-drawing-board.org::#replication" :minlevel 2 +** Some planned working groups +#+INCLUDE: "../../common/modules/your-help-wg.org::#sodi" :minlevel 3 +#+INCLUDE: "../../common/modules/your-help-wg.org::#sapi" :minlevel 3 +#+INCLUDE: "../../common/modules/your-help-wg.org::#opad" :minlevel 3 +* Tech bits +** More details on the internals +#+INCLUDE: "../../common/modules/status-extended.org::#architecture" :only-contents t +#+INCLUDE: "../../common/modules/status-extended.org::#merklerevision" :only-contents t +# +# Contributing to the great picture +# +** The team :noexport: + #+latex: \begin{center} + #+ATTR_LATEX: :width .35\linewidth +file:core-team-formal.png + #+latex: \end{center} + #+BEAMER: \pause +* Technical status +# #+INCLUDE: "../../common/modules/status-extended.org::#people" :minlevel 2 +#+INCLUDE: 
"../../common/modules/status-extended.org::#archive" :minlevel 2 +** Archiving goals + Targets: VCS repositories & source code releases (e.g., tarballs) +*** We DO archive + - file *content* (= blobs) + - *revisions* (= commits), with full metadata + - *releases* (= tags), ditto + - where (*origin*) & when (*visit*) we found any of the above + # - time-indexed repo *snapshots* (i.e., we never delete anything) + … in a VCS-/archive-agnostic *canonical data model* +*** We DON'T archive (for now) + # - diffs → derived data from related contents + - homepages, wikis + - BTS/issues/code reviews/etc. + - mailing lists + Long term vision: play our part in a /"semantic wikipedia of software"/ + +** Dataflow + #+BEAMER: \begin{center}\includegraphics[width=\extblockscale{.9\textwidth}]{swh-dataflow.pdf}\end{center} +# +# Key properties of the system +# +** Much more than an archive! + #+INCLUDE: "../../common/modules/status-extended.org::#merkletree" :only-contents t + #+INCLUDE: "../../common/modules/status-extended.org::#merkledemo" :minlevel 2 +# +INCLUDE: "../../common/modules/status.org::#datamodel" :minlevel 2 +# +INCLUDE: "../../common/modules/status-extended.org::#merkletree" :minlevel 2 +# +INCLUDE: "../../common/modules/status-extended.org::#merkledemo" :minlevel 2 +# +INCLUDE: "../../common/modules/status-extended.org::#architecture" :only-contents t +# +INCLUDE: "../../common/modules/status-extended.org::#merklerevision" :only-contents t +# +INCLUDE: "../../common/modules/status-extended.org::#giantdag" :only-contents t +# +INCLUDE: "../../common/modules/status-extended.org::#features" :minlevel 2 + diff --git a/talks-public/2017-07-18-team/Makefile b/talks-public/2017-07-18-team/Makefile new file mode 100644 index 0000000..68fbee7 --- /dev/null +++ b/talks-public/2017-07-18-team/Makefile @@ -0,0 +1 @@ +include ../Makefile.slides