diff --git a/common/modules/dataset.org b/common/modules/dataset.org index 43e26fb..3f4e249 100644 --- a/common/modules/dataset.org +++ b/common/modules/dataset.org @@ -1,260 +1,261 @@ #+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) #+INCLUDE: "prelude.org" :minlevel 1 * Open Datasets :PROPERTIES: :CUSTOM_ID: main :END: ** Software Heritage Graph dataset :PROPERTIES: :CUSTOM_ID: graphdataset :END: #+BEAMER: \vspace{-1mm} **Use case:** large scale analyses of the most comprehensive corpus on the development history of free/open source software. *** #+BEGIN_EXPORT latex \vspace{-2mm} \begin{thebibliography}{Foo Bar, 1969} \small \bibitem{Pietri2019} Antoine Pietri, Diomidis Spinellis, Stefano Zacchiroli\newblock The Software Heritage Graph Dataset: Public software development under one roof\newblock MSR 2019: 16th Intl. Conf. on Mining Software Repositories. IEEE\newblock preprint: \url{http://deb.li/swhmsr19} \end{thebibliography} #+END_EXPORT #+BEAMER: \vspace{-1mm} *** Dataset - Relational representation of the full graph as a set of tables - Available as open data: https://doi.org/10.5281/zenodo.2583978 - Chosen as subject for the *MSR 2020 Mining Challenge* #+BEAMER: \vspace{-1mm} *** Formats - Local use: PostgreSQL dumps, or Apache Parquet files (~1 TiB each) - Live usage: Amazon Athena (SQL-queriable), Azure Data Lake ** Sample query --- most frequent first commit words :PROPERTIES: :CUSTOM_ID: graphquery1 :END: *** #+begin_src sql SELECT COUNT(*) AS c, word FROM ( SELECT LOWER(REGEXP_EXTRACT(FROM_UTF8( message), '^\w+')) AS word FROM revision) WHERE word != '' GROUP BY word ORDER BY COUNT(*) DESC LIMIT 5; #+end_src #+BEAMER: \pause *** | *Count* | *Word* | |------------+--------| | 71 338 310 | update | | 64 980 346 | merge | | 56 854 372 | add | | 44 971 954 | added | | 33 222 056 | fix | ** Sample query --- fork and merge arities :PROPERTIES: :CUSTOM_ID: 
graphquery2 :END: *** Fork arity :PROPERTIES: :BEAMER_env: block :BEAMER_COL: 0.5 :END: i.e., how often is a commit based upon? #+BEAMER: \scriptsize #+begin_src sql SELECT fork_deg, count(*) FROM ( SELECT id, count(*) AS fork_deg FROM revision_history GROUP BY id) t GROUP BY fork_deg ORDER BY fork_deg; #+end_src #+BEAMER: \includegraphics[width=\linewidth]{fork-degree} #+BEAMER: \pause *** Merge arity :PROPERTIES: :BEAMER_env: block :BEAMER_COL: 0.5 :END: i.e., how large are merges? #+BEAMER: \scriptsize #+begin_src sql SELECT merge_deg, COUNT(*) FROM ( SELECT parent_id, COUNT(*) AS merge_deg FROM revision_history GROUP BY parent_id) t GROUP BY merge_deg ORDER BY merge_deg; #+end_src #+BEAMER: \includegraphics[width=\linewidth]{merge-degree} * Other queries :PROPERTIES: :CUSTOM_ID: morequery :END: ** Sample query --- ratio of commits performed during weekends :PROPERTIES: :CUSTOM_ID: weekendsrc :END: #+BEGIN_SRC sql WITH revision_date AS (SELECT FROM_UNIXTIME(date / 1000000) AS date FROM revision) SELECT yearly_rev.year AS year, CAST(yearly_weekend_rev.number AS DOUBLE) / yearly_rev.number * 100.0 AS weekend_pc FROM (SELECT YEAR(date) AS year, COUNT(*) AS number FROM revision_date WHERE YEAR(date) BETWEEN 1971 AND 2018 GROUP BY YEAR(date) ) AS yearly_rev JOIN (SELECT YEAR(date) AS year, COUNT(*) AS number FROM revision_date WHERE DAY_OF_WEEK(date) >= 6 AND YEAR(date) BETWEEN 1971 AND 2018 GROUP BY YEAR(date) ) AS yearly_weekend_rev ON yearly_rev.year = yearly_weekend_rev.year ORDER BY year DESC; #+END_SRC ** Sample query --- ratio of commits performed during weekends (cont.) 
:PROPERTIES: :CUSTOM_ID: weekendout :END: | *Year* | *Weekend* | *Total* | *Weekend percentage* | |--------+-----------+-----------+----------------------| | 2018 | 15130065 | 78539158 | 19.26 | | 2017 | 33776451 | 168074276 | 20.09 | | 2016 | 43890325 | 209442130 | 20.95 | | 2015 | 35781159 | 166884920 | 21.44 | | 2014 | 24591048 | 122341275 | 20.10 | | 2013 | 17792778 | 88524430 | 20.09 | | 2012 | 12794430 | 64516008 | 19.83 | | 2011 | 9765190 | 48479321 | 20.14 | | 2010 | 7766348 | 38561515 | 20.14 | | 2009 | 6352253 | 31053219 | 20.45 | | 2008 | 4568373 | 22474882 | 20.32 | | 2007 | 3318881 | 16289632 | 20.37 | | 2006 | 2597142 | 12224905 | 21.24 | | 2005 | 2086697 | 9603804 | 21.72 | | 2004 | 1752400 | 7948104 | 22.04 | | 2003 | 1426033 | 6941593 | 20.54 | | 2002 | 1159294 | 5378538 | 21.55 | | 2001 | 849905 | 4098587 | 20.73 | | 2000 | 2091770 | 4338842 | 48.21 | | 1999 | 438540 | 2026906 | 21.63 | | 1998 | 311888 | 1430567 | 21.80 | | 1997 | 263995 | 1129249 | 23.37 | | 1996 | 192543 | 795827 | 24.19 | | 1995 | 176270 | 670417 | 26.29 | | 1994 | 137811 | 581563 | 23.69 | | 1993 | 169767 | 697343 | 24.34 | | 1992 | 74923 | 422068 | 17.75 | | 1991 | 92782 | 484547 | 19.14 | | 1990 | 113201 | 340489 | 33.24 | | 1989 | 31742 | 182325 | 17.40 | | 1988 | 44983 | 206275 | 21.80 | | 1987 | 27892 | 146157 | 19.08 | | 1986 | 54200 | 237330 | 22.83 | | 1985 | 75595 | 306564 | 24.65 | | 1984 | 26391 | 95506 | 27.63 | | 1983 | 89776 | 370687 | 24.21 | | 1982 | 51524 | 191933 | 26.84 | | 1981 | 32995 | 123618 | 26.69 | | 1980 | 31832 | 133733 | 23.80 | | 1979 | 20943 | 175164 | 11.95 | | 1978 | 3773 | 33677 | 11.20 | | 1977 | 4783 | 19376 | 24.68 | | 1976 | 1907 | 7048 | 27.05 | | 1975 | 2089 | 26579 | 7.85 | | 1974 | 2095 | 14290 | 14.66 | | 1973 | 2988 | 15580 | 19.17 | | 1972 | 1755 | 6552 | 26.78 | | 1971 | 1723 | 6125 | 28.13 | ** Sample query --- average size of the most popular file types :PROPERTIES: :CUSTOM_ID: popfilesrc :END: #+BEGIN_SRC sql SELECT suffix, 
ROUND(COUNT(*) * 100 / 1e6) AS Million_files, ROUND(AVG(length) / 1024) AS Average_k_length FROM (SELECT length, suffix FROM -- File length in joinable form (SELECT TO_BASE64(sha1_git) AS sha1_git64, length FROM content ) AS content_length JOIN -- Sample of files with popular suffixes (SELECT target64, file_suffix_sample.suffix AS suffix FROM -- Popular suffixes (SELECT suffix FROM ( SELECT REGEXP_EXTRACT(FROM_UTF8(name), '\.[^.]+$') AS suffix FROM directory_entry_file) AS file_suffix GROUP BY suffix ORDER BY COUNT(*) DESC LIMIT 20 ) AS pop_suffix JOIN -- Sample of files and suffixes (SELECT TO_BASE64(target) AS target64, REGEXP_EXTRACT(FROM_UTF8(name), '\.[^.]+$') AS suffix FROM directory_entry_file TABLESAMPLE BERNOULLI(1)) AS file_suffix_sample ON file_suffix_sample.suffix = pop_suffix.suffix) AS pop_suffix_sample ON pop_suffix_sample.target64 = content_length.sha1_git64) GROUP BY suffix ORDER BY AVG(length) DESC; #+END_SRC * Discussion :PROPERTIES: :CUSTOM_ID: discussion :END: ** Discussion - one /can/ query such a corpus SQL-style - but relational representation shows its limits at this scale - ...at least as deployed on commercial SQL offerings such as Athena - note: (naive) sharding is ineffective, due to the pseudo-random distribution of node identifiers - experiments with Google BigQuery are ongoing - (we broke it at the first import attempt..., due to very large arrays in directory entry tables) * License Dataset ** Software Heritage License Blob Dataset :PROPERTIES: :CUSTOM_ID: licensedataset :END: + #+BEAMER: \vspace{-2mm} *** #+BEGIN_EXPORT latex \vspace{-2mm} \begin{thebibliography}{Foo Bar, 1969} \footnotesize \bibitem{Zacchiroli2022LicenseBlobs} Stefano Zacchiroli \newblock A Large-scale Dataset of (Open Source) License Text Variants \newblock MSR 2022 (best dataset paper award)\newblock preprint: \url{https://arxiv.org/abs/2204.00256} \end{thebibliography} #+END_EXPORT *** Dataset #+BEAMER: \vspace{-1mm} - 6.5 million unique full texts of FOSS 
license variants - Detected using filename patterns across the entire SWH archive - =LICENSE=, =COPYRIGHT=, =NOTICE=, etc. - Metadata: file lengths measures, detected MIME type, detected SPDX license (via ScanCode), example origin repository, oldest public commit of origin *** Use cases #+BEAMER: \vspace{-1mm} - Empirical studies on FOSS licensing, including phylogenetics - Training of automated license classifiers - NLP analyses of legal texts diff --git a/common/modules/graph-compression.org b/common/modules/graph-compression.org index a6d79e5..e42d9ae 100644 --- a/common/modules/graph-compression.org +++ b/common/modules/graph-compression.org @@ -1,316 +1,345 @@ #+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) #+INCLUDE: "prelude.org" :minlevel 1 # Depends: \usepackage{pdfpages} * Graph Compression :PROPERTIES: :CUSTOM_ID: main :END: +** Graph compression on the Software Heritage archive :noexport: + :PROPERTIES: + :CUSTOM_ID: oneslide + :END: +*** + #+BEGIN_EXPORT latex + \vspace{-3mm} + \footnotesize + \begin{thebibliography}{Foo Bar, 1969} + \bibitem{Boldi2020} Paolo Boldi, Antoine Pietri, Sebastiano Vigna, Stefano Zacchiroli + \newblock Ultra-Large-Scale Repository Analysis via Graph Compression + \newblock SANER 2020, 27th Intl. Conf. on Software Analysis, Evolution and Reengineering. IEEE + \end{thebibliography} + #+END_EXPORT + #+BEAMER: \vspace{-1mm} +*** Research question + Is it possible to efficiently perform software development history analyses + at the scale of Software Heritage archive on a single, relatively cheap + machine? + #+BEAMER: \vspace{-1mm} +*** Idea + Apply state-of-the-art graph compression techniques from the field of Web + graph / social network analysis. 
+ #+BEAMER: \vspace{-1mm} +*** Results + The entire archive graph (25 B nodes, 350 B edges) can be loaded in 200 GiB + and then traversed at the cost of tens of nanoseconds per edge (= a few + hours for a complete single-thread traversal of the archive). + ** Graph compression on the Software Heritage archive :PROPERTIES: :CUSTOM_ID: intro :END: *** #+BEGIN_EXPORT latex \vspace{-3mm} \begin{thebibliography}{Foo Bar, 1969} \bibitem{Boldi2020} Paolo Boldi, Antoine Pietri, Sebastiano Vigna, Stefano Zacchiroli \newblock Ultra-Large-Scale Repository Analysis via Graph Compression \newblock SANER 2020, 27th Intl. Conf. on Software Analysis, Evolution and Reengineering. IEEE \end{thebibliography} #+END_EXPORT *** Research question Is it possible to efficiently perform software development history analyses at ultra large scale (= the scale of Software Heritage archive or more), on a single, relatively cheap machine? *** Idea Apply state-of-the-art graph compression techniques from the field of Web graph / social network analysis. ** Background --- (Web) graph compression :PROPERTIES: :CUSTOM_ID: background1 :END: *** The graph of the Web :PROPERTIES: :BEAMER_env: definition :END: Directed graph that has Web pages as nodes and hyperlinks between them as edges. *** Properties (1) - **Locality:** pages link to pages whose URLs are lexicographically similar. URLs share long common prefixes. → use *D-gap compression* *** Adjacency lists :PROPERTIES: :BEAMER_env: block :BEAMER_col: 0.51 :END: #+BEAMER: \scriptsize | *Node* | *Outdegree* | *Successors* | |--------+-------------+--------------------------------------| | ... | ... | ... | | 15 | 11 | 13,15,16,17,18,19,23,24,203,315,1034 | | 16 | 10 | 15,16,17,22,23,24,315,316,317,3041 | | 17 | 0 | | | 18 | 5 | 13,15,16,17,50 | | ... | ... | ... 
| *** D-gapped adjacency lists :PROPERTIES: :BEAMER_env: block :BEAMER_col: 0.48 :END: #+BEAMER: \scriptsize | *Node* | *Outdegree* | *Successors* | |--------+-------------+-----------------------------| | ... | ... | ... | | 15 | 11 | 3,1,0,0,0,0,3,0,178,111,718 | | 16 | 10 | 1,0,0,4,0,0,290,0,0,2723 | | 17 | 0 | | | 18 | 5 | 9,1,0,0,32 | | ... | ... | ... | ** Background --- (Web) graph compression (cont.) :PROPERTIES: :CUSTOM_ID: background2 :END: *** The graph of the Web :PROPERTIES: :BEAMER_env: definition :END: Directed graph that has Web pages as nodes and hyperlinks between them as edges. *** Properties (2) - **Similarity:** pages that are close together in lexicographic order tend to have many common successors. → use *reference compression* *** Adjacency lists :PROPERTIES: :BEAMER_env: block :BEAMER_col: 0.47 :END: #+BEAMER: \scriptsize | *Node* | *Outd.* | *Successors* | |--------+---------+--------------------------------------| | ... | ... | ... | | 15 | 11 | 13,15,16,17,18,19,23,24,203,315,1034 | | 16 | 10 | 15,16,17,22,23,24,315,316,317,3041 | | 17 | 0 | | | 18 | 5 | 13,15,16,17,50 | | ... | ... | ... | *** Copy lists :PROPERTIES: :BEAMER_env: block :BEAMER_col: 0.60 :END: #+BEAMER: \scriptsize | *Node* | *Ref.* | *Copy list* | *Extra nodes* | |--------+--------+-------------+--------------------------------------| | ... | ... | ... | ... | | 15 | 0 | | 13,15,16,17,18,19,23,24,203,315,1034 | | 16 | 1 | 01110011010 | 22,316,317,3041 | | 17 | | | | | 18 | 3 | 11110000000 | 50 | | ... | ... | ... | | ** Background --- Web graph compression (OLD) :noexport: Borrowing (great!) 
slides from: #+BEGIN_EXPORT latex \begin{thebibliography}{} \bibitem{Pibiri2018} Giulio Ermanno Pibiri \newblock Effective Web Graph Representations, 2018 \newblock \url{http://pages.di.unipi.it/pibiri/slides/webgraphs\_compression.pdf} \end{thebibliography} #+END_EXPORT ** Background -- Web graph compression (imported slides) (OLD) :noexport: :PROPERTIES: :BEAMER_env: ignoreheading :END: #+BEGIN_EXPORT latex { \setbeamercolor{background canvas}{bg=} \setbeamertemplate{background}{} \includepdf[pages={4,11,12,13}]{webgraphs_compression.pdf} \addtocounter{framenumber}{4} } #+END_EXPORT ** Corpus :PROPERTIES: :CUSTOM_ID: corpus :END: *** Nodes :PROPERTIES: :BEAMER_env: block :BEAMER_col: 0.5 :END: #+BEAMER: \small | *Node type* | *N. of nodes* | |-------------+---------------| | origins | 88 M | | snapshots | 57 M | | releases | 9.9 M | | revisions | 1.1 B | | directories | 4.9 B | | contents | 5.5 B | |-------------+---------------| | Total nodes | 12 B | *** Edges :PROPERTIES: :BEAMER_env: block :BEAMER_col: 0.5 :END: #+BEAMER: \footnotesize | *Edge type* | *N. of edges* | |-----------------------+---------------| | origin → snapshot | 195 M | | snapshot → revision | 616 M | | snapshot → release | 215 M | | release → revision | 9.9 M | | revision → revision | 1.2 B | | revision → directory | 1.1 B | | directory → directory | 48 B | | directory → revision | 482 M | | directory → content | 112 B | |-----------------------+---------------| | Total edges | 165 B | *** :PROPERTIES: :BEAMER_env: ignoreheading :END: #+BEAMER: \vspace{1mm} Stats for archive snapshot 2018-09-25, from the Software Heritage graph dataset. 
Growth rate: exponential, doubling every 22-30 months, cf.: #+BEGIN_EXPORT latex \begin{thebibliography}{Foo Bar, 1969} \footnotesize \bibitem{Rousseau2020} Roberto Di Cosmo, Guillaume Rousseau, Stefano Zacchiroli \newblock Software Provenance Tracking at the Scale of Public Source Code \newblock Empirical Software Engineering 25(4): 2930-2959 (2020) \end{thebibliography} #+END_EXPORT ** Graph compression pipeline :PROPERTIES: :CUSTOM_ID: pipeline :END: #+BEAMER: \hspace*{-0.1\linewidth} \includegraphics[width=1.2\linewidth]{compression/compression-steps} #+BEAMER: \vspace{-1cm} *** - *MPH*: minimal perfect hash, mapping Merkle IDs to 0..N-1 integers - *BV* compress: Boldi-Vigna compression (based on MPH order) - *BFS*: breadth-first visit to renumber - *Permute*: update BV compression according to BFS order *** (Re)establishing locality - key for good compression is a node ordering that ensures locality and similarity - which is very much /not/ the case with Merkle IDs, ...but is the case /again/ after BFS reordering ** Compression experiment :PROPERTIES: :CUSTOM_ID: compexp :END: | *Step* | *Wall time* (hours) | |-------------+---------------------| | MPH | 2 | | BV Compress | 84 | | BFS | 19 | | Permute | 18 | | Transpose | 15 | |-------------+---------------------| | Total | 138 (6 days) | - server equipped with 24 CPUs and 750 GB of RAM - RAM mostly used as I/O cache for the BFS step - /minimum/ memory requirements are close to the RAM needed to load the final compressed graph in memory ** Compression efficiency (space) :PROPERTIES: :CUSTOM_ID: spaceefficiency :END: *** :PROPERTIES: :BEAMER_env: block :BEAMER_col: 0.4 :END: | *Forward graph* | | | total size | 91 GiB | | bits per edge | 4.91 | | compression ratio | 15.8% | *** :PROPERTIES: :BEAMER_env: block :BEAMER_col: 0.4 :END: | *Backward graph* | | | total size | 83 GiB | | bits per edge | 4.49 | | compression ratio | 14.4% | *** Operating cost The structure of a full bidirectional archive graph fits in 
less than 200 GiB of RAM, for a hardware cost of ~300 USD. ** Compression efficiency (time) :PROPERTIES: :CUSTOM_ID: timeefficiency :END: *** Benchmark --- Full BFS visit (single thread) #+BEAMER: \begin{columns}\begin{column}{0.45\textwidth} | *Forward graph* | | |------------------+----------------| | wall time | 1h48m | | throughput | 1.81 M nodes/s | | | (553 ns/node) | #+BEAMER: \end{column}\begin{column}{0.45\textwidth} | *Backward graph* | | |------------------+----------------| | wall time | 3h17m | | throughput | 988 K nodes/s | | | (1.01 µs/node) | #+BEAMER: \end{column}\end{columns} *** Benchmark --- Edge lookup random sample: 1 B nodes (8.3% of entire graph); then enumeration of all successors #+BEAMER: \begin{columns}\begin{column}{0.45\textwidth} | *Forward graph* | | |------------------+----------------| | visited edges | 13.6 B | | throughput | 12.0 M edges/s | | | (83 ns/edge) | #+BEAMER: \end{column}\begin{column}{0.45\textwidth} | *Backward graph* | | |------------------+----------------| | visited edges | 13.6 B | | throughput | 9.45 M edges/s | | | (106 ns/edge) | #+BEAMER: \end{column}\end{columns} *** :PROPERTIES: :BEAMER_env: ignoreheading :END: Note how edge lookup time is close to DRAM random access time (50-60 ns). ** Discussion :PROPERTIES: :CUSTOM_ID: discussion :END: *** Incrementality compression is *not incremental*, due to the use of contiguous integer ranges - but the graph is append-only, so... - ...based on expected graph growth rate it should be possible to pre-allocate enough free space in the integer ranges to support *amortized incrementality* (future work) #+BEAMER: \pause *** In-memory v. 
on-disk the compressed in-memory graph structure has *no attributes* - usual design is to exploit the 0..N-1 integer ranges to *memory map node attributes* to disk for efficient access - works well for queries that does graph traversal first and "join" node attributes last; ping-pong between the two is expensive - edge attributes are more problematic (work in progress) diff --git a/common/modules/swh-fuse.org b/common/modules/swh-fuse.org index 461ee9d..74f6bbf 100644 --- a/common/modules/swh-fuse.org +++ b/common/modules/swh-fuse.org @@ -1,114 +1,139 @@ #+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) #+INCLUDE: "prelude.org" :minlevel 1 ** Software Heritage Filesystem (SwhFS) :PROPERTIES: :CUSTOM_ID: oneslide :END: *** The *Software Heritage Filesystem (SwhFS)* is a user-space POSIX filesystem that enables browsing parts of the Software Heritage archive as if it were locally available. *** - code: https://forge.softwareheritage.org/source/swh-fuse/ - documentation: https://docs.softwareheritage.org/devel/swh-fuse/ *** #+BEGIN_EXPORT latex \begin{thebibliography}{Foo Bar, 1969} \bibitem{Allancon2021} Thibault Allançon, Antoine Pietri, Stefano Zacchiroli \newblock The Software Heritage Filesystem (SwhFS): Integrating Source Code Archival with Development \newblock ICSE 2021: The 43rd International Conference on Software Engineering \newblock \url{https://arxiv.org/abs/2102.06390} \end{thebibliography} #+END_EXPORT +** Software Heritage Filesystem (SwhFS) --- Example :noexport: + :PROPERTIES: + :CUSTOM_ID: examplemini + :END: +*** +#+BEAMER: \footnotesize +#+BEGIN_SRC +$ mkdir swhfs +$ swh fs mount swhfs/ # mount the archive +$ cd swhfs/ + +$ cat archive/swh:1:cnt:c839dea9e8e6f0528b468214348fee8669b305b2 +#include + +int main(void) { + printf("Hello, World!\n"); +} + +$ cd archive/swh:1:dir:1fee702c7e6d14395bbf5ac3598e73bcbf97b030 +$ ls | wc -l +127 +$ grep -i antenna 
THE_LUNAR_LANDING.s | cut -f 5 +# IS THE LR ANTENNA IN POSITION 1 YET +# BRANCH IF ANTENNA ALREADY IN POSITION 1 +#+END_SRC * Software Heritage Filesystem (SwhFS) --- Tutorial :PROPERTIES: :CUSTOM_ID: tutorial :END: ** Software Heritage Filesystem (SwhFS) --- Tutorial *** #+BEGIN_SRC $ pip install swh.fuse # install SwhFS $ mkdir swhfs $ swh fs mount swhfs/ # mount the archive $ ls -1F swhfs/ # list entry points archive/ # <- start browsing from here cache/ origin/ README #+END_SRC ** Software Heritage Filesystem (SwhFS) --- Tutorial (cont.) *** #+BEAMER: \footnotesize #+BEGIN_SRC $ cd swhfs/ $ cat archive/swh:1:cnt:c839dea9e8e6f0528b468214348fee8669b305b2 #include int main(void) { printf("Hello, World!\n"); } #+END_SRC ** Software Heritage Filesystem (SwhFS) --- Tutorial (cont.) *** #+BEAMER: \footnotesize #+BEGIN_SRC $ cd archive/swh:1:dir:1fee702c7e6d14395bbf5ac3598e73bcbf97b030 $ ls | wc -l 127 $ grep -i antenna THE_LUNAR_LANDING.s | cut -f 5 # IS THE LR ANTENNA IN POSITION 1 YET # BRANCH IF ANTENNA ALREADY IN POSITION 1 #+END_SRC ** Software Heritage Filesystem (SwhFS) --- Tutorial (cont.) *** #+BEAMER: \footnotesize #+BEGIN_SRC $ cd archive/swh:1:rev:9d76c0b163675505d1a901e5fe5249a2c55609bc $ ls -F history/ meta.json@ parent@ parents/ root@ $ jq '.author.name, .date, .message' meta.json "Michal Golebiowski-Owczarek" "2020-03-02T23:02:42+01:00" "Data:Event:Manipulation: Prevent collisions with Object.prototype ..." $ find root/src/ -type f -name '*.js' | xargs cat | wc -l 10136 #+END_SRC ** Software Heritage Filesystem (SwhFS) --- Tutorial (cont.) *** #+BEAMER: \footnotesize #+BEGIN_SRC $ swh web search git-annex --limit 1 ... git://git.joeyh.name/git-annex.git \ https://archive.softwareheritage.org/api/1/origin/git://git.joeyh.name/git-annex.git/visits/ ... 
$ swh web search git-annex --url-encode | cut -f 1 git%3A%2F%2Fgit.joeyh.name%2Fgit-annex.git $ cd origin/git%3A%2F%2Fgit.joeyh.name%2Fgit-annex.git $ ls -F 2020-12-19/ $ ls 2020-12-19/snapshot/refs/heads/master/root/ Annex/ COPYRIGHT NEWS Annex.hs Creds.hs P2P/ Assistant/ Crypto.hs README Assistant.hs Database/ Remote/ Backend/ debian/ RemoteDaemon/ #+END_SRC diff --git a/talks-public/2022-09-28-ese-research/2022-09-28-ese-research.org b/talks-public/2022-09-28-ese-research/2022-09-28-ese-research.org index a7eb802..1e45f9e 100644 --- a/talks-public/2022-09-28-ese-research/2022-09-28-ese-research.org +++ b/talks-public/2022-09-28-ese-research/2022-09-28-ese-research.org @@ -1,16 +1,30 @@ #+COLUMNS: %40ITEM %10BEAMER_env(Env) %9BEAMER_envargs(Env Args) %10BEAMER_act(Act) %4BEAMER_col(Col) %10BEAMER_extra(Extra) %8BEAMER_opt(Opt) #+TITLE: Empirical Software Engineering Research with Software Heritage #+BEAMER_HEADER: \date[2022-09-28]{28 September 2022} +#+BEAMER_HEADER: \title[Empirical Software Eng. 
Research with Software Heritage]{Empirical Software Engineering Research with Software Heritage} #+AUTHOR: Stefano Zacchiroli #+DATE: 28 September 2022 #+EMAIL: stefano.zacchiroli@telecom-paris.fr #+INCLUDE: "../../common/modules/prelude-toc.org" :minlevel 1 #+INCLUDE: "../../common/modules/169.org" #+BEAMER_HEADER: \institute[Télécom Paris]{Télécom Paris, Polytechnic Institute of Paris\\ {\tt stefano.zacchiroli@telecom-paris.fr}} #+BEAMER_HEADER: \author{Stefano Zacchiroli} * Datasets -#+INCLUDE: "../../common/modules/dataset.org::#graphdataset" -#+INCLUDE: "../../common/modules/dataset.org::#graphquery1" -#+INCLUDE: "../../common/modules/dataset.org::#licensedataset" +** Graph dataset +#+INCLUDE: "../../common/modules/dataset.org::#graphdataset" :only-contents t +** Graph dataset --- example +#+INCLUDE: "../../common/modules/dataset.org::#graphquery1" :only-contents t +** License dataset +#+INCLUDE: "../../common/modules/dataset.org::#licensedataset" :only-contents t +* Accessing source code artifacts +** The Software Heritage Filesystem (SwhFS) +#+INCLUDE: "../../common/modules/swh-fuse.org::#oneslide" :only-contents t +** The Software Heritage Filesystem (SwhFS) --- example +#+INCLUDE: "../../common/modules/swh-fuse.org::#examplemini" :only-contents t +** Graph compression +#+INCLUDE: "../../common/modules/graph-compression.org::#oneslide" :only-contents t +* Software provenance and evolution +* Software forks +* Diversity, equity, and inclusion