diff --git a/sql/swh-graph/README.md b/sql/swh-graph/README.md new file mode 100644 index 0000000..0f79b06 --- /dev/null +++ b/sql/swh-graph/README.md @@ -0,0 +1,6 @@ +SQL queries to compute the size of the **Software Heritage graph**, which has +as nodes commits/trees/blobs, and as edges the relations among them. + +The main file is swh-graph.txt, in org-mode, which generates swh-graph.sql +using org-babel-tangle (C-c C-v t). swh-graph.out are the last results obtained +running the query. diff --git a/sql/swh-graph/swh-graph.out b/sql/swh-graph/swh-graph.out new file mode 100644 index 0000000..2239be8 --- /dev/null +++ b/sql/swh-graph/swh-graph.out @@ -0,0 +1,112 @@ +Timing is on. +create temporary view counts as + select relname as tbl, reltuples::bigint as tuples + from pg_class + where relnamespace = 'public'::regnamespace + and relkind = 'r' + order by relname; +CREATE VIEW +Time: 160,458 ms +create temporary table nodes ( + description text, + tuples bigint not null +); +CREATE TABLE +Time: 35,787 ms +create temporary table edges (like nodes); +CREATE TABLE +Time: 11,709 ms +insert into nodes + select 'content', sum(tuples) + from counts + where tbl in ('content', 'skipped_content'); +INSERT 0 1 +Time: 4,890 ms +insert into nodes + select 'revision', tuples + from counts + where tbl = 'revision'; +INSERT 0 1 +Time: 2,040 ms +insert into nodes + select 'release', tuples + from counts + where tbl = 'release'; +INSERT 0 1 +Time: 7,646 ms +insert into nodes + select 'directory', tuples + from counts + where tbl = 'directory'; +INSERT 0 1 +Time: 64,765 ms +insert into edges + select 'revision->revision', tuples + from counts + where tbl = 'revision_history'; +INSERT 0 1 +Time: 51,195 ms +insert into edges + select 'revision->directory', tuples + from counts + where tbl = 'revision'; +INSERT 0 1 +Time: 42,475 ms +insert into edges + select 'release->revision', tuples + from counts + where tbl = 'release'; +INSERT 0 1 +Time: 21,419 ms +insert into edges + with edges as ( + select sum(coalesce(cardinality(dir_entries), 0)) as dir_edges, + sum(coalesce(cardinality(file_entries), 0)) as file_edges, + sum(coalesce(cardinality(rev_entries), 0)) as rev_edges + from directory tablesample bernoulli (1.0) + ) + select 'directory->directory' as description, + dir_edges * 100 as tuples + from edges + union + select 'directory->file' as description, + file_edges * 100 as tuples + from edges + union + select 'directory->revision' as description, + rev_edges * 100 as tuples + from edges; +INSERT 0 3 +Time: 2081001,866 ms +select * from nodes; + description | tuples +-------------+------------ + content | 2036196307 + revision | 468546912 + release | 4093895 + directory | 1770988032 +(4 rows) + +Time: 14,725 ms +select * from edges; + description | tuples +----------------------+------------- + revision->revision | 483681056 + revision->directory | 468546912 + release->revision | 4093895 + directory->directory | 13324743100 + directory->file | 20863659500 + directory->revision | 132739100 +(6 rows) + +Time: 2,515 ms +select 'nodes' as feat, sum(tuples) as card from nodes +union +select 'edges' as feat, sum(tuples) as card from edges; + feat | card +-------+------------- + edges | 35277463563 + nodes | 4279825146 +(2 rows) + +Time: 36,392 ms diff --git a/sql/swh-graph/swh-graph.sql b/sql/swh-graph/swh-graph.sql new file mode 100644 index 0000000..0b1da02 --- /dev/null +++ b/sql/swh-graph/swh-graph.sql @@ -0,0 +1,76 @@ +\timing + +create temporary view counts as + select relname as tbl, reltuples::bigint as tuples + from pg_class + where relnamespace = 'public'::regnamespace + and relkind = 'r' -- ordinary table; no indexes, sequences, etc. + order by relname; + +create temporary table nodes ( + description text, + tuples bigint not null +); + +create temporary table edges (like nodes); + +insert into nodes + select 'content', sum(tuples) + from counts + where tbl in ('content', 'skipped_content'); + +insert into nodes + select 'revision', tuples + from counts + where tbl = 'revision'; + +insert into nodes + select 'release', tuples + from counts + where tbl = 'release'; + +insert into nodes + select 'directory', tuples + from counts + where tbl = 'directory'; + +insert into edges + select 'revision->revision', tuples + from counts + where tbl = 'revision_history'; + +insert into edges + select 'revision->directory', tuples + from counts + where tbl = 'revision'; + +insert into edges + select 'release->revision', tuples + from counts + where tbl = 'release'; + +insert into edges + with edges as ( + select sum(coalesce(cardinality(dir_entries), 0)) as dir_edges, + sum(coalesce(cardinality(file_entries), 0)) as file_edges, + sum(coalesce(cardinality(rev_entries), 0)) as rev_edges + from directory tablesample bernoulli (1.0) + ) + select 'directory->directory' as description, + dir_edges * 100 as tuples + from edges + union + select 'directory->file' as description, + file_edges * 100 as tuples + from edges + union + select 'directory->revision' as description, + rev_edges * 100 as tuples + from edges; + +select * from nodes; +select * from edges; + +select 'nodes' as feat, sum(tuples) as card from nodes +union +select 'edges' as feat, sum(tuples) as card from edges; diff --git a/sql/swh-graph/swh-graph.txt b/sql/swh-graph/swh-graph.txt new file mode 100644 index 0000000..7780c88 --- /dev/null +++ b/sql/swh-graph/swh-graph.txt @@ -0,0 +1,113 @@ +# -*- mode: org -*- +#+PROPERTY: header-args :tangle yes + +* preamble + view to lookup approximate table counts from postgres statistics + #+begin_src sql + \timing + + create temporary view counts as + select relname as tbl, reltuples::bigint as tuples + from pg_class + where relnamespace = 'public'::regnamespace + and relkind = 'r' -- ordinary table; no indexes, sequences, etc. + order by relname; + + create temporary table nodes ( + description text, + tuples bigint not null + ); + + create temporary table edges (like nodes); + #+end_src +* nodes +** content + content nodes (skipped or not) are the leaves in the graph + #+begin_src sql + insert into nodes + select 'content', sum(tuples) + from counts + where tbl in ('content', 'skipped_content'); + #+end_src +** revision + revision nodes are the inner nodes of history subgraphs + #+begin_src sql + insert into nodes + select 'revision', tuples + from counts + where tbl = 'revision'; + #+end_src +** release + tagged release nodes are graph roots + #+begin_src sql + insert into nodes + select 'release', tuples + from counts + where tbl = 'release'; + #+end_src +** directory + anonymous directory nodes + #+begin_src sql + insert into nodes + select 'directory', tuples + from counts + where tbl = 'directory'; + #+end_src +* edges +** revision +*** revision -> revision + history edges between pairs of revisions + #+begin_src sql + insert into edges + select 'revision->revision', tuples + from counts + where tbl = 'revision_history'; + #+end_src +*** revision -> directory + each revision points to a single directory + #+begin_src sql + insert into edges + select 'revision->directory', tuples + from counts + where tbl = 'revision'; + #+end_src +** release -> revision + one release edge for each release node + #+begin_src sql + insert into edges + select 'release->revision', tuples + from counts + where tbl = 'release'; + #+end_src +** directories -> { directory, file, revision } + one edge for each directory entry (edges pointing to other directories + + edges pointing to contents + edges pointing to revisions) + #+begin_src sql + insert into edges + with edges as ( + select sum(coalesce(cardinality(dir_entries), 0)) as dir_edges, + sum(coalesce(cardinality(file_entries), 0)) as file_edges, + sum(coalesce(cardinality(rev_entries), 0)) as rev_edges + from directory tablesample bernoulli (1.0) + ) + select 'directory->directory' as description, + dir_edges * 100 as tuples + from edges + union + select 'directory->file' as description, + file_edges * 100 as tuples + from edges + union + select 'directory->revision' as description, + rev_edges * 100 as tuples + from edges; + #+end_src +* totals + #+begin_src sql + select * from nodes; + select * from edges; + + select 'nodes' as feat, sum(tuples) as card from nodes + union + select 'edges' as feat, sum(tuples) as card from edges; + #+end_src