diff --git a/sql/swh-graph/swh-graph.out b/sql/swh-graph/swh-graph.out index f588437..aa0c636 100644 --- a/sql/swh-graph/swh-graph.out +++ b/sql/swh-graph/swh-graph.out @@ -1,50 +1,129 @@ +-- set bytea_output = 'escape'; +\timing Timing is on. +create temporary view counts as + select object_type as tbl, value as tuples + from object_counts + order by tbl; CREATE VIEW -Time: 41,763 ms +Time: 11,486 ms +create temporary table nodes ( + description text, + tuples bigint not null +); CREATE TABLE -Time: 8,410 ms +Time: 6,057 ms +create temporary table edges (like nodes); CREATE TABLE -Time: 5,473 ms +Time: 5,650 ms +insert into nodes + select 'content', sum(tuples) + from counts + where tbl in ('content', 'skipped_content'); INSERT 0 1 -Time: 9,010 ms +Time: 5,714 ms +insert into nodes + select 'revision', tuples + from counts + where tbl = 'revision'; INSERT 0 1 -Time: 3,693 ms +Time: 3,652 ms +insert into nodes + select 'release', tuples + from counts + where tbl = 'release'; INSERT 0 1 -Time: 3,602 ms +Time: 3,219 ms +insert into nodes + select 'directory', tuples + from counts + where tbl = 'directory'; INSERT 0 1 -Time: 3,915 ms +Time: 3,470 ms +insert into nodes + select 'snapshot', tuples + from counts + where tbl = 'snapshot'; INSERT 0 1 -Time: 3,866 ms +Time: 4,228 ms +insert into edges + select 'revision->revision', tuples + from counts + where tbl = 'revision_history'; +INSERT 0 0 +Time: 3,399 ms +insert into edges + select 'revision->directory', tuples + from counts + where tbl = 'revision'; INSERT 0 1 -Time: 3,891 ms +Time: 3,490 ms +insert into edges + select 'release->revision', tuples + from counts + where tbl = 'release'; INSERT 0 1 -Time: 4,210 ms +Time: 3,594 ms +insert into edges + select 'snapshot->' || snapshot_branch.target_type as target_type, count(*) * 1000 + from snapshot tablesample system(0.1) + inner join snapshot_branches on snapshot.object_id = snapshot_branches.snapshot_id + inner join snapshot_branch on snapshot_branches.branch_id = snapshot_branch.object_id + group by snapshot_branch.target_type; INSERT 0 3 -Time: 1438562,509 ms +Time: 243080,038 ms (04:03,080) +insert into edges + with edges as ( + select sum(coalesce(cardinality(dir_entries), 0)) as dir_edges, + sum(coalesce(cardinality(file_entries), 0)) as file_edges, + sum(coalesce(cardinality(rev_entries), 0)) as rev_edges + from directory tablesample system (0.1) + ) + select 'directory->directory' as description, + dir_edges * 1000 as tuples + from edges + union + select 'directory->file' as description, + file_edges * 1000 as tuples + from edges + union + select 'directory->revision' as description, + rev_edges * 1000 as tuples + from edges; +INSERT 0 3 +Time: 32650,563 ms (00:32,651) +select * from nodes; description | tuples -------------+------------ - content | 3263018166 - revision | 757823872 - release | 5849477 - directory | 2812096000 -(4 rows) + content | 4586303051 + revision | 1039856865 + release | 7713653 + directory | 4020639929 + snapshot | 55410353 +(5 rows) -Time: 4,672 ms +Time: 3,904 ms +select * from edges; description | tuples ----------------------+------------- - revision->revision | 766975616 - revision->directory | 757823872 - release->revision | 5849477 - directory->directory | 22268443900 - directory->file | 33603252100 - directory->revision | 179751200 -(6 rows) + revision->directory | 1039856865 + release->revision | 7713653 + snapshot->release | 223180000 + snapshot->revision | 702870000 + snapshot->directory | 5000 + directory->directory | 38411104000 + directory->revision | 285837000 + directory->file | 80660680000 +(8 rows) -Time: 4,885 ms - feat | card --------+------------- - edges | 57582096165 - nodes | 6838787515 +Time: 3,439 ms +select 'nodes' as feat, sum(tuples) as card from nodes +union +select 'edges' as feat, sum(tuples) as card from edges; + feat | card +-------+-------------- + edges | 121331246518 + nodes | 9709923851 (2 rows) -Time: 6,577 ms +Time: 4,174 ms diff --git a/sql/swh-graph/swh-graph.sql b/sql/swh-graph/swh-graph.sql index 0b1da02..4f7ba5f 100644 --- a/sql/swh-graph/swh-graph.sql +++ b/sql/swh-graph/swh-graph.sql @@ -1,76 +1,86 @@ \timing create temporary view counts as - select relname as tbl, reltuples::bigint as tuples - from pg_class - where relnamespace = 'public'::regnamespace - and relkind = 'r' -- ordinary table; no indexes, sequences, etc. - order by relname; + select object_type as tbl, value as tuples + from object_counts + order by tbl; create temporary table nodes ( description text, tuples bigint not null ); create temporary table edges (like nodes); insert into nodes select 'content', sum(tuples) from counts where tbl in ('content', 'skipped_content'); insert into nodes select 'revision', tuples from counts where tbl = 'revision'; insert into nodes select 'release', tuples from counts where tbl = 'release'; insert into nodes select 'directory', tuples from counts where tbl = 'directory'; +insert into nodes + select 'snapshot', tuples + from counts + where tbl = 'snapshot'; + insert into edges select 'revision->revision', tuples from counts where tbl = 'revision_history'; insert into edges select 'revision->directory', tuples from counts where tbl = 'revision'; insert into edges select 'release->revision', tuples from counts where tbl = 'release'; +insert into edges + select 'snapshot->' || snapshot_branch.target_type as target_type, count(*) * 1000 + from snapshot tablesample system(0.1) + inner join snapshot_branches on snapshot.object_id = snapshot_branches.snapshot_id + inner join snapshot_branch on snapshot_branches.branch_id = snapshot_branch.object_id + group by snapshot_branch.target_type; + insert into edges with edges as ( select sum(coalesce(cardinality(dir_entries), 0)) as dir_edges, sum(coalesce(cardinality(file_entries), 0)) as file_edges, sum(coalesce(cardinality(rev_entries), 0)) as rev_edges - from directory tablesample bernoulli (1.0) + from directory tablesample system (0.1) ) select 'directory->directory' as description, - dir_edges * 100 as tuples + dir_edges * 1000 as tuples from edges union select 'directory->file' as description, - file_edges * 100 as tuples + file_edges * 1000 as tuples from edges union select 'directory->revision' as description, - rev_edges * 100 as tuples + rev_edges * 1000 as tuples from edges; select * from nodes; select * from edges; select 'nodes' as feat, sum(tuples) as card from nodes union select 'edges' as feat, sum(tuples) as card from edges; diff --git a/sql/swh-graph/swh-graph.txt b/sql/swh-graph/swh-graph.txt index 7780c88..b875c6c 100644 --- a/sql/swh-graph/swh-graph.txt +++ b/sql/swh-graph/swh-graph.txt @@ -1,113 +1,128 @@ # -*- mode: org -*- #+PROPERTY: header-args :tangle yes * preamble - view to lookup approximate table counts from postgres statistics + view to lookup precise table counts from Software Heritage object counts #+begin_src sql \timing create temporary view counts as - select relname as tbl, reltuples::bigint as tuples - from pg_class - where relnamespace = 'public'::regnamespace - and relkind = 'r' -- ordinary table; no indexes, sequences, etc. - order by relname; + select object_type as tbl, value as tuples + from object_counts + order by tbl; create temporary table nodes ( description text, tuples bigint not null ); create temporary table edges (like nodes); #+end_src * nodes ** content content nodes (skipped or not) are the leaves in the graph #+begin_src sql insert into nodes select 'content', sum(tuples) from counts where tbl in ('content', 'skipped_content'); #+end_src ** revision revision nodes are the inner nodes of history subgraphs #+begin_src sql insert into nodes select 'revision', tuples from counts where tbl = 'revision'; #+end_src ** release tagged release nodes are graph roots #+begin_src sql insert into nodes select 'release', tuples from counts where tbl = 'release'; #+end_src ** directory anonymous directory nodes #+begin_src sql insert into nodes select 'directory', tuples from counts where tbl = 'directory'; #+end_src +** snapshot + snapshot nodes are graph entry points + #+begin_src sql + insert into nodes + select 'snapshot', tuples + from counts + where tbl = 'snapshot'; + #+end_src * edges ** revision *** revision -> revision history edges between pairs of revisions #+begin_src sql insert into edges select 'revision->revision', tuples from counts where tbl = 'revision_history'; #+end_src *** revision -> directory each revision points to a single directory #+begin_src sql insert into edges select 'revision->directory', tuples from counts where tbl = 'revision'; #+end_src ** release -> revision one release edge for each release node #+begin_src sql insert into edges select 'release->revision', tuples from counts where tbl = 'release'; #+end_src -** directories -> { directory, file, revision } +** snapshot -> { revision, release, etc. } :approximate: + #+begin_src sql + insert into edges + select 'snapshot->' || snapshot_branch.target_type as target_type, count(*) * 1000 + from snapshot tablesample system(0.1) + inner join snapshot_branches on snapshot.object_id = snapshot_branches.snapshot_id + inner join snapshot_branch on snapshot_branches.branch_id = snapshot_branch.object_id + group by snapshot_branch.target_type; + #+end_src +** directories -> { directory, file, revision } :approximate: one edge for each directory entry (edges pointing to other directories + edges pointing to contents + edges pointing to revisions) #+begin_src sql insert into edges with edges as ( select sum(coalesce(cardinality(dir_entries), 0)) as dir_edges, sum(coalesce(cardinality(file_entries), 0)) as file_edges, sum(coalesce(cardinality(rev_entries), 0)) as rev_edges - from directory tablesample bernoulli (1.0) + from directory tablesample system (0.1) ) select 'directory->directory' as description, - dir_edges * 100 as tuples + dir_edges * 1000 as tuples from edges union select 'directory->file' as description, - file_edges * 100 as tuples + file_edges * 1000 as tuples from edges union select 'directory->revision' as description, - rev_edges * 100 as tuples + rev_edges * 1000 as tuples from edges; #+end_src * totals #+begin_src sql select * from nodes; select * from edges; select 'nodes' as feat, sum(tuples) as card from nodes union select 'edges' as feat, sum(tuples) as card from edges; #+end_src