diff --git a/sql/swh-graph/README.md b/sql/swh-graph/estimate/README.md similarity index 100% rename from sql/swh-graph/README.md rename to sql/swh-graph/estimate/README.md diff --git a/sql/swh-graph/dump-swh-graph.sql b/sql/swh-graph/estimate/dump-swh-graph.sql similarity index 100% rename from sql/swh-graph/dump-swh-graph.sql rename to sql/swh-graph/estimate/dump-swh-graph.sql diff --git a/sql/swh-graph/swh-graph.out b/sql/swh-graph/estimate/swh-graph.out similarity index 100% rename from sql/swh-graph/swh-graph.out rename to sql/swh-graph/estimate/swh-graph.out diff --git a/sql/swh-graph/swh-graph.sh b/sql/swh-graph/estimate/swh-graph.sh similarity index 100% rename from sql/swh-graph/swh-graph.sh rename to sql/swh-graph/estimate/swh-graph.sh diff --git a/sql/swh-graph/swh-graph.sql b/sql/swh-graph/estimate/swh-graph.sql similarity index 100% rename from sql/swh-graph/swh-graph.sql rename to sql/swh-graph/estimate/swh-graph.sql diff --git a/sql/swh-graph/swh-graph.txt b/sql/swh-graph/estimate/swh-graph.txt similarity index 100% rename from sql/swh-graph/swh-graph.txt rename to sql/swh-graph/estimate/swh-graph.txt diff --git a/sql/swh-graph/export/README.md b/sql/swh-graph/export/README.md new file mode 100644 index 0000000..1abeade --- /dev/null +++ b/sql/swh-graph/export/README.md @@ -0,0 +1,4 @@ +Scripts to dump the content of the Software Heritage archive as a graph. + +Note: only for development use, the generation of CSV files is naive enough +that will explode on the real production DB. diff --git a/sql/swh-graph/export/export-edges.sql b/sql/swh-graph/export/export-edges.sql new file mode 100644 index 0000000..ef22021 --- /dev/null +++ b/sql/swh-graph/export/export-edges.sql @@ -0,0 +1,15 @@ +\copy (SELECT 'swh:1:dir:' || encode(directory.id, 'hex') as source, 'swh:1:cnt:' || encode(target, 'hex') as dest FROM directory CROSS JOIN UNNEST(file_entries) as t(file_id) INNER JOIN directory_entry_file ON directory_entry_file.id = file_id ORDER BY dest) TO STDOUT WITH CSV DELIMITER ' '; + +\copy (SELECT 'swh:1:dir:' || encode(directory.id, 'hex') as source, 'swh:1:dir:' || encode(target, 'hex') as dest FROM directory CROSS JOIN UNNEST(dir_entries) as t(dir_id) INNER JOIN directory_entry_dir ON directory_entry_dir.id = dir_id ORDER BY dest) TO STDOUT WITH CSV DELIMITER ' '; + +\copy (SELECT 'swh:1:dir:' || encode(directory.id, 'hex') as source, 'swh:1:rev:' || encode(target, 'hex') as dest FROM directory CROSS JOIN UNNEST(rev_entries) as t(rev_id) INNER JOIN directory_entry_rev ON directory_entry_rev.id = rev_id ORDER BY dest) TO STDOUT WITH CSV DELIMITER ' '; + +\copy (SELECT 'swh:1:ori:' || encode(digest(url, 'sha1'), 'hex') as source, 'swh:1:snp:' || encode(snapshot.id, 'hex') as dest FROM origin_visit INNER JOIN snapshot on origin_visit.snapshot = snapshot.id INNER JOIN origin on origin_visit.origin = origin.id ORDER BY dest) TO STDOUT WITH CSV DELIMITER ' '; + +\copy (SELECT 'swh:1:rel:' || encode(id, 'hex') as source, 'swh:1:rev:' || encode(target, 'hex') as dest FROM release ORDER BY dest) TO STDOUT WITH CSV DELIMITER ' '; + +\copy (SELECT 'swh:1:rev:' || encode(id, 'hex') as source, 'swh:1:dir:' || encode(directory, 'hex') as dest FROM revision ORDER BY dest) TO STDOUT WITH CSV DELIMITER ' '; + +\copy (SELECT 'swh:1:rev:' || encode(id, 'hex') as source, 'swh:1:rev:' || encode(parent_id, 'hex') as dest FROM revision_history ORDER BY dest) TO STDOUT WITH CSV DELIMITER ' '; + +\copy (SELECT 'swh:1:snp:' || encode(snapshot.id, 'hex') as source, 'swh:1:rev:' || encode(snapshot_branch.target, 'hex') as dest FROM snapshot_branch INNER JOIN snapshot_branches on snapshot_branches.branch_id = snapshot_branch.object_id INNER JOIN snapshot on snapshot_branches.snapshot_id = snapshot.object_id ORDER BY dest) TO STDOUT WITH CSV DELIMITER ' '; diff --git a/sql/swh-graph/export/export-graph.sh b/sql/swh-graph/export/export-graph.sh new file mode 100755 index 0000000..2b9e3d1 --- /dev/null +++ b/sql/swh-graph/export/export-graph.sh @@ -0,0 +1,27 @@ +#!/bin/bash +set -e + +PSQL_CONN="service=softwareheritage-dev" +if [ "$1" = "-h" -o "$1" = "--help" ] ; then + echo "Usage: export-graph.sh [POSTGRES_CONNECTION_STRING]" + exit 1 +elif [ -n "$1" ] ; then + PSQL_CONN="$1" + shift +fi + +tmp_dir=$(mktemp -td swh-graph.XXXXXXXXXX) +trap "rm -rf ${tmp_dir}" EXIT + +fifo="${tmp_dir}/graph.fifo" + +psql "$PSQL_CONN" < export-edges.sql \ + | tee "$fifo" | pigz -c > swh-graph.edges.csv.gz + +tr ' ' '\n' < "$fifo" | sort -u | pigz -c > swh-graph.nodes.csv.gz + +psql "$PSQL_CONN" < export-origins.sql | pigz -c > swh-graph.origins.csv.gz + +echo "All done." +echo "- graph stored in: swh-graph.{nodes,edges}.csv.gz" +echo "- origins stored in: swh-graph.origins.csv.gz" diff --git a/sql/swh-graph/export/export-origins.sql b/sql/swh-graph/export/export-origins.sql new file mode 100644 index 0000000..4b81bf0 --- /dev/null +++ b/sql/swh-graph/export/export-origins.sql @@ -0,0 +1 @@ +\copy (SELECT 'swh:1:ori:' || encode(digest(url, 'sha1'), 'hex') as pid, url FROM origin ORDER BY url) TO STDOUT WITH CSV DELIMITER ' ';