diff --git a/count.py b/count.py new file mode 100755 index 0000000..4e0a4dd --- /dev/null +++ b/count.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python + +import sys + + +def linecount(filename): + count = 0 + for line in open(filename).xreadlines(): + count += 1 + return count + + +if __name__ == "__main__": + if len(sys.argv) != 2: + print('usage: count ') + exit(-1) + + print(linecount(sys.argv[1])) diff --git a/metrics.py b/metrics.py new file mode 100755 index 0000000..3eb842e --- /dev/null +++ b/metrics.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python + +import io +import sys + +from swh.provenance import get_provenance + + +# TODO: take conninfo as command line arguments. +conninfo = { + "cls": "ps", + "db": {"host": "/var/run/postgresql", "port": "5436", "dbname": "ordered"}, +} + + +if __name__ == "__main__": + # Get provenance object for both databases and query its lists of content. + provenance = get_provenance(**conninfo) + + tables = [ + "content", + "content_early_in_rev", + "content_in_dir", + "directory", + "directory_in_rev", + "location", + "revision" + ] + + row_count = {} + table_size = {} + indexes_size = {} + relation_size = {} + for table in tables: + provenance.cursor.execute(f"SELECT COUNT(*) FROM {table}") + row_count[table] = provenance.cursor.fetchone()[0] + + provenance.cursor.execute(f"SELECT pg_table_size('{table}')") + table_size[table] = provenance.cursor.fetchone()[0] + + provenance.cursor.execute(f"SELECT pg_indexes_size('{table}')") + indexes_size[table] = provenance.cursor.fetchone()[0] + + # provenance.cursor.execute(f"SELECT pg_total_relation_size('{table}')") + # relation_size[table] = provenance.cursor.fetchone()[0] + relation_size[table] = table_size[table] + indexes_size[table] + + print(f"{table}:") + print(f" total rows: {row_count[table]}") + print(f" table size: {table_size[table]} bytes ({table_size[table] / row_count[table]:.2f} per row)") + print(f" index size: {indexes_size[table]} bytes ({indexes_size[table] / row_count[table]:.2f} per row)") + print(f" total size: {relation_size[table]} bytes ({relation_size[table] / row_count[table]:.2f} per row)") + + print("ratios:") + print(f" content/revision: {row_count['content'] / row_count['revision']:.2f}") + print(f" content_early_in_rev/content: {row_count['content_early_in_rev'] / row_count['content']:.2f}") + print(f" directory/revision: {row_count['directory'] / row_count['revision']:.2f}") + print(f" content_in_dir/directory: {row_count['content_in_dir'] / row_count['directory']:.2f}") + print(f" directory_in_rev/revision: {row_count['directory_in_rev'] / row_count['revision']:.2f}") diff --git a/revisions_format.py b/revisions_format.py new file mode 100755 index 0000000..f193963 --- /dev/null +++ b/revisions_format.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python + +import io +import json +import sys + +from swh.model.hashutil import hash_to_hex, hash_to_bytes +from swh.provenance.postgresql.db_utils import connect + + +conninfo = { + "host": "db.internal.softwareheritage.org", + "dbname": "softwareheritage", + "user": "guest" +} +# conninfo = 'postgresql://guest@db.internal.softwareheritage.org/softwareheritage' + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print('usage: revisions_format ') + exit(-1) + + print(f'Connection to database: {conninfo}...') + conn = connect(conninfo) + + infilename = sys.argv[1] + outfilename = sys.argv[2] + + with io.open(infilename) as infile: + with io.open(outfilename, 'w') as outfile: + ids = json.loads(infile.read()) + print(f"Formatting {len(ids)} revisions") + for id in ids: + cursor = conn.cursor() + cursor.execute('''SELECT id, date, directory FROM revision + WHERE id=%s AND date IS NOT NULL''', + (hash_to_bytes(id),)) + rev = cursor.fetchone() + assert rev is not None + outfile.write(f'{hash_to_hex(rev[0])},{rev[1]},{hash_to_hex(rev[2])}\n') diff --git a/revisions_pick.py b/revisions_pick.py new file mode 100755 index 0000000..e6d38a6 --- /dev/null +++ b/revisions_pick.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python + +import io +import sys + +from swh.model.hashutil import hash_to_hex +from swh.provenance.postgresql.db_utils import connect + + +conninfo = { + "host": "db.internal.softwareheritage.org", + "dbname": "softwareheritage", + "user": "guest" +} +# conninfo = 'postgresql://guest@db.internal.softwareheritage.org/softwareheritage' + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print('usage: listrevs ') + exit(-1) + + count = int(sys.argv[1]) + filename = sys.argv[2] + + print(f'Connection to database: {conninfo}...') + conn = connect(conninfo) + + low = b'\x0150352e5a43c5b9368990e1dfe0c1510f86de73' + high = b'\xffecf10c8c0106a8d66718e29aa6604df441704e' + limit = count + + revcur = conn.cursor() + revcur.execute('''SELECT id FROM revision + WHERE id BETWEEN %s AND %s LIMIT %s''', + (low, high, limit)) + + ids = [] + for revision in revcur.fetchall(): + ids.append(revision) + + parcur = conn.cursor() + parcur.execute('''SELECT parent_id FROM revision_history + WHERE id=%s''', + (revision)) + ids.extend(parcur.fetchall()) + + # Remove duplicates + ids = list(dict().fromkeys(ids)) + print(f"Found {len(ids)} distinct revisions.") + + revcur.execute('''SELECT id, date, directory FROM revision + WHERE id IN %s AND date IS NOT NULL''', + (tuple(ids),)) + revisions = list(revcur.fetchall()) + revisions.sort(key=lambda rev: rev[1]) + assert len(revisions) >= count + + print(f"Filtering first {count}.") + with io.open(filename, 'w') as outfile: + for rev in revisions[:count]: + outfile.write(f'{hash_to_hex(rev[0])},{rev[1]},{hash_to_hex(rev[2])}\n')