diff --git a/dump.py b/dump.py
index 7703c1f..85bd66f 100755
--- a/dump.py
+++ b/dump.py
@@ -1,129 +1,129 @@
 #!/usr/bin/env python

 import glob
 import io
 import logging
 import os

 import psycopg2

 from swh.model.hashutil import hash_to_hex
 from swh.provenance import get_provenance


 # TODO: take conninfo as command line arguments.
 conninfo = {
     "cls": "local",
     "db": {"host": "/var/run/postgresql", "port": "5436", "dbname": "provenance"},
 }


 def dump(type, hash, time, path="", header="", table=""):
     return f"{str(header).ljust(5)} | {str(table).ljust(5)} | {str(path).ljust(30)} | {type} {hash_to_hex(hash)} | {str(time).rjust(10)}"


 if __name__ == "__main__":
     # Set minimum logging level to INFO.
     logging.getLogger().setLevel(logging.INFO)

-    # Get provenance object for both databases and query its lists of content.
+    # Get provenance object.
     provenance = get_provenance(**conninfo)

     provenance.cursor.execute("""SELECT sha1, date FROM revision ORDER BY date""")
     revisions = list(provenance.cursor.fetchall())

     for idx, (revision, date) in enumerate(revisions):
         # Display current revision information.
         header = f"R{idx:04}"
         timestamp = date.timestamp()
         print(f"{timestamp} {hash_to_hex(revision)} {header}")
         print(dump("R", revision, timestamp, header=header))

         # Display content found early in current revision.
         provenance.cursor.execute(
             """SELECT content.sha1 AS blob,
                       content.date AS date,
                       content_location.path AS path
                FROM (SELECT content_in_rev.blob,
                             location.path
                      FROM (SELECT content_early_in_rev.blob,
                                   content_early_in_rev.loc
                            FROM content_early_in_rev
                            JOIN revision
                            ON revision.id=content_early_in_rev.rev
                            WHERE revision.sha1=%s
                            ) AS content_in_rev
                      JOIN location
                      ON location.id=content_in_rev.loc
                      ) AS content_location
                JOIN content
                ON content.id=content_location.blob
                ORDER BY path""",
             (revision,)
         )
         content = list(provenance.cursor.fetchall())

         for blob, date, path in content:
             delta = date.timestamp() - timestamp
             location = os.fsdecode(path)
             print(dump("C", blob, delta, path=location, table="R---C"))

         # Display isochrone frontiers found in current revision.
         provenance.cursor.execute(
             """SELECT directory.sha1 AS dir,
                       directory.date AS date,
                       directory_location.path AS path
                FROM (SELECT isochrone_frontier.dir,
                             location.path
                      FROM (SELECT directory_in_rev.dir,
                                   directory_in_rev.loc
                            FROM directory_in_rev
                            JOIN revision
                            ON revision.id=directory_in_rev.rev
                            WHERE revision.sha1=%s
                            ) AS isochrone_frontier
                      JOIN location
                      ON location.id=isochrone_frontier.loc
                      ) AS directory_location
                JOIN directory
                ON directory.id=directory_location.dir
                ORDER BY path""",
             (revision,)
         )
         directories = list(provenance.cursor.fetchall())

         for directory, date, path in directories:
             delta = date.timestamp() - timestamp
             location = os.fsdecode(path) + "/"
             if location == "/":
                 location = "./"
             print(dump("D", directory, delta, path=location, table="R-D "))

             # Display content found outside the current isochrone frontier.
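             # (These are the blobs reached through the frontier directory
             # printed just above; their paths are relative to that directory,
             # hence the " + " prefix when they are dumped below.)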
             provenance.cursor.execute(
                 """SELECT content.sha1 AS blob,
                           content.date AS date,
                           content_location.path AS path
                    FROM (SELECT content_outside.blob,
                                 location.path
                          FROM (SELECT content_in_dir.blob,
                                       content_in_dir.loc
                                FROM content_in_dir
                                JOIN directory
                                ON directory.id=content_in_dir.dir
                                WHERE directory.sha1=%s
                                ) AS content_outside
                          JOIN location
                          ON location.id=content_outside.loc
                          ) AS content_location
                    JOIN content
                    ON content.id=content_location.blob
                    ORDER BY path""",
                 (directory,)
             )
             content = list(provenance.cursor.fetchall())

             for blob, date, path in content:
                 delta = date.timestamp() - timestamp
                 location = " + " + os.fsdecode(path)
                 print(dump("C", blob, delta, path=location, table=" D-C"))

         print("")
diff --git a/find-blob.py b/find-blob.py
index ed5435a..2865f3c 100755
--- a/find-blob.py
+++ b/find-blob.py
@@ -1,66 +1,66 @@
 #!/usr/bin/env python

 import logging
 import os
 import sys

 from swh.model.cli import identify_object
 from swh.model.hashutil import hash_to_bytes, hash_to_hex
 from swh.provenance import get_provenance


 # TODO: take conninfo as command line arguments.
 conninfo = {
     "cls": "local",
     "db": {"host": "/var/run/postgresql", "port": "5436", "dbname": "provenance"},
 }


 if __name__ == "__main__":
     # Set minimum logging level to INFO.
     logging.getLogger().setLevel(logging.INFO)

     if len(sys.argv) < 2:
         print("usage: find-blob <file> [limit]")
         exit(-1)

-    # Get provenance object for both databases and query its lists of content.
+    # Get provenance object.
     provenance = get_provenance(**conninfo)

     obj, swhid = identify_object("content", True, True, sys.argv[1])
     sha1 = hash_to_bytes(swhid.split(":")[-1])
     print(f"Identifier of object {obj}: {swhid}")

     limit = int(sys.argv[2]) if len(sys.argv) > 2 else None

     first = provenance.content_find_first(sha1)
     if first is not None:
         print("===============================================================================")
         print(f"First occurrence of {obj}:")
         print(
             "    content: swh:1:cnt:{cnt}, revision: swh:1:rev:{rev}, date: {date}, location: {path}".format(
                 cnt=hash_to_hex(first[0]),
                 rev=hash_to_hex(first[1]),
                 date=first[2],
                 path=os.fsdecode(first[3]),
             )
         )
         print("===============================================================================")
         if limit is None:
             print(f"All occurrences of {obj}:")
         else:
             print(f"First {limit} occurrences of {obj}:")
         for occur in provenance.content_find_all(sha1, limit=limit):
             print(
                 "    content: swh:1:cnt:{cnt}, revision: swh:1:rev:{rev}, date: {date}, location: {path}".format(
                     cnt=hash_to_hex(occur[0]),
                     rev=hash_to_hex(occur[1]),
                     date=occur[2],
                     path=os.fsdecode(occur[3]),
                 )
             )
     else:
         logging.warning("Requested content not available in the provenance database.")
diff --git a/histogram.py b/histogram.py
index 3e78e2a..c44d2ee 100755
--- a/histogram.py
+++ b/histogram.py
@@ -1,43 +1,44 @@
 #!/usr/bin/env python

 import io
-import os
-
-from collections import Counter

 from swh.provenance import get_provenance


 # TODO: take conninfo as command line arguments.
 conninfo = {
     "cls": "local",
     "db": {"host": "/var/run/postgresql", "port": "5436", "dbname": "provenance"},
 }


 if __name__ == "__main__":
-    # Get provenance object for both databases and query its lists of content.
+    # Get provenance object.
     provenance = get_provenance(**conninfo)

     tables = ["directory_in_rev", "content_in_dir"]
     for table in tables:
-        provenance.cursor.execute(f"""SELECT depths.depth, COUNT(depths.depth)
-            FROM (SELECT
-                CASE location.path
-                    WHEN '' THEN 0
-                    WHEN '.'
-                        THEN 0
-                    ELSE 1 + CHAR_LENGTH(ENCODE(location.path, 'escape')) -
-                        CHAR_LENGTH(REPLACE(ENCODE(location.path, 'escape'), '/', ''))
-                END AS depth
-            FROM {table}
-            JOIN location
-                ON {table}.loc=location.id
-            ) AS depths
-            GROUP BY depths.depth
-            ORDER BY depths.depth""")
+        provenance.cursor.execute(f"""
+            SELECT depths.depth, COUNT(depths.depth)
+            FROM (SELECT
+                CASE location.path
+                    WHEN '' THEN 0
+                    WHEN '.' THEN 0
+                    ELSE 1 + CHAR_LENGTH(ENCODE(location.path, 'escape')) -
+                        CHAR_LENGTH(REPLACE(ENCODE(location.path, 'escape'), '/', ''))
+                END AS depth
+            FROM {table}
+            JOIN location
+                ON {table}.loc=location.id
+            ) AS depths
+            GROUP BY depths.depth
+            ORDER BY depths.depth
+        """)
+
+        filename = "depths_" + conninfo["db"]["dbname"] + f"_{table}.csv"
+        with io.open(filename, "w") as outfile:
             outfile.write(f"{table} depth,{table} count\n")
             for depth, count in provenance.cursor.fetchall():
                 outfile.write(f"{depth},{count}\n")
diff --git a/metrics.py b/metrics.py
index 49e95fa..c58fb33 100755
--- a/metrics.py
+++ b/metrics.py
@@ -1,158 +1,121 @@
 #!/usr/bin/env python

 import io
 import json
 import os
 import sys

 from swh.model.hashutil import hash_to_hex
 from swh.provenance import get_provenance
 from swh.provenance.provenance import ProvenanceInterface


 # TODO: take conninfo as command line arguments.
 conninfo = {
     "cls": "local",
     "db": {"host": "/var/run/postgresql", "port": "5436", "dbname": "provenance"},
 }


 def get_tables_stats(provenance: ProvenanceInterface):
     tables = {
         "content": dict(),
         "content_early_in_rev": dict(),
         "content_in_dir": dict(),
         "directory": dict(),
         "directory_in_rev": dict(),
         "location": dict(),
         "revision": dict()
     }

     for table in tables:
         provenance.cursor.execute(f"SELECT COUNT(*) FROM {table}")
         tables[table]["row_count"] = provenance.cursor.fetchone()[0]

         provenance.cursor.execute(f"SELECT pg_table_size('{table}')")
         tables[table]["table_size"] = provenance.cursor.fetchone()[0]

         provenance.cursor.execute(f"SELECT pg_indexes_size('{table}')")
         tables[table]["indexes_size"] = provenance.cursor.fetchone()[0]

         # provenance.cursor.execute(f"SELECT pg_total_relation_size('{table}')")
         # relation_size[table] = provenance.cursor.fetchone()[0]
         tables[table]["relation_size"] = tables[table]["table_size"] + tables[table]["indexes_size"]

     return tables


 if __name__ == "__main__":
-    # Get provenance object for both databases and query its lists of content.
+    # Get provenance object.
     provenance = get_provenance(**conninfo)

     tables = get_tables_stats(provenance)

     for table in tables:
         row_count = tables[table]["row_count"]
         table_size = tables[table]["table_size"]
         indexes_size = tables[table]["indexes_size"]
         relation_size = tables[table]["relation_size"]

         print(f"{table}:")
         print(f"    total rows: {row_count}")

         if row_count == 0:
             row_count = 1
         print(f"    table size: {table_size} bytes ({table_size / row_count:.2f} per row)")
         print(f"    index size: {indexes_size} bytes ({indexes_size / row_count:.2f} per row)")
         print(f"    total size: {relation_size} bytes ({relation_size / row_count:.2f} per row)")

     # Ratios between the different entities/relations.
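     # (A zero row count is replaced by one in the divisors below only to
     # avoid division by zero; each ratio is a rough archive-wide average,
     # e.g. content/revision relates distinct blobs to revisions.)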
print("ratios:") print(f" content/revision: {tables['content']['row_count'] / (tables['revision']['row_count'] if tables['revision']['row_count'] != 0 else 1):.2f}") print(f" content_early_in_rev/content: {tables['content_early_in_rev']['row_count'] / (tables['content']['row_count'] if tables['content']['row_count'] != 0 else 1):.2f}") print(f" content_in_dir/content: {tables['content_in_dir']['row_count'] / (tables['content']['row_count'] if tables['content']['row_count'] != 0 else 1):.2f}") print(f" directory/revision: {tables['directory']['row_count'] / (tables['revision']['row_count'] if tables['revision']['row_count'] != 0 else 1):.2f}") print(f" directory_in_rev/directory: {tables['directory_in_rev']['row_count'] / (tables['directory']['row_count'] if tables['directory']['row_count'] != 0 else 1):.2f}") print(f" ==============================") print(f" content_early_in_rev/revision: {tables['content_early_in_rev']['row_count'] / (tables['revision']['row_count'] if tables['revision']['row_count'] != 0 else 1):.2f}") print(f" content_in_dir/directory: {tables['content_in_dir']['row_count'] / (tables['directory']['row_count'] if tables['directory']['row_count'] != 0 else 1):.2f}") print(f" directory_in_rev/revision: {tables['directory_in_rev']['row_count'] / (tables['revision']['row_count'] if tables['revision']['row_count'] != 0 else 1):.2f}") # Metrics for frontiers defined in root directories. provenance.cursor.execute(f"""SELECT dir FROM directory_in_rev INNER JOIN location ON loc=location.id WHERE location.path=%s""", (b"",)) directories = list(provenance.cursor.fetchall()) print(f"Total root frontiers used: {len(directories)}") provenance.cursor.execute(f"""SELECT dir FROM directory_in_rev INNER JOIN location ON loc=location.id WHERE location.path=%s GROUP BY dir""", (b"",)) directories = list(provenance.cursor.fetchall()) print(f"Total distinct root frontiers: {len(directories)}") provenance.cursor.execute(f"""SELECT roots.dir FROM (SELECT dir, loc FROM directory_in_rev INNER JOIN location ON loc=location.id WHERE location.path=%s) AS roots JOIN directory_in_rev ON directory_in_rev.dir=roots.dir WHERE directory_in_rev.loc!=roots.loc""", (b"",)) directories = list(provenance.cursor.fetchall()) print(f"Total other uses of these frontiers: {len(directories)}") provenance.cursor.execute(f"""SELECT roots.dir FROM (SELECT dir, loc FROM directory_in_rev INNER JOIN location ON loc=location.id WHERE location.path=%s) AS roots JOIN directory_in_rev ON directory_in_rev.dir=roots.dir WHERE directory_in_rev.loc!=roots.loc GROUP BY roots.dir""", (b"",)) directories = list(provenance.cursor.fetchall()) print(f"Total distinct other uses of frontiers: {len(directories)}") - - # provenance.cursor.execute(f"""SELECT location.path - # FROM directory_in_rev - # JOIN location - # ON directory_in_rev.loc=location.id""") - # depths = list(map(lambda row: os.fsdecode(row[0]).count('/'), provenance.cursor.fetchall())) - # with io.open("directory_in_rev.json", "w") as outfile: - # outfile.write(json.dumps(depths)) - - # provenance.cursor.execute(f"""SELECT location.path - # FROM content_in_dir - # JOIN location - # ON content_in_dir.loc=location.id""") - # depths = list(map(lambda row: os.fsdecode(row[0]).count('/'), provenance.cursor.fetchall())) - # with io.open("content_in_dir.json", "w") as outfile: - # outfile.write(json.dumps(depths)) - - - - - -# Query the 'limit' most common files inside any isochrone frontier. 
-# f"SELECT blob, COUNT(blob) AS occur FROM content_early_in_rev GROUP BY blob ORDER BY occur DESC LIMIT {limit}" - -# Query the 'limit' most common files outside any isochrone frontier. -# f"SELECT blob, COUNT(blob) AS occur FROM content_in_dir GROUP BY blob ORDER BY occur DESC LIMIT {limit}" -# blob 141557 | occur 34610802 - -# f"SELECT dir FROM directory_in_rev INNER JOIN location ON loc=location.id WHERE location.path=%s" - -# f"SELECT blob, COUNT(blob) AS occur FROM content_in_dir GROUP BY blob ORDER BY occur DESC LIMIT {limit}" - -# f"SELECT depth, COUNT(depth) AS occur FROM (SELECT ARRAY_LENGTH(STRING_TO_ARRAY(path, '/'), 1) - 1 AS depth FROM location) GROUP BY depth ORDER BY occur ASC" - -# f"SELECT path FROM location JOIN content_in_dir ON location.id=content_in_dir.loc WHERE blob=%s GROUP BY path" -# f"SELECT ENCODE(location.path::bytea, 'escape'), COUNT(*) FROM content_in_dir INNER JOIN location ON loc=location.id WHERE blob=%s GROUP BY 1 ORDER BY 2 DESC" -# f"SELECT ENCODE(sha1::bytea, 'escape') FROM content WHERE id=%s"