diff --git a/dump.py b/dump.py index d00a2ff..530770e 100755 --- a/dump.py +++ b/dump.py @@ -1,137 +1,137 @@ #!/usr/bin/env python import logging import os from swh.model.hashutil import hash_to_hex from swh.provenance import get_provenance from swh.provenance.postgresql.provenance import ProvenanceStoragePostgreSql # TODO: take conninfo as command line arguments. conninfo = { "cls": "local", "db": {"host": "/var/run/postgresql", "port": "5436", "dbname": "provenance"}, } def dump(type, hash, time, path="", header="", table=""): return ( f"{str(header).ljust(5)} | " f"{str(table).ljust(5)} | " f"{str(path).ljust(30)} | " f"{type} {hash_to_hex(hash)} | " f"{str(time).rjust(10)}" ) if __name__ == "__main__": # Set minimum logging level to INFO. logging.getLogger().setLevel(logging.INFO) # Get provenance object. - provenance = get_provenance(**conninfo) - # TODO: use ProvenanceStorageInterface instead! - assert isinstance(provenance.storage, ProvenanceStoragePostgreSql) - - with provenance.storage.transaction() as cursor: - cursor.execute("""SELECT sha1, date FROM revision ORDER BY date""") - revisions = list(cursor.fetchall()) - - for idx, (revision, date) in enumerate(revisions): - # Display current revision information. - header = f"R{idx:04}" - timestamp = date.timestamp() - print(f"{timestamp} {hash_to_hex(revision)} {header}") - print(dump("R", revision, timestamp, header=header)) - - # Display content found early in current revision. + with get_provenance(**conninfo) as provenance: + # TODO: use ProvenanceStorageInterface instead! + assert isinstance(provenance.storage, ProvenanceStoragePostgreSql) + with provenance.storage.transaction() as cursor: - cursor.execute( - """SELECT content.sha1 AS blob, - content.date AS date, - content_location.path AS path - FROM (SELECT content_in_rev.blob, - location.path - FROM (SELECT content_early_in_rev.blob, - content_early_in_rev.loc - FROM content_early_in_rev - JOIN revision - ON revision.id=content_early_in_rev.rev - WHERE revision.sha1=%s - ) AS content_in_rev - JOIN location - ON location.id=content_in_rev.loc - ) AS content_location - JOIN content - ON content.id=content_location.blob - ORDER BY path""", - (revision,), - ) - content = list(cursor.fetchall()) - - for blob, date, path in content: - delta = date.timestamp() - timestamp - location = os.fsdecode(path) - print(dump("C", blob, delta, path=location, table="R---C")) - - # Display isochrone frontiers found in current revision. - cursor.execute( - """SELECT directory.sha1 AS dir, - directory.date AS date, - directory_location.path AS path - FROM (SELECT isochrone_frontier.dir, - location.path - FROM (SELECT directory_in_rev.dir, - directory_in_rev.loc - FROM directory_in_rev - JOIN revision - ON revision.id=directory_in_rev.rev - WHERE revision.sha1=%s - ) AS isochrone_frontier - JOIN location - ON location.id=isochrone_frontier.loc - ) AS directory_location - JOIN directory - ON directory.id=directory_location.dir - ORDER BY path""", - (revision,), - ) - directories = list(cursor.fetchall()) - - for directory, date, path in directories: - delta = date.timestamp() - timestamp - location = os.fsdecode(path) + "/" - if location == "/": - location = "./" - print(dump("D", directory, delta, path=location, table="R-D ")) - - # Display content found outside the current isochrone frontier. + cursor.execute("""SELECT sha1, date FROM revision ORDER BY date""") + revisions = list(cursor.fetchall()) + + for idx, (revision, date) in enumerate(revisions): + # Display current revision information. + header = f"R{idx:04}" + timestamp = date.timestamp() + print(f"{timestamp} {hash_to_hex(revision)} {header}") + print(dump("R", revision, timestamp, header=header)) + + # Display content found early in current revision. + with provenance.storage.transaction() as cursor: cursor.execute( """SELECT content.sha1 AS blob, content.date AS date, content_location.path AS path - FROM (SELECT content_outside.blob, - location.path - FROM (SELECT content_in_dir.blob, - content_in_dir.loc - FROM content_in_dir - JOIN directory - ON directory.id=content_in_dir.dir - WHERE directory.sha1=%s - ) AS content_outside + FROM (SELECT content_in_rev.blob, + location.path + FROM (SELECT content_early_in_rev.blob, + content_early_in_rev.loc + FROM content_early_in_rev + JOIN revision + ON revision.id=content_early_in_rev.rev + WHERE revision.sha1=%s + ) AS content_in_rev JOIN location - ON location.id=content_outside.loc + ON location.id=content_in_rev.loc ) AS content_location JOIN content ON content.id=content_location.blob ORDER BY path""", - (directory,), + (revision,), ) content = list(cursor.fetchall()) for blob, date, path in content: delta = date.timestamp() - timestamp - location = " + " + os.fsdecode(path) - print(dump("C", blob, delta, path=location, table=" D-C")) + location = os.fsdecode(path) + print(dump("C", blob, delta, path=location, table="R---C")) - print("") + # Display isochrone frontiers found in current revision. + cursor.execute( + """SELECT directory.sha1 AS dir, + directory.date AS date, + directory_location.path AS path + FROM (SELECT isochrone_frontier.dir, + location.path + FROM (SELECT directory_in_rev.dir, + directory_in_rev.loc + FROM directory_in_rev + JOIN revision + ON revision.id=directory_in_rev.rev + WHERE revision.sha1=%s + ) AS isochrone_frontier + JOIN location + ON location.id=isochrone_frontier.loc + ) AS directory_location + JOIN directory + ON directory.id=directory_location.dir + ORDER BY path""", + (revision,), + ) + directories = list(cursor.fetchall()) + + for directory, date, path in directories: + delta = date.timestamp() - timestamp + location = os.fsdecode(path) + "/" + if location == "/": + location = "./" + print(dump("D", directory, delta, path=location, table="R-D ")) + + # Display content found outside the current isochrone frontier. + cursor.execute( + """SELECT content.sha1 AS blob, + content.date AS date, + content_location.path AS path + FROM (SELECT content_outside.blob, + location.path + FROM (SELECT content_in_dir.blob, + content_in_dir.loc + FROM content_in_dir + JOIN directory + ON directory.id=content_in_dir.dir + WHERE directory.sha1=%s + ) AS content_outside + JOIN location + ON location.id=content_outside.loc + ) AS content_location + JOIN content + ON content.id=content_location.blob + ORDER BY path""", + (directory,), + ) + content = list(cursor.fetchall()) + + for blob, date, path in content: + delta = date.timestamp() - timestamp + location = " + " + os.fsdecode(path) + print(dump("C", blob, delta, path=location, table=" D-C")) + + print("") diff --git a/find-blob.py b/find-blob.py index 111fc87..2652c9e 100755 --- a/find-blob.py +++ b/find-blob.py @@ -1,65 +1,66 @@ #!/usr/bin/env python import logging import os import sys from swh.model.cli import identify_object from swh.model.hashutil import hash_to_bytes, hash_to_hex from swh.provenance import get_provenance # TODO: take conninfo as command line arguments. conninfo = { "cls": "local", "db": {"host": "/var/run/postgresql", "port": "5436", "dbname": "provenance"}, } if __name__ == "__main__": # Set minimum logging level to INFO. logging.getLogger().setLevel(logging.INFO) if len(sys.argv) < 2: print("usage: find-blob [limit]") exit(-1) - # Get provenance object. - provenance = get_provenance(**conninfo) - obj, swhid = identify_object("content", True, True, sys.argv[1]) sha1 = hash_to_bytes(swhid.split(":")[-1]) print(f"Identifier of object {obj}: {swhid}") limit = sys.argv[2] if len(sys.argv) > 2 else None - first = provenance.content_find_first(sha1) + # Get provenance object. + with get_provenance(**conninfo) as provenance: + first = provenance.content_find_first(sha1) - if first is not None: - print( - "===========================================================================" - ) - print(f"First occurrence of {obj}:") - print( - f" content: swh:1:cnt:{hash_to_hex(first[0])}," - f" revision: swh:1:rev:{hash_to_hex(first[1])}," - f" date: {first[2]}," - f" location: {os.fsdecode(first[3])}" - ) + if first is not None: + print( + "======================================================================" + ) + print(f"First occurrence of {obj}:") + print( + f" content: swh:1:cnt:{hash_to_hex(first[0])}," + f" revision: swh:1:rev:{hash_to_hex(first[1])}," + f" date: {first[2]}," + f" location: {os.fsdecode(first[3])}" + ) - print( - "===========================================================================" - ) - if limit is None: - print(f"All occurrences of {obj}:") - else: - print(f"First {limit} occurrences of {obj}:") - for occur in provenance.content_find_all(sha1, limit=limit): print( - f" content: swh:1:cnt:{hash_to_hex(occur[0])}," - f" revision: swh:1:rev:{hash_to_hex(occur[1])}," - f" date: {occur[2]}," - f" location: {os.fsdecode(occur[3])}" + "======================================================================" ) + if limit is None: + print(f"All occurrences of {obj}:") + else: + print(f"First {limit} occurrences of {obj}:") + for occur in provenance.content_find_all(sha1, limit=limit): + print( + f" content: swh:1:cnt:{hash_to_hex(occur[0])}," + f" revision: swh:1:rev:{hash_to_hex(occur[1])}," + f" date: {occur[2]}," + f" location: {os.fsdecode(occur[3])}" + ) - else: - logging.warning("Requested content not available in the provenance database.") + else: + logging.warning( + "Requested content not available in the provenance database." + ) diff --git a/histogram.py b/histogram.py index 44ab2e4..9e2e629 100755 --- a/histogram.py +++ b/histogram.py @@ -1,47 +1,52 @@ #!/usr/bin/env python import io from swh.provenance import get_provenance from swh.provenance.postgresql.provenance import ProvenanceStoragePostgreSql # TODO: take conninfo as command line arguments. conninfo = { "cls": "local", "db": {"host": "/var/run/postgresql", "port": "5436", "dbname": "provenance"}, } if __name__ == "__main__": # Get provenance object. - provenance = get_provenance(**conninfo) - # TODO: use ProvenanceStorageInterface instead! - assert isinstance(provenance.storage, ProvenanceStoragePostgreSql) - - tables = ["directory_in_rev", "content_in_dir"] - - for table in tables: - with provenance.storage.transaction() as cursor: - cursor.execute( - f"""SELECT depths.depth, COUNT(depths.depth) - FROM (SELECT - CASE location.path - WHEN '' THEN 0 - WHEN '.' THEN 0 - ELSE 1 + CHAR_LENGTH(ENCODE(location.path, 'escape')) - - CHAR_LENGTH(REPLACE(ENCODE(location.path, 'escape'), '/', '')) - END AS depth - FROM {table} - JOIN location - ON {table}.loc=location.id - ) AS depths - GROUP BY depths.depth - ORDER BY depths.depth""" - ) - - filename = "depths_" + conninfo["db"]["dbname"] + f"_{table}.csv" - - with io.open(filename, "w") as outfile: - outfile.write(f"{table} depth,{table} count\n") - for depth, count in cursor.fetchall(): - outfile.write(f"{depth},{count}\n") + with get_provenance(**conninfo) as provenance: + # TODO: use ProvenanceStorageInterface instead! + assert isinstance(provenance.storage, ProvenanceStoragePostgreSql) + + tables = ["directory_in_rev", "content_in_dir"] + + for table in tables: + with provenance.storage.transaction() as cursor: + cursor.execute( + f"""SELECT depths.depth, COUNT(depths.depth) + FROM (SELECT + CASE location.path + WHEN '' THEN 0 + WHEN '.' THEN 0 + ELSE 1 + + CHAR_LENGTH(ENCODE(location.path, 'escape')) + - CHAR_LENGTH( + REPLACE( + ENCODE(location.path, 'escape'), '/', '' + ) + ) + END AS depth + FROM {table} + JOIN location + ON {table}.loc=location.id + ) AS depths + GROUP BY depths.depth + ORDER BY depths.depth""" + ) + + filename = "depths_" + conninfo["db"]["dbname"] + f"_{table}.csv" + + with io.open(filename, "w") as outfile: + outfile.write(f"{table} depth,{table} count\n") + for depth, count in cursor.fetchall(): + outfile.write(f"{depth},{count}\n") diff --git a/metrics.py b/metrics.py index 82e9c97..25d6123 100755 --- a/metrics.py +++ b/metrics.py @@ -1,158 +1,158 @@ #!/usr/bin/env python from swh.provenance import get_provenance from swh.provenance.postgresql.provenance import ProvenanceStoragePostgreSql from swh.provenance.provenance import ProvenanceInterface # TODO: take conninfo as command line arguments. conninfo = { "cls": "local", "db": {"host": "/var/run/postgresql", "port": "5436", "dbname": "provenance"}, } def get_tables_stats(provenance: ProvenanceInterface): # TODO: use ProvenanceStorageInterface instead! assert isinstance(provenance.storage, ProvenanceStoragePostgreSql) tables = { "content": dict(), "content_early_in_rev": dict(), "content_in_dir": dict(), "directory": dict(), "directory_in_rev": dict(), "location": dict(), "revision": dict(), } for table in tables: with provenance.storage.transaction() as cursor: cursor.execute(f"SELECT COUNT(*) FROM {table}") tables[table]["row_count"] = cursor.fetchone()[0] cursor.execute(f"SELECT pg_table_size('{table}')") tables[table]["table_size"] = cursor.fetchone()[0] cursor.execute(f"SELECT pg_indexes_size('{table}')") tables[table]["indexes_size"] = cursor.fetchone()[0] # cursor.execute(f"SELECT pg_total_relation_size('{table}')") # relation_size[table] = cursor.fetchone()[0] tables[table]["relation_size"] = ( tables[table]["table_size"] + tables[table]["indexes_size"] ) return tables if __name__ == "__main__": # Get provenance object. - provenance = get_provenance(**conninfo) - # TODO: use ProvenanceStorageInterface instead! - assert isinstance(provenance.storage, ProvenanceStoragePostgreSql) - - tables = get_tables_stats(provenance) + with get_provenance(**conninfo) as provenance: + # TODO: use ProvenanceStorageInterface instead! + assert isinstance(provenance.storage, ProvenanceStoragePostgreSql) + + tables = get_tables_stats(provenance) + + for table in tables: + row_count = tables[table]["row_count"] + table_size = tables[table]["table_size"] + indexes_size = tables[table]["indexes_size"] + relation_size = tables[table]["relation_size"] + + print(f"{table}:") + print(f" total rows: {row_count}") + if row_count == 0: + row_count = 1 + print( + f" table size: {table_size} bytes ({table_size / row_count:.2f} per row)" + ) + print( + f" index size: {indexes_size} bytes ({indexes_size / row_count:.2f} per row)" + ) + print( + f" total size: {relation_size} bytes ({relation_size / row_count:.2f} per row)" + ) - for table in tables: - row_count = tables[table]["row_count"] - table_size = tables[table]["table_size"] - indexes_size = tables[table]["indexes_size"] - relation_size = tables[table]["relation_size"] - - print(f"{table}:") - print(f" total rows: {row_count}") - if row_count == 0: - row_count = 1 + # Ratios between de different entities/relations. + print("ratios:") print( - f" table size: {table_size} bytes ({table_size / row_count:.2f} per row)" + f" content/revision: {tables['content']['row_count'] / (tables['revision']['row_count'] if tables['revision']['row_count'] != 0 else 1):.2f}" ) print( - f" index size: {indexes_size} bytes ({indexes_size / row_count:.2f} per row)" + f" content_early_in_rev/content: {tables['content_early_in_rev']['row_count'] / (tables['content']['row_count'] if tables['content']['row_count'] != 0 else 1):.2f}" ) print( - f" total size: {relation_size} bytes ({relation_size / row_count:.2f} per row)" + f" content_in_dir/content: {tables['content_in_dir']['row_count'] / (tables['content']['row_count'] if tables['content']['row_count'] != 0 else 1):.2f}" ) - - # Ratios between de different entities/relations. - print("ratios:") - print( - f" content/revision: {tables['content']['row_count'] / (tables['revision']['row_count'] if tables['revision']['row_count'] != 0 else 1):.2f}" - ) - print( - f" content_early_in_rev/content: {tables['content_early_in_rev']['row_count'] / (tables['content']['row_count'] if tables['content']['row_count'] != 0 else 1):.2f}" - ) - print( - f" content_in_dir/content: {tables['content_in_dir']['row_count'] / (tables['content']['row_count'] if tables['content']['row_count'] != 0 else 1):.2f}" - ) - print( - f" directory/revision: {tables['directory']['row_count'] / (tables['revision']['row_count'] if tables['revision']['row_count'] != 0 else 1):.2f}" - ) - print( - f" directory_in_rev/directory: {tables['directory_in_rev']['row_count'] / (tables['directory']['row_count'] if tables['directory']['row_count'] != 0 else 1):.2f}" - ) - print(f" ==============================") - print( - f" content_early_in_rev/revision: {tables['content_early_in_rev']['row_count'] / (tables['revision']['row_count'] if tables['revision']['row_count'] != 0 else 1):.2f}" - ) - print( - f" content_in_dir/directory: {tables['content_in_dir']['row_count'] / (tables['directory']['row_count'] if tables['directory']['row_count'] != 0 else 1):.2f}" - ) - print( - f" directory_in_rev/revision: {tables['directory_in_rev']['row_count'] / (tables['revision']['row_count'] if tables['revision']['row_count'] != 0 else 1):.2f}" - ) - - # Metrics for frontiers defined in root directories. - with provenance.storage.transaction() as cursor: - cursor.execute( - f"""SELECT dir - FROM directory_in_rev - INNER JOIN location - ON loc=location.id - WHERE location.path=%s""", - (b"",), + print( + f" directory/revision: {tables['directory']['row_count'] / (tables['revision']['row_count'] if tables['revision']['row_count'] != 0 else 1):.2f}" + ) + print( + f" directory_in_rev/directory: {tables['directory_in_rev']['row_count'] / (tables['directory']['row_count'] if tables['directory']['row_count'] != 0 else 1):.2f}" ) - directories = list(cursor.fetchall()) - print(f"Total root frontiers used: {len(directories)}") - - cursor.execute( - f"""SELECT dir - FROM directory_in_rev - INNER JOIN location - ON loc=location.id - WHERE location.path=%s - GROUP BY dir""", - (b"",), + print(f" ==============================") + print( + f" content_early_in_rev/revision: {tables['content_early_in_rev']['row_count'] / (tables['revision']['row_count'] if tables['revision']['row_count'] != 0 else 1):.2f}" ) - directories = list(cursor.fetchall()) - print(f"Total distinct root frontiers: {len(directories)}") - - cursor.execute( - f"""SELECT roots.dir - FROM (SELECT dir, loc - FROM directory_in_rev - INNER JOIN location - ON loc=location.id - WHERE location.path=%s) AS roots - JOIN directory_in_rev - ON directory_in_rev.dir=roots.dir - WHERE directory_in_rev.loc!=roots.loc""", - (b"",), + print( + f" content_in_dir/directory: {tables['content_in_dir']['row_count'] / (tables['directory']['row_count'] if tables['directory']['row_count'] != 0 else 1):.2f}" ) - directories = list(cursor.fetchall()) - print(f"Total other uses of these frontiers: {len(directories)}") - - cursor.execute( - f"""SELECT roots.dir - FROM (SELECT dir, loc - FROM directory_in_rev - INNER JOIN location - ON loc=location.id - WHERE location.path=%s) AS roots - JOIN directory_in_rev - ON directory_in_rev.dir=roots.dir - WHERE directory_in_rev.loc!=roots.loc - GROUP BY roots.dir""", - (b"",), + print( + f" directory_in_rev/revision: {tables['directory_in_rev']['row_count'] / (tables['revision']['row_count'] if tables['revision']['row_count'] != 0 else 1):.2f}" ) - directories = list(cursor.fetchall()) - print(f"Total distinct other uses of frontiers: {len(directories)}") + + # Metrics for frontiers defined in root directories. + with provenance.storage.transaction() as cursor: + cursor.execute( + f"""SELECT dir + FROM directory_in_rev + INNER JOIN location + ON loc=location.id + WHERE location.path=%s""", + (b"",), + ) + directories = list(cursor.fetchall()) + print(f"Total root frontiers used: {len(directories)}") + + cursor.execute( + f"""SELECT dir + FROM directory_in_rev + INNER JOIN location + ON loc=location.id + WHERE location.path=%s + GROUP BY dir""", + (b"",), + ) + directories = list(cursor.fetchall()) + print(f"Total distinct root frontiers: {len(directories)}") + + cursor.execute( + f"""SELECT roots.dir + FROM (SELECT dir, loc + FROM directory_in_rev + INNER JOIN location + ON loc=location.id + WHERE location.path=%s) AS roots + JOIN directory_in_rev + ON directory_in_rev.dir=roots.dir + WHERE directory_in_rev.loc!=roots.loc""", + (b"",), + ) + directories = list(cursor.fetchall()) + print(f"Total other uses of these frontiers: {len(directories)}") + + cursor.execute( + f"""SELECT roots.dir + FROM (SELECT dir, loc + FROM directory_in_rev + INNER JOIN location + ON loc=location.id + WHERE location.path=%s) AS roots + JOIN directory_in_rev + ON directory_in_rev.dir=roots.dir + WHERE directory_in_rev.loc!=roots.loc + GROUP BY roots.dir""", + (b"",), + ) + directories = list(cursor.fetchall()) + print(f"Total distinct other uses of frontiers: {len(directories)}")