diff --git a/compare-all.py b/compare-all.py
index 3616a78..fc10abf 100755
--- a/compare-all.py
+++ b/compare-all.py
@@ -1,180 +1,186 @@
 #!/usr/bin/env python

 import glob
 import io
 import logging
 import os

-import psycopg2
 from swh.model.hashutil import hash_to_hex
 from swh.provenance import get_provenance
-
+from swh.provenance.postgresql.provenance import ProvenanceStoragePostgreSql

 # TODO: take conninfo as command line arguments.
 conninfo1 = {
     "cls": "local",
     "db": {"host": "/var/run/postgresql", "port": "5436", "dbname": "old"},
 }
 conninfo2 = {
     "cls": "local",
     "db": {"host": "/var/run/postgresql", "port": "5436", "dbname": "provenance"},
 }


 # Write log file with occurrence detail.
 def logdiff(filename, occurrences):
     with io.open(filename, "a") as outfile:
         for row in occurrences:
             try:
                 # Try to decode path.
                 path = os.fsdecode(row[3]).decode("utf-8", "replace")
             except:
                 # Use its raw value if not possible
                 path = row[3]
             outfile.write(
                 "{blob},{rev},{date},{path}\n".format(
                     blob=hash_to_hex(row[0]),
                     rev=hash_to_hex(row[1]),
                     date=row[2],
                     path=path,
                 )
             )


 # Write log file with list of occurrences.
 def loglist(filename, occurrences):
     with io.open(filename, "a") as outfile:
         for blobid in occurrences:
             outfile.write("{blob}\n".format(blob=hash_to_hex(blobid)))


 # Output log file name.
 nextidx = None


 def outfilename(suffix):
     global nextidx
     basename, _ = os.path.splitext(os.path.basename(os.path.abspath(__file__)))
     prefix = os.path.join(os.getcwd(), basename + "-")
     if nextidx is None:
         nextidx = 0
         for filename in glob.glob(f"{prefix}*.log"):
             try:
                 lastidx = int(filename.strip(prefix).split("-")[0])
                 nextidx = max(nextidx, lastidx + 1)
             except:
                 continue
     return f"{prefix}{nextidx:02}-{suffix}.log"


 # Print iterations progress.
 # TODO: move to utils module.
 def progress(
     iteration,
     total,
     prefix="Progress:",
     suffix="Complete",
     decimals=1,
     length=50,
     fill="█",
     printEnd="\r",
 ):
     """
     Call in a loop to create terminal progress bar
     @params:
         iteration - Required : current iteration (Int)
         total     - Required : total iterations (Int)
         prefix    - Optional : prefix string (Str)
         suffix    - Optional : suffix string (Str)
         decimals  - Optional : positive number of decimals in percent complete (Int)
         length    - Optional : character length of bar (Int)
         fill      - Optional : bar fill character (Str)
         printEnd  - Optional : end character (e.g. "\r", "\r\n") (Str)
     """
     percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
     filledLength = int(length * iteration // total)
     bar = fill * filledLength + "-" * (length - filledLength)
     print(f"\r{prefix} |{bar}| {percent}% {suffix}", end=printEnd)
     # Print New Line on Complete
     if iteration == total:
         print()


 if __name__ == "__main__":
     # Set minimum logging level to INFO.
     logging.getLogger().setLevel(logging.INFO)

     # Get provenance object for both databases and query its lists of content.
     provenance1 = get_provenance(**conninfo1)
     provenance2 = get_provenance(**conninfo2)

-    provenance1.cursor.execute("""SELECT id FROM content ORDER BY id""")
-    content1 = set(map(lambda row: row[0], provenance1.cursor.fetchall()))
+    # TODO: use ProvenanceStorageInterface instead!
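+    # The asserts below narrow the storage objects to the PostgreSQL backend,
+    # whose transaction() context manager is used for the raw queries below.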
+    assert isinstance(provenance1.storage, ProvenanceStoragePostgreSql)
+    assert isinstance(provenance2.storage, ProvenanceStoragePostgreSql)
+
+    with provenance1.storage.transaction() as cursor:
+        cursor.execute("""SELECT id FROM content ORDER BY id""")
+        content1 = set(map(lambda row: row[0], cursor.fetchall()))

-    provenance2.cursor.execute("""SELECT sha1 FROM content ORDER BY sha1""")
-    content2 = set(map(lambda row: row[0], provenance2.cursor.fetchall()))
+    with provenance2.storage.transaction() as cursor:
+        cursor.execute("""SELECT sha1 FROM content ORDER BY sha1""")
+        content2 = set(map(lambda row: row[0], cursor.fetchall()))

     if content1 == content2:
         # If lists of content match, we check that occurrences does as well.
         total = len(content1)
         progress(0, total)
         mismatch = False

         # Iterate over all content querying all its occurrences on both databases.
         for i, blobid in enumerate(content1):
-            provenance1.cursor.execute(
-                """(SELECT content_early_in_rev.blob,
-                           content_early_in_rev.rev,
-                           revision.date,
-                           content_early_in_rev.path
-                      FROM content_early_in_rev
-                      JOIN revision
-                        ON revision.id=content_early_in_rev.rev
-                     WHERE content_early_in_rev.blob=%s
-                   )
-                   UNION
-                   (SELECT content_in_rev.blob,
-                           content_in_rev.rev,
-                           revision.date,
-                           content_in_rev.path
-                      FROM (SELECT content_in_dir.blob,
-                                   directory_in_rev.rev,
-                                   CASE directory_in_rev.path
-                                     WHEN '' THEN content_in_dir.path
-                                     WHEN '.' THEN content_in_dir.path
-                                     ELSE (directory_in_rev.path || '/' ||
-                                               content_in_dir.path)::unix_path
-                                   END AS path
-                              FROM content_in_dir
-                              JOIN directory_in_rev
-                                ON content_in_dir.dir=directory_in_rev.dir
-                             WHERE content_in_dir.blob=%s
-                           ) AS content_in_rev
-                      JOIN revision
-                        ON revision.id=content_in_rev.rev
-                   )
-                   ORDER BY date, rev, path""",
-                (blobid, blobid),
-            )
-            occurrences1 = list(provenance1.cursor.fetchall())
+            with provenance1.storage.transaction() as cursor:
+                cursor.execute(
+                    """(SELECT content_early_in_rev.blob,
+                               content_early_in_rev.rev,
+                               revision.date,
+                               content_early_in_rev.path
+                          FROM content_early_in_rev
+                          JOIN revision
+                            ON revision.id=content_early_in_rev.rev
+                         WHERE content_early_in_rev.blob=%s
+                       )
+                       UNION
+                       (SELECT content_in_rev.blob,
+                               content_in_rev.rev,
+                               revision.date,
+                               content_in_rev.path
+                          FROM (SELECT content_in_dir.blob,
+                                       directory_in_rev.rev,
+                                       CASE directory_in_rev.path
+                                         WHEN '' THEN content_in_dir.path
+                                         WHEN '.' THEN content_in_dir.path
+                                         ELSE (directory_in_rev.path || '/' ||
+                                                   content_in_dir.path)::unix_path
+                                       END AS path
+                                  FROM content_in_dir
+                                  JOIN directory_in_rev
+                                    ON content_in_dir.dir=directory_in_rev.dir
+                                 WHERE content_in_dir.blob=%s
+                               ) AS content_in_rev
+                          JOIN revision
+                            ON revision.id=content_in_rev.rev
+                       )
+                       ORDER BY date, rev, path""",
+                    (blobid, blobid),
+                )
+                occurrences1 = list(cursor.fetchall())
             occurrences2 = list(provenance2.content_find_all(blobid))

             # If there is a mismatch log it to file.
             if len(occurrences1) != len(occurrences2) or set(occurrences1) != set(
                 occurrences2
             ):
                 mismatch = True
                 logging.warning(f"Occurrencies mismatch for {hash_to_hex(blobid)}")
                 logdiff(outfilename(conninfo1["db"]["dbname"]), occurrences1)
                 logdiff(outfilename(conninfo2["db"]["dbname"]), occurrences2)

             progress(i + 1, total)

         if not mismatch:
             logging.info("Databases are equivalent!")

     else:
         # If lists of content don't match, we are done.
         loglist(outfilename(conninfo1["db"]["dbname"]), content1)
         loglist(outfilename(conninfo2["db"]["dbname"]), content2)
         logging.warning("Content lists are different")
diff --git a/compare-first.py b/compare-first.py
index 18d9cbb..dfc8354 100755
--- a/compare-first.py
+++ b/compare-first.py
@@ -1,156 +1,162 @@
 #!/usr/bin/env python

 import glob
 import io
 import logging
 import os

-import psycopg2
 from swh.model.hashutil import hash_to_hex
 from swh.provenance import get_provenance
-
+from swh.provenance.postgresql.provenance import ProvenanceStoragePostgreSql

 # TODO: take conninfo as command line arguments.
 conninfo1 = {
     "cls": "local",
     "db": {"host": "/var/run/postgresql", "port": "5436", "dbname": "old"},
 }
 conninfo2 = {
     "cls": "local",
     "db": {"host": "/var/run/postgresql", "port": "5436", "dbname": "provenance"},
 }


 # Write log file with occurrence detail.
 def logdiff(filename, occurrence):
     with io.open(filename, "a") as outfile:
         try:
             # Try to decode path.
             path = os.fsdecode(occurrence[3]).decode("utf-8", "replace")
         except:
             # Use its raw value if not possible
             path = occurrence[3]
         outfile.write(
             "{blob},{rev},{date},{path}\n".format(
                 blob=hash_to_hex(occurrence[0]),
                 rev=hash_to_hex(occurrence[1]),
                 date=occurrence[2],
                 path=path,
             )
         )


 # Write log file with list of occurrences.
 def loglist(filename, occurrences):
     with io.open(filename, "a") as outfile:
         for blobid in occurrences:
             outfile.write("{blob}\n".format(blob=hash_to_hex(blobid)))


 # Output log file name.
 nextidx = None


 def outfilename(suffix):
     global nextidx
     basename, _ = os.path.splitext(os.path.basename(os.path.abspath(__file__)))
     prefix = os.path.join(os.getcwd(), basename + "-")
     if nextidx is None:
         nextidx = 0
         for filename in glob.glob(f"{prefix}*.log"):
             try:
                 lastidx = int(filename.strip(prefix).split("-")[0])
                 nextidx = max(nextidx, lastidx + 1)
             except:
                 continue
     return f"{prefix}{nextidx:02}-{suffix}.log"


 # Print iterations progress.
 # TODO: move to utils module.
 def progress(
     iteration,
     total,
     prefix="Progress:",
     suffix="Complete",
     decimals=1,
     length=50,
     fill="█",
     printEnd="\r",
 ):
     """
     Call in a loop to create terminal progress bar
     @params:
         iteration - Required : current iteration (Int)
         total     - Required : total iterations (Int)
         prefix    - Optional : prefix string (Str)
         suffix    - Optional : suffix string (Str)
         decimals  - Optional : positive number of decimals in percent complete (Int)
         length    - Optional : character length of bar (Int)
         fill      - Optional : bar fill character (Str)
         printEnd  - Optional : end character (e.g. "\r", "\r\n") (Str)
     """
     percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
     filledLength = int(length * iteration // total)
     bar = fill * filledLength + "-" * (length - filledLength)
     print(f"\r{prefix} |{bar}| {percent}% {suffix}", end=printEnd)
     # Print New Line on Complete
     if iteration == total:
         print()


 if __name__ == "__main__":
     # Set minimum logging level to INFO.
     logging.getLogger().setLevel(logging.INFO)

     # Get provenance object for both databases and query its lists of content.
     provenance1 = get_provenance(**conninfo1)
     provenance2 = get_provenance(**conninfo2)

-    provenance1.cursor.execute("""SELECT id FROM content ORDER BY id""")
-    content1 = set(map(lambda row: row[0], provenance1.cursor.fetchall()))
+    # TODO: use ProvenanceStorageInterface instead!
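+    # As in compare-all.py, the asserts narrow the storage objects to the
+    # PostgreSQL backend, which provides the transaction() context manager.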
+    assert isinstance(provenance1.storage, ProvenanceStoragePostgreSql)
+    assert isinstance(provenance2.storage, ProvenanceStoragePostgreSql)
+
+    with provenance1.storage.transaction() as cursor:
+        cursor.execute("""SELECT id FROM content ORDER BY id""")
+        content1 = set(map(lambda row: row[0], cursor.fetchall()))

-    provenance2.cursor.execute("""SELECT sha1 FROM content ORDER BY sha1""")
-    content2 = set(map(lambda row: row[0], provenance2.cursor.fetchall()))
+    with provenance2.storage.transaction() as cursor:
+        cursor.execute("""SELECT sha1 FROM content ORDER BY sha1""")
+        content2 = set(map(lambda row: row[0], cursor.fetchall()))

     if content1 == content2:
         # If lists of content match, we check that occurrences does as well.
         total = len(content1)
         progress(0, total)
         mismatch = False

         # Iterate over all content querying all its occurrences on both databases.
         for i, blobid in enumerate(content1):
-            provenance1.cursor.execute(
-                """SELECT content_early_in_rev.blob,
-                          content_early_in_rev.rev,
-                          revision.date,
-                          content_early_in_rev.path
-                     FROM content_early_in_rev
-                     JOIN revision
-                       ON revision.id=content_early_in_rev.rev
-                    WHERE content_early_in_rev.blob=%s
-                    ORDER BY date, rev, path ASC LIMIT 1""",
-                (blobid,),
-            )
-            occurrence1 = provenance1.cursor.fetchone()
+            with provenance1.storage.transaction() as cursor:
+                cursor.execute(
+                    """SELECT content_early_in_rev.blob,
+                              content_early_in_rev.rev,
+                              revision.date,
+                              content_early_in_rev.path
+                         FROM content_early_in_rev
+                         JOIN revision
+                           ON revision.id=content_early_in_rev.rev
+                        WHERE content_early_in_rev.blob=%s
+                        ORDER BY date, rev, path ASC LIMIT 1""",
+                    (blobid,),
+                )
+                occurrence1 = cursor.fetchone()
             occurrence2 = provenance2.content_find_first(blobid)

             # If there is a mismatch log it to file. We can only compare the timestamp
             # as the same blob might be seen for the first time in different locations.
             if occurrence1[2] != occurrence2[2]:
                 mismatch = True
                 logging.warning(f"Occurrencies mismatch for {hash_to_hex(blobid)}")
                 logdiff(outfilename(conninfo1["db"]["dbname"]), occurrence1)
                 logdiff(outfilename(conninfo2["db"]["dbname"]), occurrence2)

             progress(i + 1, total)

         if not mismatch:
             logging.info("Databases are equivalent!")

     else:
         # If lists of content don't match, we are done.
         loglist(outfilename(conninfo1["db"]["dbname"]), content1)
         loglist(outfilename(conninfo2["db"]["dbname"]), content2)
         logging.warning("Content lists are different")
diff --git a/count.py b/count.py
index 82b920f..b0b7312 100755
--- a/count.py
+++ b/count.py
@@ -1,18 +1,19 @@
 #!/usr/bin/env python

+import io
 import sys


-def linecount(filename):
+def linecount(filename: str) -> int:
     count = 0
-    for line in open(filename).xreadlines():
+    for _ in io.open(filename):
         count += 1
     return count


 if __name__ == "__main__":
     if len(sys.argv) != 2:
         print("usage: count <filename>")
         exit(-1)

     print(linecount(sys.argv[1]))
diff --git a/dump.py b/dump.py
index 82260b9..d00a2ff 100755
--- a/dump.py
+++ b/dump.py
@@ -1,130 +1,137 @@
 #!/usr/bin/env python

-import glob
-import io
 import logging
 import os

-import psycopg2
 from swh.model.hashutil import hash_to_hex
 from swh.provenance import get_provenance
-
+from swh.provenance.postgresql.provenance import ProvenanceStoragePostgreSql

 # TODO: take conninfo as command line arguments.
 conninfo = {
     "cls": "local",
     "db": {"host": "/var/run/postgresql", "port": "5436", "dbname": "provenance"},
 }


 def dump(type, hash, time, path="", header="", table=""):
-    return f"{str(header).ljust(5)} | {str(table).ljust(5)} | {str(path).ljust(30)} | {type} {hash_to_hex(hash)} | {str(time).rjust(10)}"
+    return (
+        f"{str(header).ljust(5)} | "
+        f"{str(table).ljust(5)} | "
+        f"{str(path).ljust(30)} | "
+        f"{type} {hash_to_hex(hash)} | "
+        f"{str(time).rjust(10)}"
+    )


 if __name__ == "__main__":
     # Set minimum logging level to INFO.
     logging.getLogger().setLevel(logging.INFO)

     # Get provenance object.
     provenance = get_provenance(**conninfo)
+    # TODO: use ProvenanceStorageInterface instead!
+    assert isinstance(provenance.storage, ProvenanceStoragePostgreSql)

-    provenance.cursor.execute("""SELECT sha1, date FROM revision ORDER BY date""")
-    revisions = list(provenance.cursor.fetchall())
+    with provenance.storage.transaction() as cursor:
+        cursor.execute("""SELECT sha1, date FROM revision ORDER BY date""")
+        revisions = list(cursor.fetchall())

     for idx, (revision, date) in enumerate(revisions):
         # Display current revision information.
         header = f"R{idx:04}"
         timestamp = date.timestamp()
         print(f"{timestamp} {hash_to_hex(revision)} {header}")
         print(dump("R", revision, timestamp, header=header))

         # Display content found early in current revision.
-        provenance.cursor.execute(
-            """SELECT content.sha1 AS blob,
-                      content.date AS date,
-                      content_location.path AS path
-                 FROM (SELECT content_in_rev.blob,
-                              location.path
-                         FROM (SELECT content_early_in_rev.blob,
-                                      content_early_in_rev.loc
-                                 FROM content_early_in_rev
-                                 JOIN revision
-                                   ON revision.id=content_early_in_rev.rev
-                                WHERE revision.sha1=%s
-                              ) AS content_in_rev
-                         JOIN location
-                           ON location.id=content_in_rev.loc
-                      ) AS content_location
-                 JOIN content
-                   ON content.id=content_location.blob
-                 ORDER BY path""",
-            (revision,),
-        )
-        content = list(provenance.cursor.fetchall())
-
-        for blob, date, path in content:
-            delta = date.timestamp() - timestamp
-            location = os.fsdecode(path)
-            print(dump("C", blob, delta, path=location, table="R---C"))
-
-        # Display isochrone frontiers found in current revision.
-        provenance.cursor.execute(
-            """SELECT directory.sha1 AS dir,
-                      directory.date AS date,
-                      directory_location.path AS path
-                 FROM (SELECT isochrone_frontier.dir,
-                              location.path
-                         FROM (SELECT directory_in_rev.dir,
-                                      directory_in_rev.loc
-                                 FROM directory_in_rev
-                                 JOIN revision
-                                   ON revision.id=directory_in_rev.rev
-                                WHERE revision.sha1=%s
-                              ) AS isochrone_frontier
-                         JOIN location
-                           ON location.id=isochrone_frontier.loc
-                      ) AS directory_location
-                 JOIN directory
-                   ON directory.id=directory_location.dir
-                 ORDER BY path""",
-            (revision,),
-        )
-        directories = list(provenance.cursor.fetchall())
-
-        for directory, date, path in directories:
-            delta = date.timestamp() - timestamp
-            location = os.fsdecode(path) + "/"
-            if location == "/":
-                location = "./"
-            print(dump("D", directory, delta, path=location, table="R-D "))
-
-        # Display content found outside the current isochrone frontier.
-        provenance.cursor.execute(
+        with provenance.storage.transaction() as cursor:
+            cursor.execute(
                 """SELECT content.sha1 AS blob,
                           content.date AS date,
                           content_location.path AS path
-                     FROM (SELECT content_outside.blob,
+                     FROM (SELECT content_in_rev.blob,
                                   location.path
-                             FROM (SELECT content_in_dir.blob,
-                                          content_in_dir.loc
-                                     FROM content_in_dir
-                                     JOIN directory
-                                       ON directory.id=content_in_dir.dir
-                                    WHERE directory.sha1=%s
-                                  ) AS content_outside
+                             FROM (SELECT content_early_in_rev.blob,
+                                          content_early_in_rev.loc
+                                     FROM content_early_in_rev
+                                     JOIN revision
+                                       ON revision.id=content_early_in_rev.rev
+                                    WHERE revision.sha1=%s
+                                  ) AS content_in_rev
                              JOIN location
-                               ON location.id=content_outside.loc
+                               ON location.id=content_in_rev.loc
                           ) AS content_location
-                 JOIN content
-                   ON content.id=content_location.blob
-                 ORDER BY path""",
-            (directory,),
+                     JOIN content
+                       ON content.id=content_location.blob
+                     ORDER BY path""",
+                (revision,),
             )
-        content = list(provenance.cursor.fetchall())
+            content = list(cursor.fetchall())

             for blob, date, path in content:
                 delta = date.timestamp() - timestamp
-            location = " + " + os.fsdecode(path)
-            print(dump("C", blob, delta, path=location, table=" D-C"))
+                location = os.fsdecode(path)
+                print(dump("C", blob, delta, path=location, table="R---C"))
+
+            # Display isochrone frontiers found in current revision.
+            cursor.execute(
+                """SELECT directory.sha1 AS dir,
+                          directory.date AS date,
+                          directory_location.path AS path
+                     FROM (SELECT isochrone_frontier.dir,
+                                  location.path
+                             FROM (SELECT directory_in_rev.dir,
+                                          directory_in_rev.loc
+                                     FROM directory_in_rev
+                                     JOIN revision
+                                       ON revision.id=directory_in_rev.rev
+                                    WHERE revision.sha1=%s
+                                  ) AS isochrone_frontier
+                             JOIN location
+                               ON location.id=isochrone_frontier.loc
+                          ) AS directory_location
+                     JOIN directory
+                       ON directory.id=directory_location.dir
+                     ORDER BY path""",
+                (revision,),
+            )
+            directories = list(cursor.fetchall())

-        print("")
+            for directory, date, path in directories:
+                delta = date.timestamp() - timestamp
+                location = os.fsdecode(path) + "/"
+                if location == "/":
+                    location = "./"
+                print(dump("D", directory, delta, path=location, table="R-D "))
+
+            # Display content found outside the current isochrone frontier.
+            cursor.execute(
+                """SELECT content.sha1 AS blob,
+                          content.date AS date,
+                          content_location.path AS path
+                     FROM (SELECT content_outside.blob,
+                                  location.path
+                             FROM (SELECT content_in_dir.blob,
+                                          content_in_dir.loc
+                                     FROM content_in_dir
+                                     JOIN directory
+                                       ON directory.id=content_in_dir.dir
+                                    WHERE directory.sha1=%s
+                                  ) AS content_outside
+                             JOIN location
+                               ON location.id=content_outside.loc
+                          ) AS content_location
+                     JOIN content
+                       ON content.id=content_location.blob
+                     ORDER BY path""",
+                (directory,),
+            )
+            content = list(cursor.fetchall())
+
+            for blob, date, path in content:
+                delta = date.timestamp() - timestamp
+                location = " + " + os.fsdecode(path)
+                print(dump("C", blob, delta, path=location, table=" D-C"))
+
+            print("")
diff --git a/find-blob.py b/find-blob.py
index d0c9cd4..111fc87 100755
--- a/find-blob.py
+++ b/find-blob.py
@@ -1,70 +1,65 @@
 #!/usr/bin/env python

 import logging
 import os
 import sys

 from swh.model.cli import identify_object
 from swh.model.hashutil import hash_to_bytes, hash_to_hex
 from swh.provenance import get_provenance

-
 # TODO: take conninfo as command line arguments.
 conninfo = {
     "cls": "local",
     "db": {"host": "/var/run/postgresql", "port": "5436", "dbname": "provenance"},
 }


 if __name__ == "__main__":
     # Set minimum logging level to INFO.
     logging.getLogger().setLevel(logging.INFO)

     if len(sys.argv) < 2:
         print("usage: find-blob <filename> [limit]")
         exit(-1)

     # Get provenance object.
     provenance = get_provenance(**conninfo)

     obj, swhid = identify_object("content", True, True, sys.argv[1])
     sha1 = hash_to_bytes(swhid.split(":")[-1])
     print(f"Identifier of object {obj}: {swhid}")

     limit = sys.argv[2] if len(sys.argv) > 2 else None

     first = provenance.content_find_first(sha1)
     if first is not None:
         print(
-            "==============================================================================="
+            "==========================================================================="
         )
         print(f"First occurrence of {obj}:")
         print(
-            " content: swh:1:cnt:{cnt}, revision: swh:1:rev:{rev}, date: {date}, location: {path}".format(
-                cnt=hash_to_hex(first[0]),
-                rev=hash_to_hex(first[1]),
-                date=first[2],
-                path=os.fsdecode(first[3]),
-            )
+            f" content: swh:1:cnt:{hash_to_hex(first[0])},"
+            f" revision: swh:1:rev:{hash_to_hex(first[1])},"
+            f" date: {first[2]},"
+            f" location: {os.fsdecode(first[3])}"
         )

         print(
-            "==============================================================================="
+            "==========================================================================="
         )
         if limit is None:
             print(f"All occurrences of {obj}:")
         else:
             print(f"First {limit} occurrences of {obj}:")

         for occur in provenance.content_find_all(sha1, limit=limit):
             print(
-                " content: swh:1:cnt:{cnt}, revision: swh:1:rev:{rev}, date: {date}, location: {path}".format(
-                    cnt=hash_to_hex(occur[0]),
-                    rev=hash_to_hex(occur[1]),
-                    date=occur[2],
-                    path=os.fsdecode(occur[3]),
-                )
+                f" content: swh:1:cnt:{hash_to_hex(occur[0])},"
+                f" revision: swh:1:rev:{hash_to_hex(occur[1])},"
+                f" date: {occur[2]},"
+                f" location: {os.fsdecode(occur[3])}"
             )
     else:
         logging.warning("Requested content not available in the provenance database.")
diff --git a/histogram.py b/histogram.py
index 6e6b0e1..44ab2e4 100755
--- a/histogram.py
+++ b/histogram.py
@@ -1,44 +1,47 @@
 #!/usr/bin/env python

 import io

 from swh.provenance import get_provenance
-
+from swh.provenance.postgresql.provenance import ProvenanceStoragePostgreSql

 # TODO: take conninfo as command line arguments.
 conninfo = {
     "cls": "local",
     "db": {"host": "/var/run/postgresql", "port": "5436", "dbname": "provenance"},
 }


 if __name__ == "__main__":
     # Get provenance object.
     provenance = get_provenance(**conninfo)
+    # TODO: use ProvenanceStorageInterface instead!
+    assert isinstance(provenance.storage, ProvenanceStoragePostgreSql)

     tables = ["directory_in_rev", "content_in_dir"]
     for table in tables:
-        provenance.cursor.execute(
-            f"""SELECT depths.depth, COUNT(depths.depth)
-                 FROM (SELECT
-                        CASE location.path
-                          WHEN '' THEN 0
-                          WHEN '.' THEN 0
-                          ELSE 1 + CHAR_LENGTH(ENCODE(location.path, 'escape')) -
-                                   CHAR_LENGTH(REPLACE(ENCODE(location.path, 'escape'), '/', ''))
-                        END AS depth
-                  FROM {table}
-                  JOIN location
-                    ON {table}.loc=location.id
-                  ) AS depths
-                GROUP BY depths.depth
-                ORDER BY depths.depth"""
-        )
-
-        filename = "depths_" + conninfo["db"]["dbname"] + f"_{table}.csv"
-
-        with io.open(filename, "w") as outfile:
-            outfile.write(f"{table} depth,{table} count\n")
-            for depth, count in provenance.cursor.fetchall():
-                outfile.write(f"{depth},{count}\n")
+        with provenance.storage.transaction() as cursor:
+            cursor.execute(
+                f"""SELECT depths.depth, COUNT(depths.depth)
+                     FROM (SELECT
+                            CASE location.path
+                              WHEN '' THEN 0
+                              WHEN '.' THEN 0
+                              ELSE 1 + CHAR_LENGTH(ENCODE(location.path, 'escape')) -
+                                       CHAR_LENGTH(REPLACE(ENCODE(location.path, 'escape'), '/', ''))
+                            END AS depth
+                      FROM {table}
+                      JOIN location
+                        ON {table}.loc=location.id
+                      ) AS depths
+                    GROUP BY depths.depth
+                    ORDER BY depths.depth"""
+            )
+
+            filename = "depths_" + conninfo["db"]["dbname"] + f"_{table}.csv"
+
+            with io.open(filename, "w") as outfile:
+                outfile.write(f"{table} depth,{table} count\n")
+                for depth, count in cursor.fetchall():
+                    outfile.write(f"{depth},{count}\n")
diff --git a/metrics.py b/metrics.py
index e331bdb..82e9c97 100755
--- a/metrics.py
+++ b/metrics.py
@@ -1,157 +1,158 @@
 #!/usr/bin/env python

-import io
-import json
-import os
-import sys
-
-from swh.model.hashutil import hash_to_hex
 from swh.provenance import get_provenance
+from swh.provenance.postgresql.provenance import ProvenanceStoragePostgreSql
 from swh.provenance.provenance import ProvenanceInterface
-

 # TODO: take conninfo as command line arguments.
 conninfo = {
     "cls": "local",
     "db": {"host": "/var/run/postgresql", "port": "5436", "dbname": "provenance"},
 }


 def get_tables_stats(provenance: ProvenanceInterface):
+    # TODO: use ProvenanceStorageInterface instead!
+    assert isinstance(provenance.storage, ProvenanceStoragePostgreSql)
+
     tables = {
         "content": dict(),
         "content_early_in_rev": dict(),
         "content_in_dir": dict(),
         "directory": dict(),
         "directory_in_rev": dict(),
         "location": dict(),
         "revision": dict(),
     }
     for table in tables:
-        provenance.cursor.execute(f"SELECT COUNT(*) FROM {table}")
-        tables[table]["row_count"] = provenance.cursor.fetchone()[0]
+        with provenance.storage.transaction() as cursor:
+            cursor.execute(f"SELECT COUNT(*) FROM {table}")
+            tables[table]["row_count"] = cursor.fetchone()[0]

-        provenance.cursor.execute(f"SELECT pg_table_size('{table}')")
-        tables[table]["table_size"] = provenance.cursor.fetchone()[0]
+            cursor.execute(f"SELECT pg_table_size('{table}')")
+            tables[table]["table_size"] = cursor.fetchone()[0]

-        provenance.cursor.execute(f"SELECT pg_indexes_size('{table}')")
-        tables[table]["indexes_size"] = provenance.cursor.fetchone()[0]
+            cursor.execute(f"SELECT pg_indexes_size('{table}')")
+            tables[table]["indexes_size"] = cursor.fetchone()[0]

-        # provenance.cursor.execute(f"SELECT pg_total_relation_size('{table}')")
-        # relation_size[table] = provenance.cursor.fetchone()[0]
-        tables[table]["relation_size"] = (
-            tables[table]["table_size"] + tables[table]["indexes_size"]
-        )
+            # cursor.execute(f"SELECT pg_total_relation_size('{table}')")
+            # relation_size[table] = cursor.fetchone()[0]
+            tables[table]["relation_size"] = (
+                tables[table]["table_size"] + tables[table]["indexes_size"]
+            )

     return tables


 if __name__ == "__main__":
     # Get provenance object.
     provenance = get_provenance(**conninfo)
+    # TODO: use ProvenanceStorageInterface instead!
+    assert isinstance(provenance.storage, ProvenanceStoragePostgreSql)

     tables = get_tables_stats(provenance)

     for table in tables:
         row_count = tables[table]["row_count"]
         table_size = tables[table]["table_size"]
         indexes_size = tables[table]["indexes_size"]
         relation_size = tables[table]["relation_size"]

         print(f"{table}:")
         print(f"    total rows: {row_count}")
         if row_count == 0:
             row_count = 1
         print(
             f"    table size: {table_size} bytes ({table_size / row_count:.2f} per row)"
         )
         print(
             f"    index size: {indexes_size} bytes ({indexes_size / row_count:.2f} per row)"
         )
         print(
             f"    total size: {relation_size} bytes ({relation_size / row_count:.2f} per row)"
         )

     # Ratios between de different entities/relations.
print("ratios:") print( f" content/revision: {tables['content']['row_count'] / (tables['revision']['row_count'] if tables['revision']['row_count'] != 0 else 1):.2f}" ) print( f" content_early_in_rev/content: {tables['content_early_in_rev']['row_count'] / (tables['content']['row_count'] if tables['content']['row_count'] != 0 else 1):.2f}" ) print( f" content_in_dir/content: {tables['content_in_dir']['row_count'] / (tables['content']['row_count'] if tables['content']['row_count'] != 0 else 1):.2f}" ) print( f" directory/revision: {tables['directory']['row_count'] / (tables['revision']['row_count'] if tables['revision']['row_count'] != 0 else 1):.2f}" ) print( f" directory_in_rev/directory: {tables['directory_in_rev']['row_count'] / (tables['directory']['row_count'] if tables['directory']['row_count'] != 0 else 1):.2f}" ) print(f" ==============================") print( f" content_early_in_rev/revision: {tables['content_early_in_rev']['row_count'] / (tables['revision']['row_count'] if tables['revision']['row_count'] != 0 else 1):.2f}" ) print( f" content_in_dir/directory: {tables['content_in_dir']['row_count'] / (tables['directory']['row_count'] if tables['directory']['row_count'] != 0 else 1):.2f}" ) print( f" directory_in_rev/revision: {tables['directory_in_rev']['row_count'] / (tables['revision']['row_count'] if tables['revision']['row_count'] != 0 else 1):.2f}" ) # Metrics for frontiers defined in root directories. - provenance.cursor.execute( - f"""SELECT dir - FROM directory_in_rev - INNER JOIN location - ON loc=location.id - WHERE location.path=%s""", - (b"",), - ) - directories = list(provenance.cursor.fetchall()) - print(f"Total root frontiers used: {len(directories)}") - - provenance.cursor.execute( - f"""SELECT dir - FROM directory_in_rev - INNER JOIN location - ON loc=location.id - WHERE location.path=%s - GROUP BY dir""", - (b"",), - ) - directories = list(provenance.cursor.fetchall()) - print(f"Total distinct root frontiers: {len(directories)}") - - provenance.cursor.execute( - f"""SELECT roots.dir - FROM (SELECT dir, loc - FROM directory_in_rev - INNER JOIN location - ON loc=location.id - WHERE location.path=%s) AS roots - JOIN directory_in_rev - ON directory_in_rev.dir=roots.dir - WHERE directory_in_rev.loc!=roots.loc""", - (b"",), - ) - directories = list(provenance.cursor.fetchall()) - print(f"Total other uses of these frontiers: {len(directories)}") - - provenance.cursor.execute( - f"""SELECT roots.dir - FROM (SELECT dir, loc - FROM directory_in_rev - INNER JOIN location - ON loc=location.id - WHERE location.path=%s) AS roots - JOIN directory_in_rev - ON directory_in_rev.dir=roots.dir - WHERE directory_in_rev.loc!=roots.loc - GROUP BY roots.dir""", - (b"",), - ) - directories = list(provenance.cursor.fetchall()) - print(f"Total distinct other uses of frontiers: {len(directories)}") + with provenance.storage.transaction() as cursor: + cursor.execute( + f"""SELECT dir + FROM directory_in_rev + INNER JOIN location + ON loc=location.id + WHERE location.path=%s""", + (b"",), + ) + directories = list(cursor.fetchall()) + print(f"Total root frontiers used: {len(directories)}") + + cursor.execute( + f"""SELECT dir + FROM directory_in_rev + INNER JOIN location + ON loc=location.id + WHERE location.path=%s + GROUP BY dir""", + (b"",), + ) + directories = list(cursor.fetchall()) + print(f"Total distinct root frontiers: {len(directories)}") + + cursor.execute( + f"""SELECT roots.dir + FROM (SELECT dir, loc + FROM directory_in_rev + INNER JOIN location + ON loc=location.id + WHERE 
+                  JOIN directory_in_rev
+                    ON directory_in_rev.dir=roots.dir
+                 WHERE directory_in_rev.loc!=roots.loc""",
+            (b"",),
+        )
+        directories = list(cursor.fetchall())
+        print(f"Total other uses of these frontiers: {len(directories)}")
+
+        cursor.execute(
+            f"""SELECT roots.dir
+                  FROM (SELECT dir, loc
+                          FROM directory_in_rev
+                         INNER JOIN location
+                            ON loc=location.id
+                         WHERE location.path=%s) AS roots
+                  JOIN directory_in_rev
+                    ON directory_in_rev.dir=roots.dir
+                 WHERE directory_in_rev.loc!=roots.loc
+                 GROUP BY roots.dir""",
+            (b"",),
+        )
+        directories = list(cursor.fetchall())
+        print(f"Total distinct other uses of frontiers: {len(directories)}")
diff --git a/revisions_format.py b/revisions_format.py
index 8afc200..7f5bf92 100755
--- a/revisions_format.py
+++ b/revisions_format.py
@@ -1,43 +1,74 @@
 #!/usr/bin/env python

-import io
-import json
+import gzip
 import sys
+from typing import IO, Iterable

-from swh.model.hashutil import hash_to_hex, hash_to_bytes
-from swh.provenance.postgresql.db_utils import connect
-
+import psycopg2
+from swh.core.db import BaseDb
+from swh.model.hashutil import hash_to_bytes, hash_to_hex
+from swh.model.model import Sha1Git

 conninfo = {
     "host": "db.internal.softwareheritage.org",
     "dbname": "softwareheritage",
     "user": "guest",
 }


+def write_output(
+    cursor: psycopg2.extensions.cursor, ids: Iterable[Sha1Git], outfile: IO[str]
+) -> None:
+    cursor.execute(
+        """SELECT id, date, directory
+             FROM revision
+            WHERE id IN %s
+              AND date IS NOT NULL
+            ORDER BY date""",
+        (tuple(ids),),
+    )
+    for rev in cursor.fetchall():
+        assert rev is not None, rev
+        assert rev[1] is not None, rev
+        outfile.write(f"{hash_to_hex(rev[0])},{rev[1]},{hash_to_hex(rev[2])}\n")
+
+
 if __name__ == "__main__":
     if len(sys.argv) != 3:
         print("usage: revisions_format <infile> <outfile>")
         exit(-1)

     print(f"Connection to database: {conninfo}...")
-    conn = connect(conninfo)
+    conn = BaseDb.connect(**conninfo).conn
+    BaseDb.adapt_conn(conn)
+    cursor = conn.cursor()

     infilename = sys.argv[1]
     outfilename = sys.argv[2]

-    with io.open(infilename) as infile:
-        with io.open(outfilename, "w") as outfile:
-            ids = json.loads(infile.read())
-            print(f"Formatting {len(ids)} revisions")
-            for id in ids:
-                cursor = conn.cursor()
-                cursor.execute(
-                    """SELECT id, date, directory
-                         FROM revision
-                        WHERE id=%s AND date IS NOT NULL""",
-                    (hash_to_bytes(id),),
-                )
-                rev = cursor.fetchone()
-                assert rev is not None
-                outfile.write(f"{hash_to_hex(rev[0])},{rev[1]},{hash_to_hex(rev[2])}\n")
+    # with io.open(infilename) as infile:
+    #     with io.open(outfilename, "w") as outfile:
+    #         ids = json.loads(infile.read())
+    #         print(f"Formatting {len(ids)} revisions")
+    #         for id in ids:
+    #             cursor.execute(
+    #                 """SELECT id, date, directory
+    #                      FROM revision
+    #                      WHERE id=%s AND date IS NOT NULL""",
+    #                 (hash_to_bytes(id),),
+    #             )
+    #             rev = cursor.fetchone()
+    #             assert rev is not None
+    #             outfile.write(f"{hash_to_hex(rev[0])},{rev[1]},{hash_to_hex(rev[2])}\n")
+
+    with gzip.open(infilename, "rt") as infile:
+        with gzip.open(outfilename, "wt") as outfile:
+            ids = []
+            for idx, line in enumerate(infile.readlines(), start=1):
+                if line.strip():
+                    ids.append(hash_to_bytes(line.split(",")[0]))
+                if idx % 100 == 0:
+                    write_output(cursor, ids, outfile)
+                    ids = []
+            if ids:
+                write_output(cursor, ids, outfile)
diff --git a/revisions_pick.py b/revisions_pick.py
index 4a05647..f54e62a 100755
--- a/revisions_pick.py
+++ b/revisions_pick.py
@@ -1,59 +1,60 @@
 #!/usr/bin/env python

 import io
 import sys

-from swh.model.hashutil import hash_to_hex, hash_to_bytes
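+# NOTE: BaseDb from swh.core.db replaces the removed db_utils.connect helper.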
-from swh.provenance.postgresql.db_utils import connect
-
+import psycopg2
+from swh.core.db import BaseDb
+from swh.model.hashutil import hash_to_bytes, hash_to_hex

 conninfo = {
     "host": "db.internal.softwareheritage.org",
     "dbname": "softwareheritage",
     "user": "guest",
 }


 if __name__ == "__main__":
     if len(sys.argv) != 2:
         print("usage: listrevs <filename>")
         exit(-1)
     filename = sys.argv[1]

     print(f"Connection to database: {conninfo}...")
-    conn = connect(conninfo)
+    conn: psycopg2.extensions.connection = BaseDb.connect(**conninfo).conn
+    BaseDb.adapt_conn(conn)
     cursor = conn.cursor()

     revisions = set(
         [
             hash_to_bytes("1363496c1106606684d40447f5d1149b2c66a9f8"),
             hash_to_bytes("b91a781cbc1285d441aa682926d93d8c23678b0b"),
             hash_to_bytes("313315d9790c36e22bb5bb034e9c7d7f470cdf73"),
             hash_to_bytes("a3b54f0f5de1ad17889fd23aee7c230eefc300cd"),
             hash_to_bytes("74deb33d12bf275a3b3a9afc833f4760be90f031"),
         ]
     )
     pending = revisions
     while pending:
         cursor.execute(
             """SELECT parent_id
                  FROM revision_history
                 WHERE id IN %s""",
             (tuple(pending),),
         )
         parents = set(map(lambda row: row[0], cursor.fetchall()))
         pending = parents - revisions
         revisions = revisions | parents

     # print(f"Requesting {count} revisions out of {total} (probability {probability}).")
     cursor.execute(
         """SELECT id, date, directory
              FROM revision
             WHERE id IN %s""",
         (tuple(revisions),),
     )
     ordered = [row for row in cursor.fetchall() if row[1] is not None]
     ordered.sort(key=lambda rev: rev[1])

     print(f"Obtained {len(ordered)} revisions.")
     with io.open(filename, "w") as outfile:
         for rev in ordered:
             outfile.write(f"{hash_to_hex(rev[0])},{rev[1]},{hash_to_hex(rev[2])}\n")
diff --git a/revisions_sort.py b/revisions_sort.py
new file mode 100755
index 0000000..2eb647d
--- /dev/null
+++ b/revisions_sort.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+
+import gzip
+import sys
+from datetime import datetime
+
+from swh.model.hashutil import hash_to_bytes, hash_to_hex
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print("usage: revisions_sort <infile> <outfile>")
+        exit(-1)
+
+    infilename = sys.argv[1]
+    outfilename = sys.argv[2]
+
+    with gzip.open(infilename, "rt") as infile:
+        revisions = []
+        sort = False
+        for idx, line in enumerate(infile.readlines(), start=1):
+            if line.strip():
+                splitted = line.split(",")
+                revision = hash_to_bytes(splitted[0])
+                date = datetime.fromisoformat(splitted[1])
+                root = hash_to_bytes(splitted[2])
+
+                assert date is not None
+
+                if revisions:
+                    last = revisions[-1]
+                    if date < last[1]:
+                        print("Out of order", last, f"({revision},{date},{root})")
+                        sort = True
+
+                revisions.append((revision, date, root))
+
+    if sort:
+        revisions = sorted(revisions, key=lambda rev: rev[1])
+
+    date = None
+    with gzip.open(outfilename, "wt") as outfile:
+        for rev in revisions:
+            assert date is None or date <= rev[1]
+            date = rev[1]
+            outfile.write(
+                f"{hash_to_hex(rev[0])},{rev[1]},{hash_to_hex(rev[2])}\n"
+            )