compare-first.py

#!/usr/bin/env python
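"""Compare the first occurrence of each content blob between two provenance
databases: the one described by conninfo1 (queried directly over SQL) and the
one described by conninfo2 (queried through content_find_first). Timestamp
mismatches are appended to per-database log files in the current directory."""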
import glob
import io
import logging
import os
from swh.model.hashutil import hash_to_hex
from swh.provenance import get_provenance
from swh.provenance.postgresql.provenance import ProvenanceStoragePostgreSql
# TODO: take conninfo as command line arguments.
conninfo1 = {
    "cls": "local",
    "db": {"host": "/var/run/postgresql", "port": "5436", "dbname": "old"},
}
conninfo2 = {
    "cls": "local",
    "db": {"host": "/var/run/postgresql", "port": "5436", "dbname": "provenance"},
}


# Write a log file with the occurrence detail.
def logdiff(filename, occurrence):
    with io.open(filename, "a") as outfile:
        try:
            # Try to decode the path.
            path = os.fsdecode(occurrence[3])
        except Exception:
            # Use its raw value if decoding is not possible.
            path = occurrence[3]
        outfile.write(
            "{blob},{rev},{date},{path}\n".format(
                blob=hash_to_hex(occurrence[0]),
                rev=hash_to_hex(occurrence[1]),
                date=occurrence[2],
                path=path,
            )
        )


# Write a log file with the list of occurrences.
def loglist(filename, occurrences):
    with io.open(filename, "a") as outfile:
        for blobid in occurrences:
            outfile.write("{blob}\n".format(blob=hash_to_hex(blobid)))


# Output log file name.
nextidx = None


def outfilename(suffix):
    global nextidx
    basename, _ = os.path.splitext(os.path.basename(os.path.abspath(__file__)))
    prefix = os.path.join(os.getcwd(), basename + "-")
    if nextidx is None:
        # On the first call, scan existing log files to find the next unused index.
        nextidx = 0
        for filename in glob.glob(f"{prefix}*.log"):
            try:
                lastidx = int(filename[len(prefix):].split("-")[0])
                nextidx = max(nextidx, lastidx + 1)
            except ValueError:
                continue
    return f"{prefix}{nextidx:02}-{suffix}.log"


# Print iterations progress.
# TODO: move to utils module.
def progress(
    iteration,
    total,
    prefix="Progress:",
    suffix="Complete",
    decimals=1,
    length=50,
    fill="█",
    printEnd="\r",
):
    """
    Call in a loop to create a terminal progress bar.
    @params:
        iteration - Required : current iteration (Int)
        total     - Required : total iterations (Int)
        prefix    - Optional : prefix string (Str)
        suffix    - Optional : suffix string (Str)
        decimals  - Optional : positive number of decimals in percent complete (Int)
        length    - Optional : character length of bar (Int)
        fill      - Optional : bar fill character (Str)
        printEnd  - Optional : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + "-" * (length - filledLength)
    print(f"\r{prefix} |{bar}| {percent}% {suffix}", end=printEnd)
    # Print a new line on completion.
    if iteration == total:
        print()


if __name__ == "__main__":
    # Set minimum logging level to INFO.
    logging.getLogger().setLevel(logging.INFO)

    # Get a provenance object for both databases and query their lists of content.
    provenance1 = get_provenance(**conninfo1)
    provenance2 = get_provenance(**conninfo2)

    # TODO: use ProvenanceStorageInterface instead!
    assert isinstance(provenance1.storage, ProvenanceStoragePostgreSql)
    assert isinstance(provenance2.storage, ProvenanceStoragePostgreSql)

    with provenance1.storage.transaction() as cursor:
        cursor.execute("""SELECT id FROM content ORDER BY id""")
        content1 = set(map(lambda row: row[0], cursor.fetchall()))

    with provenance2.storage.transaction() as cursor:
        cursor.execute("""SELECT sha1 FROM content ORDER BY sha1""")
        content2 = set(map(lambda row: row[0], cursor.fetchall()))

    if content1 == content2:
        # If the lists of content match, check that the first occurrences do as well.
        total = len(content1)
        progress(0, total)
        mismatch = False
        # Iterate over all content, querying its first occurrence on both databases.
        for i, blobid in enumerate(content1):
            with provenance1.storage.transaction() as cursor:
                cursor.execute(
                    """SELECT content_early_in_rev.blob,
                              content_early_in_rev.rev,
                              revision.date,
                              content_early_in_rev.path
                       FROM content_early_in_rev
                       JOIN revision
                         ON revision.id=content_early_in_rev.rev
                       WHERE content_early_in_rev.blob=%s
                       ORDER BY date, rev, path ASC LIMIT 1""",
                    (blobid,),
                )
                occurrence1 = cursor.fetchone()
            occurrence2 = provenance2.content_find_first(blobid)
            # If there is a mismatch, log it to file. Only the timestamps are compared,
            # as the same blob might be seen for the first time in different locations.
            if occurrence1[2] != occurrence2[2]:
                mismatch = True
                logging.warning(f"Occurrences mismatch for {hash_to_hex(blobid)}")
                logdiff(outfilename(conninfo1["db"]["dbname"]), occurrence1)
                logdiff(outfilename(conninfo2["db"]["dbname"]), occurrence2)
            progress(i + 1, total)
        if not mismatch:
            logging.info("Databases are equivalent!")
    else:
        # If the lists of content don't match, we are done.
        loglist(outfilename(conninfo1["db"]["dbname"]), content1)
        loglist(outfilename(conninfo2["db"]["dbname"]), content2)
        logging.warning("Content lists are different")
