diff --git a/vlorentz/analyze_consistency_failures.py b/vlorentz/analyze_consistency_failures.py
index 45550d5..a8ad979 100644
--- a/vlorentz/analyze_consistency_failures.py
+++ b/vlorentz/analyze_consistency_failures.py
@@ -1,897 +1,1012 @@
 import collections
 import difflib
 import hashlib
 import multiprocessing
 import multiprocessing.dummy
+import os
 import pathlib
+import pickle
 import re
 import socket
 import ssl
 import subprocess
 import sys
 import tempfile
 import traceback
 import urllib.parse

 import attr
 import dulwich.client
 import dulwich.errors
 import dulwich.object_store
 import dulwich.pack
 import dulwich.repo
 import requests
 import tqdm

+from swh.core.api.classes import stream_results_optional
 from swh.core.utils import grouper
 from swh.graph.client import RemoteGraphClient, GraphArgumentException
+from swh.loader.git.converters import (
+    dulwich_tree_to_directory,
+    dulwich_commit_to_revision,
+)
 from swh.model.hashutil import hash_to_bytes, hash_to_hex, hash_to_bytehex
 from swh.model.git_objects import directory_git_object, revision_git_object
 from swh.model.model import Directory, Origin, Person, RevisionType
 from swh.model.swhids import ObjectType, CoreSWHID, ExtendedSWHID
 from swh.storage import get_storage

 CLONES_BASE_DIR = pathlib.Path(
     "/srv/softwareheritage/cassandra-test-0/scratch/integrity_clones/"
 ).expanduser()

 MISMATCH = re.compile(
     "Checksum mismatch on (?P<obj_type>[a-z]+): (?P<obj_id>[0-9a-f]{40}) in journal, but recomputed as .*"
 )
 MISMATCH_SIGNED_OFF = re.compile(
     "Possibly missing 'gpgsig' header: (?P<obj_id>[0-9a-f]{40})"
 )
 MISMATCH_HG_TO_GIT = re.compile(
     "Possibly missing 'HG:extra' header: (?P<obj_id>[0-9a-f]{40})"
 )
 SVN_MISMATCH = re.compile("Possibly unfixable SVN revision: (?P<obj_id>[0-9a-f]{40})")
 FIXABLE = re.compile(
     r"Fixable (?P<obj_type>[a-z]+) (?P<obj_id>[0-9a-f]{40}) \((?P<fix>.*)\)"
 )
 UNORDERED_DIRECTORY = re.compile(
     r"Weird directory checksum (?P<obj_id>[0-9a-f]{40}) \(computed without sorting\)"
 )
 NOISE = re.compile(r"Called Ctrl-C\, exiting\.")

 ENCODINGS = (
     b"SHIFT_JIS",
     b"Shift-JIS",
     b"shift-jis",
     b"shift_jis",
+    b"Shift_JIS",
     b"SJIS",
     b"iso8859-1",
     b"iso-8859-1",
     b"ISO-8859-1",
     b" ISO-8859-1",
     b"iso8859-15",
+    b"ISO-8859-1]",
     b"UTF8]",
     b"UTF-8 UTF8",
     b"{utf-8}",
     b"iso-latin-1",
     b"'Latin-1'",
     b"ISO8859-15",
     b"iso-8859-15",
     b"ISO-8859-15",
     b"euc-kr",
     b"EUC-JP",
     b"koi8-r",
     b"big5",
     b"ISO-8859-2",
     b"iso8859-2",
     b"ru_RU.KOI8-R",
     b"cp1250",
-    b"CP-1252",
+    b"CP-1250",
     b"cp-1251",
+    b"CP-1252",
     b"cp932",
     b"latin-1",
     b"Latin-1",
     b"latin1",
     b"Latin1",
     b"ISO-2022-JP",
     b"KOI8-R",
     b"windows-1252",
     b"windows-1250",
     b"euckr",
     b"ISO-88592",
     b"iso10646-1",
     b"iso-8859-7",
     b"=",
     b"CP950",
     b"win",
     b"win-1251",
     b"utf",
     b"{UTF-8|GBK}",
     b"GBKe",
     b"UTF-16",
     b"utf-16",
     b"GB18030",
     b"GB23",
     b"true",  # wat
     b"BIG5",
     b"cp866",
     b"CP-1251",
     b"cp1251",
+    b"cp949",
     b"latin2",
     b"'windows-1252'",
     b"utf-8logoutputencoding=gbk",  # wat
     b"gb18030",
     b"UTF-8-MAC UTF8-MAC",
     b"cp",
     b"ANSI",
     b"ru_RU.UTF8",
     b"ru_RU.utf8",
     b"UTF-8",
     b"utf-8",
     b"zh_CN.GB18030",
     b"window-1252",
     b"iso-2022-jp",
+    b"en_US.UTF-8",
 )

 graph = RemoteGraphClient("http://graph.internal.softwareheritage.org:5009/graph/")

 REVISIONS = {}


 def get_clone_path(origin_url):
     origin_id = Origin(url=origin_url).swhid()
     dirname = f"{origin_id}_{origin_url.replace('/', '_')}"
     return CLONES_BASE_DIR / dirname


 def clone(origin_url):
     clone_path = get_clone_path(origin_url)
     if clone_path.is_dir():
         # already cloned
         return
     # print("Cloning", origin_url)
     subprocess.run(
         ["git", "clone", "--bare", origin_url, clone_path],
         check=True,
         stdout=subprocess.DEVNULL,
         stderr=subprocess.DEVNULL,
     )
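Throughout both scripts, the invariant being checked is the standard git object
identity: an object id is the SHA-1 of a type-and-length header followed by the
raw body. A minimal standalone sketch of that identity (not part of the patch;
the helper name is made up):

    import hashlib

    def git_object_id(obj_type: bytes, body: bytes) -> bytes:
        # id = sha1(b"<type> <decimal length>\x00<body>")
        header = obj_type + b" " + str(len(body)).encode() + b"\x00"
        return hashlib.sha1(header + body).digest()

    # sanity check against git's well-known empty-blob id
    assert git_object_id(b"blob", b"").hex() == "e69de29bb2d1d6434b8b29ae775ad8c2e48c5391"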


 def get_object_from_clone(origin_url, obj_id):
     clone_path = get_clone_path(origin_url)
     repo = dulwich.repo.Repo(str(clone_path))
     try:
         return repo[hash_to_bytehex(obj_id)]
     except dulwich.errors.ObjectFormatException:
         # fallback to git if dulwich can't parse it
         object_type = (
             subprocess.check_output(
                 ["git", "-C", clone_path, "cat-file", "-t", hash_to_hex(obj_id)]
             )
             .decode()
             .strip()
         )
         manifest = subprocess.check_output(
             ["git", "-C", clone_path, "cat-file", object_type, hash_to_hex(obj_id)]
         )
         print(f"Dulwich failed to parse: {manifest!r}")
         traceback.print_exc()


 def _load_revisions(ids):
     ids = list(ids)
     storage = get_storage(
         "remote", url="http://webapp1.internal.softwareheritage.org:5002/"
     )
     return dict(zip(ids, storage.revision_get(ids)))


 def main(input_fd):
     digest = collections.defaultdict(set)

     # Parse logs from check_consistency.py to 'digest'
     for line in tqdm.tqdm(
         list(input_fd), desc="parsing input", unit="line", unit_scale=True
     ):
         handle_line(digest, line)

     # preload revisions in batches
-    revision_id_groups = list(grouper(digest["mismatch_misc_revision"], 1000))[0:200]
+    # revision_id_groups = list(grouper(digest["mismatch_misc_revision"], 1000))[0:100]
     # revision_id_groups = list(grouper(digest["mismatch_hg_to_git"], 1000))
-    # revision_id_groups = list(
-    #     grouper(digest["mismatch_misc_revision"] | digest["mismatch_hg_to_git"], 1000)
-    # )
+    revision_id_groups = list(
+        grouper(digest["mismatch_misc_revision"] | digest["mismatch_hg_to_git"], 1000)
+    )
     with multiprocessing.dummy.Pool(10) as p:
         for revisions in tqdm.tqdm(
             p.imap_unordered(_load_revisions, revision_id_groups),
             desc="loading revisions",
             unit="k revs",
             total=len(revision_id_groups),
         ):
             REVISIONS.update(revisions)

     # Try to fix objects one by one
     with multiprocessing.Pool(32) as p:
-        # for key in ("mismatch_misc_revision",):
-        # for key in ("mismatch_hg_to_git",):
         for key in (
             "mismatch_misc_revision",
             "mismatch_hg_to_git",
         ):
             obj_ids = list(digest.pop(key))
             for (obj_id, new_key) in tqdm.tqdm(
                 p.imap_unordered(try_revision_recovery, obj_ids, chunksize=100),
                 desc=f"recovering {key}",
                 unit="rev",
                 total=len(obj_ids),
                 smoothing=0.01,
             ):
                 digest[new_key].add(obj_id)

     for (type_, obj_ids) in sorted(digest.items()):
-        print(f"{(type_ + ':'):20} {len(obj_ids)}")
+        print(f"{len(obj_ids)}\t{(type_ + ':')}")
+
+    with open("analyze_consistency_failures/results.pickle", "wb") as fd:
+        pickle.dump(dict(digest), fd)
+
+
+def write_fixed_manifest(swhid, manifest):
+    dir_path = os.path.join(
+        "analyze_consistency_failures", hash_to_hex(swhid.object_id)[0:2]
+    )
+    os.makedirs(dir_path, exist_ok=True)
+    with open(f"{dir_path}/{swhid}.git_manifest", "wb") as fd:
+        fd.write(manifest)
+
+
+def write_fixed_object(swhid, obj):
+    dir_path = os.path.join(
+        "analyze_consistency_failures", hash_to_hex(swhid.object_id)[0:2]
+    )
+    os.makedirs(dir_path, exist_ok=True)
+    with open(f"{dir_path}/{swhid}.pickle", "wb") as fd:
+        pickle.dump(obj.to_dict(), fd)
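The two writer helpers added above shard their output by the first two hex
digits of the object id, mirroring the layout of .git/objects/. A sketch of
the resulting paths, using a made-up revision id:

    from swh.model.swhids import CoreSWHID, ObjectType

    swhid = CoreSWHID(
        object_type=ObjectType.REVISION,
        object_id=bytes.fromhex("94a9ed024d3859793618152ea559a168bbcbb5e2"),
    )
    # write_fixed_object(...)   -> analyze_consistency_failures/94/swh:1:rev:94a9....pickle
    # write_fixed_manifest(...) -> analyze_consistency_failures/94/swh:1:rev:94a9....git_manifest
    print(f"analyze_consistency_failures/{swhid.object_id.hex()[:2]}/{swhid}.pickle")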


 def handle_line(digest, line):
     line = line.strip()
     if not line:
         return
     if NOISE.fullmatch(line):
         return
     m = MISMATCH.fullmatch(line)
     if m:
         obj_type = m.group("obj_type")
         obj_id = m.group("obj_id")
         digest[f"mismatch_misc_{obj_type}"].add(hash_to_bytes(obj_id))
         return
     m = MISMATCH_SIGNED_OFF.fullmatch(line)
     if m:
         obj_id = m.group("obj_id")
         digest["mismatch_misc_revision"].add(hash_to_bytes(obj_id))
         return
     m = MISMATCH_HG_TO_GIT.fullmatch(line)
     if m:
         obj_id = m.group("obj_id")
         digest["mismatch_hg_to_git"].add(hash_to_bytes(obj_id))
         return
     m = SVN_MISMATCH.fullmatch(line)
     if m:
         digest["mismatch_svn"].add(hash_to_bytes(m.group("obj_id")))
         return
     m = FIXABLE.fullmatch(line)
     if m:
         digest["fixable"].add(hash_to_bytes(m.group("obj_id")))
         return
     m = UNORDERED_DIRECTORY.fullmatch(line)
     if m:
         digest["unordered_dir"].add(hash_to_bytes(m.group("obj_id")))
         return

     # Two messages sometimes ended up on the same line; try to split it
     for regexp in (
         MISMATCH,
         MISMATCH_SIGNED_OFF,
         MISMATCH_HG_TO_GIT,
         SVN_MISMATCH,
         FIXABLE,
         UNORDERED_DIRECTORY,
         NOISE,
     ):
         match = regexp.match(line)
         if match:
             first_message = match.group(0)
             handle_line(digest, first_message)
             handle_line(digest, line[len(first_message) :])
             break
     else:
         assert False, line


 def try_revision_recovery(obj_id):
     try:
         return (obj_id, _try_recovery(ObjectType.REVISION, obj_id))
     except KeyboardInterrupt:
         # clean up stack traces a bit
         raise KeyboardInterrupt from None


 def _try_recovery(obj_type, obj_id):
     """Tries fixing the given obj_id, and returns the digest key it should be
     added to."""
     obj_id = hash_to_bytes(obj_id)
     swhid = CoreSWHID(object_type=obj_type, object_id=obj_id)
     storage = get_storage(
         "remote", url="http://webapp1.internal.softwareheritage.org:5002/"
     )

     if obj_type == ObjectType.REVISION:
         stored_obj = REVISIONS[obj_id]
         if stored_obj is None:
             return "revision_missing_from_storage"
         if stored_obj.type != RevisionType.GIT:
             return f"mismatch_misc_revision_{stored_obj.type.value}"
         stored_manifest = revision_git_object(stored_obj)
     elif obj_type == ObjectType.DIRECTORY:
         stored_obj = Directory(
             id=obj_id,
             entries=list(
                 stream_results_optional(storage.directory_get_entries, obj_id)
             ),
         )
         stored_manifest = directory_git_object(stored_obj)
     else:
         assert False, obj_type

     assert obj_id == stored_obj.id
     assert obj_id != stored_obj.compute_hash(), "Hash matches this time?!"

-    # Try adding spaces between name and email
-    for committer_padding in (0, 1, 2, 4, 5, 8, 16, 32):
-        for author_padding in (0, 1, 2, 4, 5, 8, 16, 32):
-            fixed_stored_obj = attr.evolve(
-                stored_obj,
-                author=Person(
-                    fullname=stored_obj.author.fullname.replace(
-                        b" <", b" " + b" " * author_padding + b"<"
-                    ),
-                    name=b"",
-                    email=b"",
-                ),
-                committer=Person(
-                    fullname=stored_obj.committer.fullname.replace(
-                        b" <", b" " + b" " * committer_padding + b"<"
-                    ),
-                    name=b"",
-                    email=b"",
-                ),
-            )
-            if fixed_stored_obj.compute_hash() == obj_id:
-                return "fixable_author_middle_spaces"
-
     # Try adding leading space to email
     # (very crude, this assumes author = committer)
     fullname = stored_obj.author.fullname.replace(b" <", b" < ")
     fixed_stored_obj = attr.evolve(
         stored_obj,
         author=Person(fullname=fullname, name=b"", email=b""),
         committer=Person(fullname=fullname, name=b"", email=b""),
     )
     if fixed_stored_obj.compute_hash() == obj_id:
+        write_fixed_object(swhid, fixed_stored_obj)
         return "fixable_author_email_leading_space"

     # Try adding trailing spaces to email
     for trailing in [b" " * i for i in range(8)] + [b"\r", b" \r", b"\t"]:
         for (pad_author, pad_committer) in ((1, 0), (0, 1), (1, 1)):
             fixed_stored_obj = attr.evolve(
                 stored_obj,
                 author=attr.evolve(
                     stored_obj.author,
                     fullname=stored_obj.author.fullname[0:-1] + trailing + b">",
                 )
                 if pad_author
                 else stored_obj.author,
                 committer=attr.evolve(
                     stored_obj.committer,
                     fullname=stored_obj.committer.fullname[0:-1] + trailing + b">",
                 )
                 if pad_committer
                 else stored_obj.committer,
             )
             if fixed_stored_obj.compute_hash() == obj_id:
+                write_fixed_object(swhid, fixed_stored_obj)
                 return "fixable_author_email_trailing_whitespace"
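Every attempt in _try_recovery follows the same pattern: build a mutated
candidate with attr.evolve(), and accept the mutation only if its recomputed
hash equals the id recorded in the journal. Reduced to a sketch (rev is a
hypothetical swh.model.model.Revision):

    import attr

    def accept_if_matches(rev, obj_id: bytes, **changes):
        candidate = attr.evolve(rev, **changes)
        return candidate if candidate.compute_hash() == obj_id else None

    # e.g.: accept_if_matches(rev, obj_id, message=b"\n" + (rev.message or b""))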

+    # Try adding carriage return to name *and* email
+    for (pad_author, pad_committer) in ((1, 0), (0, 1), (1, 1)):
+        fixed_stored_obj = attr.evolve(
+            stored_obj,
+            author=attr.evolve(
+                stored_obj.author,
+                fullname=stored_obj.author.fullname.replace(b" <", b"\r <").replace(
+                    b">", b"\r>"
+                ),
+            )
+            if pad_author
+            else stored_obj.author,
+            committer=attr.evolve(
+                stored_obj.committer,
+                fullname=stored_obj.committer.fullname.replace(b" <", b"\r <").replace(
+                    b">", b"\r>"
+                ),
+            )
+            if pad_committer
+            else stored_obj.committer,
+        )
+        if fixed_stored_obj.compute_hash() == obj_id:
+            write_fixed_object(swhid, fixed_stored_obj)
+            return "fixable_author_name_email_trailing_whitespace"
+
     # Try adding spaces before the name
     for i in range(1, 4):
         fullname = b" " * i + stored_obj.author.fullname
         fixed_stored_obj = attr.evolve(
             stored_obj,
             author=Person(fullname=fullname, name=b"", email=b""),
             committer=Person(fullname=fullname, name=b"", email=b""),
         )
         if fixed_stored_obj.compute_hash() == obj_id:
+            write_fixed_object(swhid, fixed_stored_obj)
             return "fixable_author_leading_spaces"

     # Try adding spaces between name and email
-    for i in range(1, 31):
+    for i in range(1, 32):
         fullname = stored_obj.author.fullname.replace(b" <", b" " * i + b"<", 1)
         fixed_stored_obj = attr.evolve(
             stored_obj,
             author=Person(fullname=fullname, name=b"", email=b""),
             committer=Person(fullname=fullname, name=b"", email=b""),
         )
         if fixed_stored_obj.compute_hash() == obj_id:
-            return "fixable_author_leading_spaces"
+            write_fixed_object(swhid, fixed_stored_obj)
+            return "fixable_author_middle_spaces"
+
+    # Try again but with differing values
+    for committer_padding in (0, 1, 2, 4, 5, 8, 16, 32):
+        for author_padding in (0, 1, 2, 4, 5, 8, 16, 32):
+            fixed_stored_obj = attr.evolve(
+                stored_obj,
+                author=Person(
+                    fullname=stored_obj.author.fullname.replace(
+                        b" <", b" " + b" " * author_padding + b"<"
+                    ),
+                    name=b"",
+                    email=b"",
+                ),
+                committer=Person(
+                    fullname=stored_obj.committer.fullname.replace(
+                        b" <", b" " + b" " * committer_padding + b"<"
+                    ),
+                    name=b"",
+                    email=b"",
+                ),
+            )
+            if fixed_stored_obj.compute_hash() == obj_id:
+                return "fixable_author_middle_spaces"

     # Try adding spaces around the name
     for i in range(1, 4):
         fullname = b" " * i + stored_obj.author.fullname.replace(
             b" <", b" " * i + b" <"
         )
         fixed_stored_obj = attr.evolve(
             stored_obj,
             author=Person(fullname=fullname, name=b"", email=b""),
             committer=Person(fullname=fullname, name=b"", email=b""),
         )
         if fixed_stored_obj.compute_hash() == obj_id:
+            write_fixed_object(swhid, fixed_stored_obj)
             return "fixable_author_leading_and_middle_spaces"

     # Try adding spaces after the fullname
     fullname = stored_obj.author.fullname + b" "
     fixed_stored_obj = attr.evolve(
         stored_obj,
         author=Person(fullname=fullname, name=b"", email=b""),
         committer=Person(fullname=fullname, name=b"", email=b""),
     )
     if fixed_stored_obj.compute_hash() == obj_id:
+        write_fixed_object(swhid, fixed_stored_obj)
         return "fixable_author_trailing_space"
     for _ in range(2):
         fixed_stored_obj = attr.evolve(
             fixed_stored_obj, message=b"\n" + (fixed_stored_obj.message or b"")
         )
         if fixed_stored_obj.compute_hash() == obj_id:
+            write_fixed_object(swhid, fixed_stored_obj)
             return "fixable_author_trailing_space_and_leading_newlines"

     # Try adding leading newlines
     if stored_obj.message is not None:
         fixed_stored_obj = stored_obj
         for _ in range(23):  # seen in the wild: any from 1 to 8, 13, 15, 22, 23
             fixed_stored_obj = attr.evolve(
                 fixed_stored_obj,
                 message=b"\n" + fixed_stored_obj.message,
             )
             if fixed_stored_obj.compute_hash() == obj_id:
+                write_fixed_object(swhid, fixed_stored_obj)
                 return "leading_newlines"
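All this whitespace fiddling matters because the hashed commit manifest embeds
the author fullname byte-for-byte, so a single padding space yields a different
SHA-1. An illustrative, simplified rendering of the hashed author line (not the
real swh.model serializer):

    def author_line(fullname: bytes, timestamp: int, offset: bytes) -> bytes:
        # real manifests render "author <fullname> <timestamp> <offset>"
        return b"author " + fullname + b" " + str(timestamp).encode() + b" " + offset

    one = author_line(b"Jane Doe <jd@example.com>", 1600000000, b"+0000")
    two = author_line(b"Jane Doe  <jd@example.com>", 1600000000, b"+0000")
    assert one != two  # one extra space, different manifest, different hash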
b"unknown ": b"unknown ", b"unknown ": b"unknown ", b"from site ": b" from site < kevoree@kevoree.org >", + b" <>": b"", } fixed_stored_obj = attr.evolve( stored_obj, author=attr.evolve( stored_obj.author, fullname=substitutions.get( stored_obj.author.fullname, stored_obj.author.fullname ), ), committer=attr.evolve( stored_obj.committer, fullname=substitutions.get( stored_obj.committer.fullname, stored_obj.committer.fullname ), ), ) if fixed_stored_obj.compute_hash() == obj_id: + write_fixed_object(swhid, fixed_stored_obj) return "fixable_author_hardcoded" if fixed_stored_obj.author.fullname == b"unknown ": fixed_stored_obj = attr.evolve( fixed_stored_obj, extra_headers=( *fixed_stored_obj.extra_headers, (b"encoding", b"ISO-8859-1"), ), ) if fixed_stored_obj.compute_hash() == obj_id: + write_fixed_object(swhid, fixed_stored_obj) return "fixable_author_and_encoding_hardcoded" + # Try removing leading space: + author = stored_obj.author + committer = stored_obj.committer + if author.fullname.startswith(b" "): + author = attr.evolve(author, fullname=author.fullname[1:]) + if committer.fullname.startswith(b" "): + committer = attr.evolve(committer, fullname=committer.fullname[1:]) + fixed_stored_obj = attr.evolve(stored_obj, author=author, committer=committer) + if fixed_stored_obj.compute_hash() == obj_id: + write_fixed_object(swhid, fixed_stored_obj) + return "fixable_author_fullname_strip_leading_space" + # When the fullname is in both the name and the email # have: xxx > # want: xxx > author = stored_obj.author committer = stored_obj.committer if b">" in author.name and b">" in author.email: - email = author.email.replace(b" ", b"") - author = Person( - fullname=author.name + b" <" + email + b">", name=author.name, email=email + author = attr.evolve( + author, + fullname=b"<".join(author.fullname.rsplit(b" <", 1)), # replace last occur ) if b">" in committer.name and b">" in committer.email: - email = committer.email.replace(b" ", b"") - committer = Person( - fullname=committer.name + b" <" + email + b">", - name=committer.name, - email=email, + committer = attr.evolve( + committer, fullname=b"<".join(committer.fullname.rsplit(b" <", 1)), # ditto ) fixed_stored_obj = attr.evolve(stored_obj, author=author, committer=committer) if fixed_stored_obj.compute_hash() == obj_id: + write_fixed_object(swhid, fixed_stored_obj) return "fixable_author_fullname_in_name_and_email" # If the timezone is 0, try some other ones offsets = {i * 60 + (+1 if i >= 0 else -1) * 59 for i in range(-12, 13)} | { -22 * 60 - 0, 0, 12 * 60 + 0, 14 * 60 + 0, 20 * 60 + 0, 80 * 60 + 0, stored_obj.committer_date.offset, stored_obj.date.offset, } for committer_offset in ( offsets if stored_obj.committer_date.offset == 0 else [stored_obj.committer_date.offset] ): for author_offset in ( offsets if stored_obj.date.offset == 0 else [stored_obj.date.offset] ): fixed_stored_obj = attr.evolve( stored_obj, date=attr.evolve( stored_obj.date, offset=author_offset, negative_utc=False ), committer_date=attr.evolve( stored_obj.committer_date, offset=committer_offset, negative_utc=False, ), ) if fixed_stored_obj.compute_hash() == obj_id: + write_fixed_object(swhid, fixed_stored_obj) return "fixable_offset" fixable_offset = attr.evolve( fixed_stored_obj, message=b"\n" + (fixed_stored_obj.message or b"") ) if fixed_stored_obj.compute_hash() == obj_id: + write_fixed_object(swhid, fixed_stored_obj) return "fixable_offset_and_newline" if stored_obj.date.offset == stored_obj.committer_date.offset == (6 * 60 + 15): fixed_stored_manifest = 
stored_manifest.replace(b"+0615", b"+0575") if hashlib.new("sha1", fixed_stored_manifest).digest() == obj_id: + write_fixed_manifest(swhid, fixed_stored_manifest) return "weird-offset=+0575" if stored_obj.date.offset == stored_obj.committer_date.offset == (7 * 60 + 0): fixed_stored_manifest = stored_manifest.replace(b"+0700", b"--700") if hashlib.new("sha1", fixed_stored_manifest).digest() == obj_id: + write_fixed_manifest(swhid, fixed_stored_manifest) return "weird-offset=--700" for offset in ( b"-041800", b"-12255", b"-72000", b"0000", b"+0575", b"+041800", b"+051800", b"+091800", b"+1558601", b"+1558010", b"+15094728", + b"+27455236", ): fixed_stored_manifest = stored_manifest.replace( b" +0000", b" " + offset ).replace(b"+51800", offset) object_header, rest = fixed_stored_manifest.split(b"\x00", 1) fixed_stored_manifest = b"commit " + str(len(rest)).encode() + b"\x00" + rest if hashlib.new("sha1", fixed_stored_manifest).digest() == obj_id: + write_fixed_manifest(swhid, fixed_stored_manifest) return f"weird-offset={offset.decode()}" # Try replacing +0002 with +02 if stored_obj.date.offset == 2 or stored_obj.committer_date.offset == 2: for (unpad_author, unpad_committer) in ((0, 1), (1, 0), (1, 1)): fixed_stored_manifest = b"\n".join( line.replace(b" +0002", b" +02") if (unpad_author and line.startswith(b"author ")) or (unpad_committer and line.startswith(b"committer ")) else line for line in stored_manifest.split(b"\n") ) (*_, rest) = fixed_stored_manifest.split(b"\x00", 1) fixed_stored_manifest = ( b"commit " + str(len(rest)).encode() + b"\x00" + rest ) if hashlib.new("sha1", fixed_stored_manifest).digest() == obj_id: + write_fixed_manifest(swhid, fixed_stored_manifest) return f"weird-offset={offset.decode()}" if fixed_stored_manifest.endswith(b"\n"): fixed_stored_manifest = fixed_stored_manifest.rstrip() (*_, rest) = fixed_stored_manifest.split(b"\x00", 1) fixed_stored_manifest = ( b"commit " + str(len(rest)).encode() + b"\x00" + rest ) - if fixed_stored_obj.compute_hash() == obj_id: - return f"weird-offset={offset.decode()}_and_no_message" + if hashlib.new("sha1", fixed_stored_manifest).digest() == obj_id: + write_fixed_manifest(swhid, fixed_stored_manifest) + return f"weird-offset={offset.decode()}" if ( stored_obj.date.offset == stored_obj.committer_date.offset == 0 and stored_obj.author.fullname.startswith(b" ") ): fixed_stored_obj = attr.evolve( stored_obj, author=attr.evolve( stored_obj.author, fullname=stored_obj.author.fullname[1:] ), committer=attr.evolve( stored_obj.committer, fullname=stored_obj.committer.fullname[1:] ), date=attr.evolve(stored_obj.date, negative_utc=True), committer_date=attr.evolve(stored_obj.committer_date, negative_utc=True), ) if fixed_stored_obj.compute_hash() == obj_id: + write_fixed_object(swhid, fixed_stored_obj) return f"fixable_space_and_negative_utc" fixed_stored_obj = attr.evolve( fixed_stored_obj, message=(stored_obj.message or b"") + b"\n", ) if fixed_stored_obj.compute_hash() == obj_id: + write_fixed_object(swhid, fixed_stored_obj) return f"fixable_space_and_newline_and_negative_utc" # Try adding an encoding header if b"encoding" not in dict(stored_obj.extra_headers): for encoding in ENCODINGS: fixed_stored_obj = attr.evolve( stored_obj, extra_headers=(*stored_obj.extra_headers, (b"encoding", encoding)), ) if fixed_stored_obj.compute_hash() == obj_id: - return f"fixable_add_encoding_{encoding}" + write_fixed_object(swhid, fixed_stored_obj) + return f"fixable_add_encoding_{encoding.decode()}" if fixed_stored_obj.message is not None: for _ in 

     # Try adding an encoding header
     if b"encoding" not in dict(stored_obj.extra_headers):
         for encoding in ENCODINGS:
             fixed_stored_obj = attr.evolve(
                 stored_obj,
                 extra_headers=(*stored_obj.extra_headers, (b"encoding", encoding)),
             )
             if fixed_stored_obj.compute_hash() == obj_id:
-                return f"fixable_add_encoding_{encoding}"
+                write_fixed_object(swhid, fixed_stored_obj)
+                return f"fixable_add_encoding_{encoding.decode()}"
             if fixed_stored_obj.message is not None:
                 for _ in range(3):
                     fixed_stored_obj = attr.evolve(
                         fixed_stored_obj,
                         message=b"\n" + (fixed_stored_obj.message or b""),
                     )
                     if fixed_stored_obj.compute_hash() == obj_id:
-                        return f"fixable_add_encoding_{encoding}_and_leading_newlines"
+                        write_fixed_object(swhid, fixed_stored_obj)
+                        return f"fixable_add_encoding_{encoding.decode()}_and_leading_newlines"

     # Try capitalizing the 'parent' revision
     stored_manifest_lines = stored_manifest.split(b"\n")
     fixed_stored_manifest_lines = [
         b"parent " + line.split(b" ")[1].upper()
         if line.startswith(b"parent ")
         else line
         for line in stored_manifest_lines
     ]
     fixed_stored_manifest = b"\n".join(fixed_stored_manifest_lines)
     if hashlib.new("sha1", fixed_stored_manifest).digest() == obj_id:
+        write_fixed_manifest(swhid, fixed_stored_manifest)
         return "capitalized_revision_parent"

     # Try removing leading zero in date offsets (very crude...)
     stored_manifest_lines = stored_manifest.split(b"\n")
     for (unpad_author, unpad_committer) in [(0, 1), (1, 0), (1, 1)]:
         fixed_stored_manifest_lines = list(stored_manifest_lines)
         if unpad_author:
             fixed_stored_manifest_lines = [
                 re.sub(br"([+-])0", lambda m: m.group(1), line)
                 if line.startswith(b"author ")
                 else line
                 for line in fixed_stored_manifest_lines
             ]
         if unpad_committer:
             fixed_stored_manifest_lines = [
                 re.sub(br"([+-])0", lambda m: m.group(1), line)
                 if line.startswith(b"committer ")
                 else line
                 for line in fixed_stored_manifest_lines
             ]
         fixed_stored_manifest = b"\n".join(fixed_stored_manifest_lines)
         object_header, rest = fixed_stored_manifest.split(b"\x00", 1)
         fixed_stored_manifest = b"commit " + str(len(rest)).encode() + b"\x00" + rest
         if hashlib.new("sha1", fixed_stored_manifest).digest() == obj_id:
-            return "unpadded_time_offset_{unpad_author}_{unpad_committer}"
+            write_fixed_manifest(swhid, fixed_stored_manifest)
+            return f"unpadded_time_offset_{unpad_author}_{unpad_committer}"

     # Try moving the nonce at the end
     if b"nonce" in dict(stored_obj.extra_headers):
         fixed_stored_obj = attr.evolve(
             stored_obj,
             extra_headers=(
                 *[(k, v) for (k, v) in stored_obj.extra_headers if k != b"nonce"],
                 *[(k, v) for (k, v) in stored_obj.extra_headers if k == b"nonce"],
             ),
         )
         if fixed_stored_obj.compute_hash() == obj_id:
+            write_fixed_object(swhid, fixed_stored_obj)
             return "fixable_move_nonce"

     for _ in range(10):
         try:
             origin_swhids = [
                 ExtendedSWHID.from_string(line)
                 for line in graph.leaves(swhid, direction="backward")
                 if line.startswith("swh:1:ori:")
             ]
         except GraphArgumentException:
             return "unrecoverable_not-in-swh-graph"
         except:
             pass
         else:
             break
     else:
         return "unrecoverable_swh-graph-crashes"

     origins = [
         origin["url"]
         for origin in storage.origin_get_by_sha1(
             [origin_swhid.object_id for origin_swhid in origin_swhids]
         )
     ]

     # swh-graph results are in non-deterministic order; so a bit of sorting avoids
     # fetching lots of different forks of the same project.
     # And for big projects with lots of forks and/or broken commits,
     # let's manually hardcode the repo with the most commits.
     PRIORITIZED_ORIGINS = [
         "https://github.com/torvalds/linux.git",
         "https://github.com/git/git.git",
         "https://github.com/nixos/nixpkgs.git",
     ]
     origins.sort(key=lambda url: "" if url in PRIORITIZED_ORIGINS else url)
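The loop below probes each candidate origin with a raw smart-HTTP request. The
hard-coded b"0032" and b"0009" prefixes are git pkt-line framing: a four-hex-
digit length field that counts itself plus the payload (b"0000" is the special
flush-pkt, not a length). A sketch of that framing:

    def pkt_line(payload: bytes) -> bytes:
        return b"%04x" % (len(payload) + 4) + payload

    assert pkt_line(b"want " + b"a" * 40 + b"\n").startswith(b"0032")  # 4+5+40+1 = 0x32
    assert pkt_line(b"done\n") == b"0009done\n"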

     for origin_url in origins:
         if not origin_url.endswith(".git"):
             origin_url += ".git"
         if origin_url == "https://github.com/reingart/python.git":
             # Fails very often...
             continue
         if ".googlecode.com/" in origin_url:
             # Does not exist anymore
             continue
         data = b"0032want " + hash_to_bytehex(obj_id) + b"\n"
         for parent in stored_obj.parents:
             data += b"0032have " + hash_to_bytehex(parent) + b"\n"
         data += b"0000"
         data += b"0009done\n"

         clone_path = get_clone_path(origin_url)
         if not clone_path.is_dir():
             # First, check if we can access the origin and if it still has the
             # commit we want.
             parsed_url = urllib.parse.urlparse(origin_url)
             if parsed_url.scheme == "git":
                 # TODO: use the dumb git proto to check?
                 try:
                     clone(origin_url)
                 except subprocess.CalledProcessError:
                     continue
             elif parsed_url.scheme in ("http", "https"):
                 # This is silly, but neither requests nor dulwich properly handle
                 # some connection terminations for some reason, so we need
                 # this home-made HTTP client
                 hostname = parsed_url.netloc
                 context = ssl.create_default_context()
                 try:
                     with socket.create_connection((hostname, 443)) as sock:
                         with context.wrap_socket(
                             sock, server_hostname=hostname
                         ) as ssock:
                             ssock.write(
                                 b"POST "
                                 + parsed_url.path.encode()
                                 + b"/git-upload-pack HTTP/1.0\r\n"
                             )
                             ssock.write(b"Host: " + hostname.encode() + b"\r\n")
                             ssock.write(
                                 b"Content-Type: application/x-git-upload-pack-request\r\n"
                             )
                             ssock.write(b"\r\n")
                             ssock.write(data)
                             response = b""
                             while True:
                                 new_data = ssock.read()
                                 if not new_data:
                                     break
                                 response += new_data
                 except (TimeoutError, socket.gaierror):
                     # Could not connect
                     continue
                 except ConnectionResetError:
                     # Could happen for various reasons, let's try anyway
                     pass
                 else:
                     (headers, body) = response.split(b"\r\n\r\n", 1)
                     (status_line, headers) = headers.split(b"\r\n", 1)
                     if b"401" in status_line or b"404" in status_line:
                         # Repo not available
                         continue
                     elif any(
-                        code in status_line for code in (b"200", b"301", b"302", b"500")
+                        code in status_line
+                        for code in (b"200", b"301", b"302", b"429", b"500", b"502")
                     ):
-                        # 500 happens on gitlab for some reason
+                        # 500 and 502 happen on gitlab for some reason
                         pass
                     else:
                         assert False, (
                             f"unexpected response when querying {hash_to_hex(obj_id)} "
                             f"on {origin_url}: {status_line}\n{body}"
                         )
                 try:
                     clone(origin_url)
                 except subprocess.CalledProcessError:
                     continue

         try:
             cloned_obj = get_object_from_clone(origin_url, obj_id)
         except KeyError:
             # try next origin
             continue
         if cloned_obj is None:
             return "found_but_unparseable"
         break
     else:
         return "unrecoverable_no-origin"

     object_header = (
         cloned_obj.type_name + b" " + str(cloned_obj.raw_length()).encode() + b"\x00"
     )
     cloned_manifest = object_header + cloned_obj.as_raw_string()
     rehash = hashlib.sha1(cloned_manifest).digest()
     assert (
         obj_id == rehash
     ), f"Mismatch between origin hash and original object: {obj_id.hex()} != {rehash.hex()}"

     if obj_type == ObjectType.REVISION:
         fixed_stored_obj = stored_obj

         # Try adding gpgsig
         if (
             b"gpgsig" not in dict(stored_obj.extra_headers)
             and cloned_obj.gpgsig is not None
         ):
             fixed_stored_obj = attr.evolve(
                 stored_obj,
                 extra_headers=(
                     *[(k, v) for (k, v) in stored_obj.extra_headers if k != b"nonce"],
                     (b"gpgsig", cloned_obj.gpgsig),
                     *[(k, v) for (k, v) in stored_obj.extra_headers if k == b"nonce"],
                 ),
             )
             if fixed_stored_obj.compute_hash() == obj_id:
+                write_fixed_object(swhid, fixed_stored_obj)
                 return "recoverable_missing_gpgsig"
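A dropped gpgsig is only recoverable from the origin because the signature is
hashed as part of the manifest, folded as a multi-line header whose
continuation lines start with a space. Illustrative folding (not the real
swh.model serializer):

    def fold_header(key: bytes, value: bytes) -> bytes:
        return key + b" " + value.replace(b"\n", b"\n ") + b"\n"

    folded = fold_header(b"gpgsig", b"-----BEGIN PGP SIGNATURE-----\n...")
    assert folded == b"gpgsig -----BEGIN PGP SIGNATURE-----\n ...\n"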

         # Try adding mergetag (on top of gpgsig)
         if (
             b"mergetag" not in dict(stored_obj.extra_headers)
             and cloned_obj.mergetag is not None
         ):
             # fixed_stored_obj = stored_obj  # commented out to reuse the gpgsig-fixed
             mergetags = []
             for mergetag in cloned_obj.mergetag:
                 mergetag = mergetag.as_raw_string()
                 assert mergetag.endswith(b"\n")
                 mergetags.append((b"mergetag", mergetag[0:-1]))
             fixed_stored_obj = attr.evolve(
                 fixed_stored_obj,
                 extra_headers=(*mergetags, *stored_obj.extra_headers,),
             )
             if fixed_stored_obj.compute_hash() == obj_id:
+                write_fixed_object(swhid, fixed_stored_obj)
                 return "recoverable_missing_mergetag_and_maybe_gpgsig"

         # Try adding a magic string at the end of the message
         if stored_obj.message and stored_obj.message.endswith(b"--HG--\nbranch : "):
             # Probably https://github.com/GWBasic/ObjectCloud.git
             assert cloned_obj.message.startswith(stored_obj.message)
             fixed_stored_obj = attr.evolve(stored_obj, message=cloned_obj.message)
             if fixed_stored_obj.compute_hash() == obj_id:
+                write_fixed_object(swhid, fixed_stored_obj)
                 return "recoverable_hg_branch_nullbytes_truncated"

         # Try copying extra headers (including gpgsig)
         extra_headers = cloned_obj.extra
         if cloned_obj.gpgsig is not None:
             extra_headers = (*extra_headers, (b"gpgsig", cloned_obj.gpgsig))
         fixed_stored_obj = attr.evolve(stored_obj, extra_headers=extra_headers)
         if fixed_stored_obj.compute_hash() == obj_id:
+            write_fixed_object(swhid, fixed_stored_obj)
             return "recoverable_extra_headers"
         if {b"HG:extra", b"HG:rename-source", b"HG:rename"} & set(dict(extra_headers)):
             for n in range(4):
                 fixed_stored_obj = attr.evolve(
                     fixed_stored_obj, message=b"\n" + fixed_stored_obj.message
                 )
                 if fixed_stored_obj.compute_hash() == obj_id:
+                    write_fixed_object(swhid, fixed_stored_obj)
                     return "recoverable_extra_headers_and_leading_newlines"

     print("=" * 100)
     print("Failed to fix:")
     print("origin_url", origin_url)
     print("original", repr(cloned_manifest.split(b"\x00", 1)[1]))
     print("stored  ", repr(stored_manifest.split(b"\x00", 1)[1]))
     print(
         "\n".join(
             difflib.ndiff(
                 cloned_manifest.split(b"\x00", 1)[1]
                 .decode(errors="backslashreplace")
                 .split("\n"),
                 stored_manifest.split(b"\x00", 1)[1]
                 .decode(errors="backslashreplace")
                 .split("\n"),
             )
         )
     )
     print("=" * 100)
-    return "recoverable_misc"
+
+    try:
+        if obj_type == ObjectType.REVISION:
+            cloned_obj = dulwich_commit_to_revision(cloned_obj)
+            roundtripped_cloned_manifest = revision_git_object(cloned_obj)
+        elif obj_type == ObjectType.DIRECTORY:
+            cloned_obj = dulwich_tree_to_directory(cloned_obj)
+            roundtripped_cloned_manifest = directory_git_object(cloned_obj)
+        else:
+            assert False, obj_type
+    except:
+        roundtripped_cloned_manifest = None
+
+    if roundtripped_cloned_manifest == cloned_manifest.split(b"\x00", 1)[1]:
+        write_fixed_object(swhid, cloned_obj)
+        return f"recoverable_misc_{obj_type.value}"
+    else:
+        write_fixed_manifest(swhid, cloned_manifest)
+        return f"weird_misc_{obj_type.value}"


 if __name__ == "__main__":
     main(sys.stdin)
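After a run, main() leaves a pickled summary behind; a hypothetical follow-up
snippet to inspect it, assuming the same working directory:

    import pickle

    with open("analyze_consistency_failures/results.pickle", "rb") as fd:
        digest = pickle.load(fd)  # {category: set of 20-byte object ids}
    for category, obj_ids in sorted(digest.items(), key=lambda kv: -len(kv[1])):
        print(f"{len(obj_ids)}\t{category}")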
"""Returns the set of all object hashes in the given repo path.""" print(f"Reading {path}...") stdout1 = subprocess.check_output( ["git", "rev-list", "--objects", "--all"], cwd=path ) stdout2 = subprocess.check_output( ["git", "rev-list", "--objects", "-g", "--no-walk", "--all"], cwd=path, ) obj_ids = [ line.split()[0].decode("ascii") for line in (stdout1 + stdout2).split(b"\n") if line ] return set(map(hash_to_bytes, obj_ids)) def list_refs(path: pathlib.Path) -> Dict[bytes, bytes]: packed_refs_path = path / "packed-refs" if packed_refs_path.is_file(): refs = { ref_name: target for (target, ref_name) in [ line.split() for line in open(packed_refs_path) if ( line and not line.startswith("#") # header and not line.startswith("^") # wat? ) ] } else: refs = {} # refs/ takes precedence over packed-refs (although it's unlikely they would # ever disagree on a target) refs.update( { str(ref_name.relative_to(path)): open(ref_name).read() for ref_name in path.glob("refs/**/*") if ref_name.is_file() } ) return refs def load_repo(path: pathlib.Path, storage_config) -> CoreSWHID: """Loads a repo to the storage and returns its snapshot id""" print(f"Loading {path}...") storage = get_storage( "pipeline", steps=[ dict(cls="validate"), dict( cls="buffer", min_batch_size=dict( content=10000, content_bytes=104857600, directory=1000, revision=1000, ), ), dict(cls="filter"), storage_config, ], ) loader = GitLoaderFromDisk( storage, f"file://{path.resolve()}", directory=str(path), visit_date=datetime.datetime.now(datetime.timezone.utc), ) loader.load() return loader.snapshot.swhid() def cook_repo(config_path, storage, swhid: CoreSWHID, path: pathlib.Path) -> None: """Reads the storage to cook the swhid to the given path""" print(f"Cooking {swhid} to {path}...") with tempfile.NamedTemporaryFile(prefix="vault-repro-", suffix=".tar") as tar_fd: tar_path = tar_fd.name subprocess.run( [ "swh", "vault", "cook", "-C", str(config_path), str(swhid), str(tar_path), "--bundle-type", "git_bare", ], check=True, ) subprocess.run(["tar", "-xf", tar_path], cwd=path, check=True) def repro_repo(config_path, path: pathlib.Path) -> None: with open(config_path) as fd: vault_config = yaml.safe_load(fd) storage = get_storage(**vault_config["storage"]) swhid = load_repo(path, vault_config["storage"]) - with tempfile.TemporaryDirectory(prefix="vault-repro-") as cooked_output_path_str: + with tempfile.TemporaryDirectory( + prefix="vault-repro-", dir="/home/dev/tmp/" + ) as cooked_output_path_str: cooked_output_path = pathlib.Path(cooked_output_path_str) cook_repo(config_path, storage, swhid, cooked_output_path) cooked_ids = list_object_ids(cooked_output_path / f"{swhid}.git") cooked_refs = list_refs(cooked_output_path / f"{swhid}.git") original_ids = list_object_ids(path) missing_ids = original_ids - cooked_ids extra_ids = cooked_ids - original_ids print(f"{len(original_ids)} original objects, {len(cooked_ids)} cooked objects.") if missing_ids: print("Missing objects:", " ".join(map(hash_to_hex, sorted(missing_ids)))) if extra_ids: print("Extra objects:", " ".join(map(hash_to_hex, sorted(extra_ids)))) if missing_ids or extra_ids: exit(1) original_refs = list_refs(path) missing_refs = set(original_refs) - set(cooked_refs) extra_refs = set(cooked_refs) - set(original_refs) if missing_refs: print("Missing refs:", " ".join(map(repr, missing_refs))) if extra_refs: print("Extra refs:", " ".join(map(repr, extra_refs))) mismatched_refs = False for ref_name in original_refs: if original_refs[ref_name] != cooked_refs[ref_name]: mismatched_refs = 


 def repro_repo(config_path, path: pathlib.Path) -> None:
     with open(config_path) as fd:
         vault_config = yaml.safe_load(fd)
     storage = get_storage(**vault_config["storage"])

     swhid = load_repo(path, vault_config["storage"])

-    with tempfile.TemporaryDirectory(prefix="vault-repro-") as cooked_output_path_str:
+    with tempfile.TemporaryDirectory(
+        prefix="vault-repro-", dir="/home/dev/tmp/"
+    ) as cooked_output_path_str:
         cooked_output_path = pathlib.Path(cooked_output_path_str)
         cook_repo(config_path, storage, swhid, cooked_output_path)

         cooked_ids = list_object_ids(cooked_output_path / f"{swhid}.git")
         cooked_refs = list_refs(cooked_output_path / f"{swhid}.git")

         original_ids = list_object_ids(path)
         missing_ids = original_ids - cooked_ids
         extra_ids = cooked_ids - original_ids
         print(
             f"{len(original_ids)} original objects, {len(cooked_ids)} cooked objects."
         )
         if missing_ids:
             print("Missing objects:", " ".join(map(hash_to_hex, sorted(missing_ids))))
         if extra_ids:
             print("Extra objects:", " ".join(map(hash_to_hex, sorted(extra_ids))))
         if missing_ids or extra_ids:
             exit(1)

         original_refs = list_refs(path)
         missing_refs = set(original_refs) - set(cooked_refs)
         extra_refs = set(cooked_refs) - set(original_refs)
         if missing_refs:
             print("Missing refs:", " ".join(map(repr, missing_refs)))
         if extra_refs:
             print("Extra refs:", " ".join(map(repr, extra_refs)))

         mismatched_refs = False
         for ref_name in set(original_refs) & set(cooked_refs):
             if original_refs[ref_name] != cooked_refs[ref_name]:
                 mismatched_refs = True
                 print(
                     "Mismatched ref:",
                     ref_name,
                     "points to",
                     cooked_refs[ref_name],
                     "instead of",
                     original_refs[ref_name],
                 )

         if missing_refs or extra_refs or mismatched_refs:
             exit(1)

         print("All good!")


 @contextlib.contextmanager
 def clone_repo(repo_path_or_url):
     if os.path.isdir(repo_path_or_url):
         # nothing to do, it's already cloned
         yield repo_path_or_url
     else:
         with tempfile.TemporaryDirectory(
             prefix="vault-repro-", suffix=".git"
         ) as cloned_path:
             os.rmdir(cloned_path)
             subprocess.run(
                 ["git", "clone", "--bare", repo_path_or_url, cloned_path],
                 check=True,
             )
             yield cloned_path


 def main() -> None:
     (_, config_path, repo_path_or_url) = sys.argv
     with clone_repo(repo_path_or_url) as repo_path:
         repro_repo(config_path, pathlib.Path(repo_path))


 if __name__ == "__main__":
     main()