analyze_consistency_failures.py

import collections
import difflib
import hashlib
import multiprocessing
import multiprocessing.dummy
import os
import pathlib
import pickle
import re
import secrets
import signal
import socket
import ssl
import subprocess
import sys
import traceback
import urllib.parse
import attr
import dulwich.errors
import dulwich.repo
import tqdm
from swh.core.api.classes import stream_results_optional
from swh.core.utils import grouper
from swh.graph.client import RemoteGraphClient, GraphArgumentException
from swh.loader.git.converters import (
dulwich_tree_to_directory,
dulwich_commit_to_revision,
)
from swh.model.hashutil import hash_to_bytes, hash_to_hex, hash_to_bytehex
from swh.model.git_objects import (
directory_git_object,
release_git_object,
revision_git_object,
)
from swh.model.model import (
Directory,
Origin,
Person,
RevisionType,
Timestamp,
TimestampWithTimezone,
)
from swh.model.swhids import ObjectType, CoreSWHID, ExtendedSWHID
from swh.storage import get_storage
CLONES_BASE_DIR = pathlib.Path(
"/srv/softwareheritage/cassandra-test-0/scratch/integrity_clones/"
).expanduser()
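# Patterns matching the log lines produced by check_consistency.py; handle_line()
# below routes each match to a bucket of the `digest` dict.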
MISMATCH = re.compile(
"Checksum mismatch on (?P<obj_type>[a-z]+): (?P<obj_id>[0-9a-f]{40}) in journal, but recomputed as .*"
)
MISMATCH_SIGNED_OFF = re.compile(
"Possibly missing 'gpgsig' header: (?P<obj_id>[0-9a-f]{40})"
)
MISMATCH_HG_TO_GIT = re.compile(
"Possibly missing 'HG:extra' header: (?P<obj_id>[0-9a-f]{40})"
)
SVN_MISMATCH = re.compile("Possibly unfixable SVN revision: (?P<obj_id>[0-9a-f]{40})")
FIXABLE = re.compile(
r"Fixable (?P<obj_type>[a-z]+) (?P<obj_id>[0-9a-f]{40}) \((?P<how>.*)\)"
)
UNORDERED_DIRECTORY = re.compile(
r"Weird directory checksum (?P<obj_id>[0-9a-f]{40}) \(computed without sorting\)"
)
NOISE = re.compile(r"Called Ctrl-C\, exiting\.")
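# Encoding names observed in the wild in commits' "encoding" headers; tried one
# by one by try_fix_revision() when the stored revision lacks such a header.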
ENCODINGS = (
b"SHIFT_JIS",
b"Shift-JIS",
b"shift-jis",
b"shift_jis",
b"Shift_JIS",
b"SJIS",
b"iso8859-1",
b"iso-8859-1",
b"ISO-8859-1",
b" ISO-8859-1",
b"iso8859-15",
b"ISO-8859-1]",
b"UTF8]",
b"UTF-8 UTF8",
b"{utf-8}",
b"iso-latin-1",
b"'Latin-1'",
b"ISO8859-15",
b"iso-8859-15",
b"ISO-8859-15",
b"euc-kr",
b"EUC-JP",
b"koi8-r",
b"big5",
b"ISO-8859-2",
b"iso8859-2",
b"ru_RU.KOI8-R",
b"cp1250",
b"CP-1250",
b"cp-1251",
b"CP-1252",
b"cp932",
b"latin-1",
b"Latin-1",
b"latin1",
b"Latin1",
b"ISO-2022-JP",
b"KOI8-R",
b"windows-1250",
b"window-1252",
b"windows-1252",
b"'windows-1252'",
b"WINDOWS-1251",
b"Windows-1257",
b"euckr",
b"ISO-88592",
b"iso10646-1",
b"iso-8859-7",
b"=",
b"CP950",
b"win",
b"win-1251",
b"utf",
b"{UTF-8|GBK}",
b"GBKe",
b"UTF-16",
b"utf-16",
b"GB18030",
b"GB23",
b"true", # wat
b"BIG5",
b"cp866",
b"CP-1251",
b"cp1251",
b"cp949",
b"latin2",
b"utf-8logoutputencoding=gbk", # wat
b"gb18030",
b"UTF-8-MAC UTF8-MAC",
b"cp",
b"ANSI",
b"ru_RU.UTF8",
b"ru_RU.utf8",
b"UTF-8",
b"utf-8",
b"zh_CN.GB18030",
b"iso-2022-jp",
b"en_US.UTF-8",
b"dos",
b"iso8859-13",
)
ZERO_TIMESTAMP = TimestampWithTimezone(
Timestamp(seconds=0, microseconds=0), offset=0, negative_utc=False
)
graph = RemoteGraphClient("http://graph.internal.softwareheritage.org:5009/graph/")
REVISIONS = {}
RELEASES = {}
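# Caches of stored objects, preloaded in batches by main() before the worker
# pool is created, then read by _try_recovery() in the workers.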
def get_clone_path(origin_url):
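    """Returns the directory where a bare clone of the given origin is kept,
    under CLONES_BASE_DIR."""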
origin_id = Origin(url=origin_url).swhid()
dirname = f"{origin_id}_{origin_url.replace('/', '_')}"
return CLONES_BASE_DIR / dirname
def clone(origin_url):
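    """Makes a bare clone of the given origin, unless one already exists."""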
clone_path = get_clone_path(origin_url)
if clone_path.is_dir():
# already cloned
return
# print("Cloning", origin_url)
subprocess.run(
["git", "clone", "--bare", origin_url, clone_path],
check=True,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
def get_object_from_clone(origin_url, obj_id):
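    """Returns the dulwich object with the given id from the local clone of the
    origin. Returns None if the clone or the object cannot be parsed; raises
    KeyError if the object is not in the clone."""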
clone_path = get_clone_path(origin_url)
try:
repo = dulwich.repo.Repo(str(clone_path))
except dulwich.errors.NotGitRepository:
return None
try:
return repo[hash_to_bytehex(obj_id)]
except dulwich.errors.ObjectFormatException:
# fallback to git if dulwich can't parse it
object_type = (
subprocess.check_output(
["git", "-C", clone_path, "cat-file", "-t", hash_to_hex(obj_id)]
)
.decode()
.strip()
)
manifest = subprocess.check_output(
["git", "-C", clone_path, "cat-file", object_type, hash_to_hex(obj_id)]
)
print(f"Dulwich failed to parse: {manifest!r}")
traceback.print_exc()
def _load_revisions(ids):
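    """Fetches the given revision ids from storage, as a dict from id to
    (possibly None) Revision. Each call gets its own storage client, so that
    pool workers do not share one."""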
ids = list(ids)
storage = get_storage(
"remote", url="http://webapp1.internal.softwareheritage.org:5002/"
)
return dict(zip(ids, storage.revision_get(ids)))
def _load_releases(ids):
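    """Same as _load_revisions(), but for releases."""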
ids = list(ids)
storage = get_storage(
"remote", url="http://webapp1.internal.softwareheritage.org:5002/"
)
return dict(zip(ids, storage.release_get(ids)))
def main(input_fd):
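    """Parses check_consistency.py logs from input_fd, buckets the reported
    objects, tries to recover revisions and releases one by one, then prints
    per-bucket counts and pickles the final buckets."""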
digest = collections.defaultdict(set)
# Parse logs from check_consistency.py to 'digest'
for line in tqdm.tqdm(
list(input_fd), desc="parsing input", unit="line", unit_scale=True
):
handle_line(digest, line)
# preload revisions in batches
# revision_id_groups = list(grouper(digest["mismatch_misc_revision"], 1000))[0:100]
# revision_id_groups = list(grouper(digest["mismatch_hg_to_git"], 1000))
revision_id_groups = list(
grouper(
digest.get("mismatch_misc_revision", set())
| digest.get("mismatch_hg_to_git", set()),
1000,
)
)
with multiprocessing.dummy.Pool(10) as p:
for revisions in tqdm.tqdm(
p.imap_unordered(_load_revisions, revision_id_groups),
desc="loading revisions",
unit="k revs",
total=len(revision_id_groups),
):
REVISIONS.update(revisions)
release_id_groups = list(grouper(digest.get("mismatch_misc_release", []), 1000))
with multiprocessing.dummy.Pool(10) as p:
for releases in tqdm.tqdm(
p.imap_unordered(_load_releases, release_id_groups),
desc="loading releases",
unit="k rels",
total=len(release_id_groups),
):
RELEASES.update(releases)
# Try to fix objects one by one
with multiprocessing.Pool(32, maxtasksperchild=1000) as p:
for (f, key) in (
(try_revision_recovery, "mismatch_misc_revision"),
(try_revision_recovery, "mismatch_hg_to_git"),
(try_release_recovery, "mismatch_misc_release"),
):
obj_ids = list(digest.pop(key, []))
for (obj_id, new_key) in tqdm.tqdm(
p.imap_unordered(f, obj_ids, chunksize=100),
desc=f"recovering {key}",
unit="obj",
total=len(obj_ids),
smoothing=0.01,
):
digest[new_key].add(obj_id)
for (type_, obj_ids) in sorted(digest.items()):
print(f"{len(obj_ids)}\t{type_}")
with open("analyze_consistency_failures/results.pickle", "wb") as fd:
pickle.dump(dict(digest), fd)
def write_fixed_manifest(swhid, manifest):
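    """Saves a recovered raw git manifest, sharded by the first hash byte."""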
dir_path = os.path.join(
"analyze_consistency_failures", hash_to_hex(swhid.object_id)[0:2]
)
os.makedirs(dir_path, exist_ok=True)
with open(f"{dir_path}/{swhid}.git_manifest", "wb") as fd:
fd.write(manifest)
def write_fixed_object(swhid, obj):
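    """Saves a recovered object as a pickled dict, sharded by the first hash byte."""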
dir_path = os.path.join(
"analyze_consistency_failures", hash_to_hex(swhid.object_id)[0:2]
)
os.makedirs(dir_path, exist_ok=True)
with open(f"{dir_path}/{swhid}.pickle", "wb") as fd:
pickle.dump(obj.to_dict(), fd)
def handle_line(digest, line):
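    """Matches one log line against the patterns above and adds the reported
    object id to the corresponding bucket of `digest`."""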
line = line.strip()
if not line:
return
if NOISE.fullmatch(line):
return
m = MISMATCH.fullmatch(line)
if m:
obj_type = m.group("obj_type")
obj_id = m.group("obj_id")
digest[f"mismatch_misc_{obj_type}"].add(hash_to_bytes(obj_id))
return
m = MISMATCH_SIGNED_OFF.fullmatch(line)
if m:
obj_id = m.group("obj_id")
digest["mismatch_misc_revision"].add(hash_to_bytes(obj_id))
return
m = MISMATCH_HG_TO_GIT.fullmatch(line)
if m:
obj_id = m.group("obj_id")
digest["mismatch_hg_to_git"].add(hash_to_bytes(obj_id))
return
m = SVN_MISMATCH.fullmatch(line)
if m:
digest["mismatch_misc_revision_svn"].add(hash_to_bytes(m.group("obj_id")))
return
m = FIXABLE.fullmatch(line)
if m:
digest["fixable_trivial"].add(hash_to_bytes(m.group("obj_id")))
return
m = UNORDERED_DIRECTORY.fullmatch(line)
if m:
digest["weird_unordered_dir"].add(hash_to_bytes(m.group("obj_id")))
return
# Two messages sometimes ended up on the same line; try to split it
for regexp in (
MISMATCH,
MISMATCH_SIGNED_OFF,
MISMATCH_HG_TO_GIT,
SVN_MISMATCH,
FIXABLE,
UNORDERED_DIRECTORY,
NOISE,
):
match = regexp.match(line)
if match:
first_message = match.group(0)
handle_line(digest, first_message)
handle_line(digest, line[len(first_message) :])
break
else:
assert False, line
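# Single-argument wrappers around _try_recovery(), suitable for Pool.imap_unordered().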
def try_revision_recovery(obj_id):
return (obj_id, _try_recovery(ObjectType.REVISION, obj_id))
def try_release_recovery(obj_id):
return (obj_id, _try_recovery(ObjectType.RELEASE, obj_id))
def _try_recovery(obj_type, obj_id):
"""Try fixing the given obj_id, and returns what digest key it should be added to"""
obj_id = hash_to_bytes(obj_id)
swhid = CoreSWHID(object_type=obj_type, object_id=obj_id)
storage = get_storage(
"pipeline",
steps=[
dict(cls="retry"),
dict(
cls="remote", url="http://webapp1.internal.softwareheritage.org:5002/"
),
],
)
if obj_type == ObjectType.REVISION:
stored_obj = REVISIONS[obj_id]
if stored_obj is None:
return "revision_missing_from_storage"
if stored_obj.type != RevisionType.GIT:
return f"mismatch_misc_{stored_obj.type.value}"
stored_manifest = revision_git_object(stored_obj)
elif obj_type == ObjectType.RELEASE:
stored_obj = RELEASES[obj_id]
if stored_obj is None:
return "release_missing_from_storage"
stored_manifest = release_git_object(stored_obj)
elif obj_type == ObjectType.DIRECTORY:
stored_obj = Directory(
id=obj_id,
entries=list(
stream_results_optional(storage.directory_get_entries, obj_id)
),
)
        stored_manifest = directory_git_object(stored_obj)
else:
assert False, obj_type
assert obj_id == stored_obj.id
assert obj_id != stored_obj.compute_hash(), "Hash matches this time?!"
if obj_type == ObjectType.REVISION:
bucket = try_fix_revision(swhid, stored_obj, stored_manifest)
elif obj_type == ObjectType.RELEASE:
bucket = try_fix_release(swhid, stored_obj, stored_manifest)
elif obj_type == ObjectType.DIRECTORY:
bucket = try_fix_directory(swhid, stored_obj, stored_manifest)
else:
assert False, obj_id
if bucket is not None:
return bucket
res = get_origins(swhid, stored_obj)
if res[0]:
(_, origin_url, cloned_obj) = res
else:
(_, bucket) = res
return bucket
object_header = (
cloned_obj.type_name + b" " + str(cloned_obj.raw_length()).encode() + b"\x00"
)
cloned_manifest = object_header + cloned_obj.as_raw_string()
rehash = hashlib.sha1(cloned_manifest).digest()
assert (
obj_id == rehash
), f"Mismatch between origin hash and original object: {obj_id.hex()} != {rehash.hex()}"
if obj_type == ObjectType.REVISION:
bucket = try_recover_revision(
swhid, stored_obj, stored_manifest, cloned_obj, cloned_manifest
)
elif obj_type == ObjectType.RELEASE:
bucket = try_recover_release(
swhid, stored_obj, stored_manifest, cloned_obj, cloned_manifest
)
elif obj_type == ObjectType.DIRECTORY:
bucket = try_recover_directory(
swhid, stored_obj, stored_manifest, cloned_obj, cloned_manifest
)
else:
assert False, obj_id
if bucket is not None:
return bucket
print("=" * 100)
print("Failed to fix:")
print("origin_url", origin_url)
print("original", repr(cloned_manifest.split(b"\x00", 1)[1]))
print("stored ", repr(stored_manifest.split(b"\x00", 1)[1]))
print(
"\n".join(
difflib.ndiff(
cloned_manifest.split(b"\x00", 1)[1]
.decode(errors="backslashreplace")
.split("\n"),
stored_manifest.split(b"\x00", 1)[1]
.decode(errors="backslashreplace")
.split("\n"),
)
)
)
print("=" * 100)
try:
if obj_type == ObjectType.REVISION:
cloned_obj = dulwich_commit_to_revision(cloned_obj)
roundtripped_cloned_manifest = revision_git_object(cloned_obj)
elif obj_type == ObjectType.DIRECTORY:
cloned_obj = dulwich_tree_to_directory(cloned_obj)
roundtripped_cloned_manifest = directory_git_object(cloned_obj)
else:
assert False, obj_type
    except Exception:
roundtripped_cloned_manifest = None
if roundtripped_cloned_manifest == cloned_manifest:
write_fixed_object(swhid, cloned_obj)
return f"recoverable_misc_{obj_type.value}"
else:
write_fixed_manifest(swhid, cloned_manifest)
return f"weird_misc_{obj_type.value}"
def try_fix_revision(swhid, stored_obj, stored_manifest):
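    """Tries a series of heuristic rewrites of the stored revision (whitespace
    around author/committer names and emails, encoding headers, timezone
    offsets, header reordering...) until one hashes to the expected id.
    Returns the bucket name on success, None otherwise."""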
obj_id = swhid.object_id
# Try adding leading space to email
# (very crude, this assumes author = committer)
fullname = stored_obj.author.fullname.replace(b" <", b" < ")
fixed_stored_obj = attr.evolve(
stored_obj,
author=Person(fullname=fullname, name=b"", email=b""),
committer=Person(fullname=fullname, name=b"", email=b""),
)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return "fixable_author_email_leading_space"
# Try adding trailing spaces to email
for trailing in [b" " * i for i in range(8)] + [b"\r", b" \r", b"\t"]:
for (pad_author, pad_committer) in ((1, 0), (0, 1), (1, 1)):
fixed_stored_obj = attr.evolve(
stored_obj,
author=attr.evolve(
stored_obj.author,
fullname=stored_obj.author.fullname[0:-1] + trailing + b">",
)
if pad_author
else stored_obj.author,
committer=attr.evolve(
stored_obj.committer,
fullname=stored_obj.committer.fullname[0:-1] + trailing + b">",
)
if pad_committer
else stored_obj.committer,
)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return "fixable_author_email_trailing_whitespace"
# Try adding carriage return to name *and* email
for (pad_author, pad_committer) in ((1, 0), (0, 1), (1, 1)):
fixed_stored_obj = attr.evolve(
stored_obj,
author=attr.evolve(
stored_obj.author,
fullname=stored_obj.author.fullname.replace(b" <", b"\r <").replace(
b">", b"\r>"
),
)
if pad_author
else stored_obj.author,
committer=attr.evolve(
stored_obj.committer,
fullname=stored_obj.committer.fullname.replace(b" <", b"\r <").replace(
b">", b"\r>"
),
)
if pad_committer
else stored_obj.committer,
)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return "fixable_author_name_email_trailing_whitespace"
# Try adding spaces before the name
for author_pad in range(0, 4):
for committer_pad in range(0, 4):
fixed_stored_obj = attr.evolve(
stored_obj,
author=attr.evolve(
stored_obj.author,
fullname=b" " * author_pad + stored_obj.author.fullname,
),
committer=attr.evolve(
stored_obj.committer,
fullname=b" " * committer_pad + stored_obj.committer.fullname,
),
)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return "fixable_author_leading_spaces"
# Try adding spaces between name and email
for i in range(1, 32):
fullname = stored_obj.author.fullname.replace(b" <", b" " * i + b"<", 1)
fixed_stored_obj = attr.evolve(
stored_obj,
author=Person(fullname=fullname, name=b"", email=b""),
committer=Person(fullname=fullname, name=b"", email=b""),
)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return "fixable_author_middle_spaces"
# Try again but with differing values
for committer_padding in (0, 1, 2, 4, 5, 8, 16, 32):
for author_padding in (0, 1, 2, 4, 5, 8, 16, 32):
fixed_stored_obj = attr.evolve(
stored_obj,
author=Person(
fullname=stored_obj.author.fullname.replace(
b" <", b" " + b" " * author_padding + b"<"
),
name=b"",
email=b"",
),
committer=Person(
fullname=stored_obj.committer.fullname.replace(
b" <", b" " + b" " * committer_padding + b"<"
),
name=b"",
email=b"",
),
)
            if fixed_stored_obj.compute_hash() == obj_id:
                write_fixed_object(swhid, fixed_stored_obj)
                return "fixable_author_middle_spaces"
# Try adding spaces around the name
for i in range(1, 4):
fullname = b" " * i + stored_obj.author.fullname.replace(
b" <", b" " * i + b" <"
)
fixed_stored_obj = attr.evolve(
stored_obj,
author=Person(fullname=fullname, name=b"", email=b""),
committer=Person(fullname=fullname, name=b"", email=b""),
)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return "fixable_author_leading_and_middle_spaces"
    # Try adding a space after the fullname
fullname = stored_obj.author.fullname + b" "
fixed_stored_obj = attr.evolve(
stored_obj,
author=Person(fullname=fullname, name=b"", email=b""),
committer=Person(fullname=fullname, name=b"", email=b""),
)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return "fixable_author_trailing_space"
for _ in range(2):
fixed_stored_obj = attr.evolve(
fixed_stored_obj, message=b"\n" + (fixed_stored_obj.message or b"")
)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return "fixable_author_trailing_space_and_leading_newlines"
# Try adding leading newlines
if stored_obj.message is not None:
fixed_stored_obj = stored_obj
for _ in range(23): # seen in the wild: any from 1 to 8, 13, 15, 22, 23
fixed_stored_obj = attr.evolve(
fixed_stored_obj, message=b"\n" + fixed_stored_obj.message,
)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return "fixable_leading_newlines"
    # Try some hardcoded fullname substitutions
substitutions = {
b"name <email>": b" name < email >",
b"unknown <Cl\xc3\xa9ment@.(none)>": b"unknown <Cl\xe9ment@.(none)>",
b"unknown <J\xef\xbf\xbdrgen@Aspire.(none)>": b"unknown <J\xfcrgen@Aspire.(none)>",
b"from site <kevoree@kevoree.org>": b" from site < kevoree@kevoree.org >",
b" <>": b"",
}
fixed_stored_obj = attr.evolve(
stored_obj,
author=attr.evolve(
stored_obj.author,
fullname=substitutions.get(
stored_obj.author.fullname, stored_obj.author.fullname
),
),
committer=attr.evolve(
stored_obj.committer,
fullname=substitutions.get(
stored_obj.committer.fullname, stored_obj.committer.fullname
),
),
)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return "fixable_author_hardcoded"
if fixed_stored_obj.author.fullname == b"unknown <Cl\xe9ment@.(none)>":
fixed_stored_obj = attr.evolve(
fixed_stored_obj,
extra_headers=(
*fixed_stored_obj.extra_headers,
(b"encoding", b"ISO-8859-1"),
),
)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return "fixable_author_and_encoding_hardcoded"
# Try removing leading space:
author = stored_obj.author
committer = stored_obj.committer
if author.fullname.startswith(b" "):
author = attr.evolve(author, fullname=author.fullname[1:])
if committer.fullname.startswith(b" "):
committer = attr.evolve(committer, fullname=committer.fullname[1:])
fixed_stored_obj = attr.evolve(stored_obj, author=author, committer=committer)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return "fixable_author_fullname_strip_leading_space"
# When the fullname is in both the name and the email
# have: xxx<yyy@zzz> <xxx <yyy@zzz>>
# want: xxx<yyy@zzz> <xxx<yyy@zzz>>
author = stored_obj.author
committer = stored_obj.committer
if author.name and author.email and b">" in author.name and b">" in author.email:
author = attr.evolve(
author,
fullname=b"<".join(author.fullname.rsplit(b" <", 1)), # replace last occur
)
if (
committer.name
and committer.email
and b">" in committer.name
and b">" in committer.email
):
committer = attr.evolve(
committer, fullname=b"<".join(committer.fullname.rsplit(b" <", 1)), # ditto
)
fixed_stored_obj = attr.evolve(stored_obj, author=author, committer=committer)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return "fixable_author_fullname_in_name_and_email"
# If the timezone is 0, try some other ones
offsets = {i * 60 + (+1 if i >= 0 else -1) * 59 for i in range(-12, 13)} | {
-22 * 60 - 0,
0,
12 * 60 + 0,
14 * 60 + 0,
20 * 60 + 0,
80 * 60 + 0,
stored_obj.committer_date.offset,
stored_obj.date.offset,
}
for committer_offset in (
offsets
if stored_obj.committer_date.offset == 0
else [stored_obj.committer_date.offset]
):
for author_offset in (
offsets if stored_obj.date.offset == 0 else [stored_obj.date.offset]
):
fixed_stored_obj = attr.evolve(
stored_obj,
date=attr.evolve(
stored_obj.date, offset=author_offset, negative_utc=False
),
committer_date=attr.evolve(
stored_obj.committer_date,
offset=committer_offset,
negative_utc=False,
),
)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return "fixable_offset"
fixed_stored_obj = attr.evolve(
fixed_stored_obj, message=b"\n" + (fixed_stored_obj.message or b"")
)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return "fixable_offset_and_newline"
if stored_obj.date.offset == stored_obj.committer_date.offset == (6 * 60 + 15):
fixed_stored_manifest = stored_manifest.replace(b"+0615", b"+0575")
if hashlib.new("sha1", fixed_stored_manifest).digest() == obj_id:
write_fixed_manifest(swhid, fixed_stored_manifest)
return "weird-offset=+0575"
if stored_obj.date.offset == stored_obj.committer_date.offset == (7 * 60 + 0):
fixed_stored_manifest = stored_manifest.replace(b"+0700", b"--700")
if hashlib.new("sha1", fixed_stored_manifest).digest() == obj_id:
write_fixed_manifest(swhid, fixed_stored_manifest)
return "weird-offset=--700"
for offset in (
b"-041800",
b"-12257",
b"-12255",
b"-72000",
b"-12242",
b"-12310",
b"-3600",
b"-1900",
b"0000",
b"+0575",
b"+041800",
b"+051800",
b"+091800",
b"+1073603",
b"+1558601",
b"+1558010",
b"+1559432",
b"+1670119",
b"+15094352",
b"+15094728",
b"+27455236",
b"+40347417",
):
fixed_stored_manifest = stored_manifest.replace(
b" +0000", b" " + offset
).replace(b"+51800", offset)
object_header, rest = fixed_stored_manifest.split(b"\x00", 1)
fixed_stored_manifest = b"commit " + str(len(rest)).encode() + b"\x00" + rest
if hashlib.new("sha1", fixed_stored_manifest).digest() == obj_id:
write_fixed_manifest(swhid, fixed_stored_manifest)
return f"weird-offset-misc"
# Try replacing +0002 with +02
if stored_obj.date.offset == 2 or stored_obj.committer_date.offset == 2:
for (unpad_author, unpad_committer) in ((0, 1), (1, 0), (1, 1)):
fixed_stored_manifest = b"\n".join(
line.replace(b" +0002", b" +02")
if (unpad_author and line.startswith(b"author "))
or (unpad_committer and line.startswith(b"committer "))
else line
for line in stored_manifest.split(b"\n")
)
(*_, rest) = fixed_stored_manifest.split(b"\x00", 1)
fixed_stored_manifest = (
b"commit " + str(len(rest)).encode() + b"\x00" + rest
)
if hashlib.new("sha1", fixed_stored_manifest).digest() == obj_id:
write_fixed_manifest(swhid, fixed_stored_manifest)
return f"weird-offset={offset.decode()}"
if fixed_stored_manifest.endswith(b"\n"):
fixed_stored_manifest = fixed_stored_manifest.rstrip()
(*_, rest) = fixed_stored_manifest.split(b"\x00", 1)
fixed_stored_manifest = (
b"commit " + str(len(rest)).encode() + b"\x00" + rest
)
if hashlib.new("sha1", fixed_stored_manifest).digest() == obj_id:
write_fixed_manifest(swhid, fixed_stored_manifest)
return f"weird-offset={offset.decode()}"
if (
stored_obj.date.offset == stored_obj.committer_date.offset == 0
and stored_obj.author.fullname.startswith(b" ")
):
fixed_stored_obj = attr.evolve(
stored_obj,
author=attr.evolve(
stored_obj.author, fullname=stored_obj.author.fullname[1:]
),
committer=attr.evolve(
stored_obj.committer, fullname=stored_obj.committer.fullname[1:]
),
date=attr.evolve(stored_obj.date, negative_utc=True),
committer_date=attr.evolve(stored_obj.committer_date, negative_utc=True),
)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return f"fixable_space_and_negative_utc"
fixed_stored_obj = attr.evolve(
fixed_stored_obj, message=(stored_obj.message or b"") + b"\n",
)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return f"fixable_space_and_newline_and_negative_utc"
# Try adding an encoding header
if b"encoding" not in dict(stored_obj.extra_headers):
for encoding in ENCODINGS:
fixed_stored_obj = attr.evolve(
stored_obj,
extra_headers=(*stored_obj.extra_headers, (b"encoding", encoding)),
)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return f"fixable_add_encoding"
if fixed_stored_obj.message is not None:
for _ in range(3):
fixed_stored_obj = attr.evolve(
fixed_stored_obj,
message=b"\n" + (fixed_stored_obj.message or b""),
)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return f"fixable_add_encoding_and_leading_newlines"
# Try capitalizing the 'parent' revision
stored_manifest_lines = stored_manifest.split(b"\n")
fixed_stored_manifest_lines = [
b"parent " + line.split(b" ")[1].upper()
if line.startswith(b"parent ")
else line
for line in stored_manifest_lines
]
fixed_stored_manifest = b"\n".join(fixed_stored_manifest_lines)
if hashlib.new("sha1", fixed_stored_manifest).digest() == obj_id:
write_fixed_manifest(swhid, fixed_stored_manifest)
return "capitalized_revision_parent"
# Try removing leading zero in date offsets (very crude...)
stored_manifest_lines = stored_manifest.split(b"\n")
for (unpad_author, unpad_committer) in [(0, 1), (1, 0), (1, 1)]:
fixed_stored_manifest_lines = list(stored_manifest_lines)
if unpad_author:
fixed_stored_manifest_lines = [
re.sub(br"([+-])0", lambda m: m.group(1), line)
if line.startswith(b"author ")
else line
for line in fixed_stored_manifest_lines
]
if unpad_committer:
fixed_stored_manifest_lines = [
re.sub(br"([+-])0", lambda m: m.group(1), line)
if line.startswith(b"committer ")
else line
for line in fixed_stored_manifest_lines
]
fixed_stored_manifest = b"\n".join(fixed_stored_manifest_lines)
object_header, rest = fixed_stored_manifest.split(b"\x00", 1)
fixed_stored_manifest = b"commit " + str(len(rest)).encode() + b"\x00" + rest
if hashlib.new("sha1", fixed_stored_manifest).digest() == obj_id:
write_fixed_manifest(swhid, fixed_stored_manifest)
return f"weird-unpadded_time_offset"
# Try moving the nonce at the end
if b"nonce" in dict(stored_obj.extra_headers):
fixed_stored_obj = attr.evolve(
stored_obj,
extra_headers=(
*[(k, v) for (k, v) in stored_obj.extra_headers if k != b"nonce"],
*[(k, v) for (k, v) in stored_obj.extra_headers if k == b"nonce"],
),
)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return "fixable_move_nonce"
return None
def try_fix_release(swhid, stored_obj, stored_manifest):
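    """Like try_fix_revision(), but for releases: only tweaks the date."""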
obj_id = swhid.object_id
# Try nullifying a zero date
if stored_obj.date is not None and stored_obj.date.timestamp.seconds == 0:
fixed_stored_obj = attr.evolve(stored_obj, date=None,)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return "fixable_nullify_zero_date"
# Try zeroing a null date
if stored_obj.date is None:
fixed_stored_obj = attr.evolve(stored_obj, date=ZERO_TIMESTAMP)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return "fixable_zero_null_date"
return None
def get_origins(swhid, stored_obj):
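    """Looks up origins containing the object via swh-graph (with an on-disk
    cache of the results), then clones candidate origins until one provides
    the object. Returns (True, origin_url, cloned_obj) on success, or
    (False, bucket_name) on failure."""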
obj_id = swhid.object_id
storage = get_storage(
"pipeline",
steps=[
dict(cls="retry"),
dict(
cls="remote", url="http://webapp1.internal.softwareheritage.org:5002/"
),
],
)
dir_ = f"graph_backward_leaves/{hash_to_hex(swhid.object_id)[0:2]}"
os.makedirs(dir_, exist_ok=True)
graph_cache_file = f"{dir_}/{swhid}.txt"
if os.path.isfile(graph_cache_file):
with open(graph_cache_file) as fd:
origin_swhids = [
ExtendedSWHID.from_string(line.strip()) for line in fd if line.strip()
]
else:
for _ in range(10):
try:
origin_swhids = [
ExtendedSWHID.from_string(line)
for line in graph.leaves(swhid, direction="backward")
if line.startswith("swh:1:ori:")
]
except GraphArgumentException:
return (False, "unrecoverable_not-in-swh-graph")
            except Exception:
                pass
else:
break
else:
return (False, "unrecoverable_swh-graph-crashes")
tmp_path = graph_cache_file + ".tmp" + secrets.token_hex(8)
with open(tmp_path, "wt") as fd:
fd.write("\n".join(map(str, origin_swhids)))
fd.write("\n")
os.rename(tmp_path, graph_cache_file) # atomic
origins = [
origin["url"]
for origin in storage.origin_get_by_sha1(
[origin_swhid.object_id for origin_swhid in origin_swhids]
)
]
    # swh-graph results come in non-deterministic order, so a bit of sorting
    # avoids fetching lots of different forks of the same project. For big
    # projects with lots of forks and/or broken commits, manually hardcode the
    # repo with the most commits first.
    PRIORITIZED_ORIGINS = [
"https://github.com/torvalds/linux.git",
"https://github.com/git/git.git",
"https://github.com/nixos/nixpkgs.git",
]
    origins.sort(key=lambda url: "" if url in PRIORITIZED_ORIGINS else url)
for origin_url in origins:
if not origin_url.endswith(".git"):
origin_url += ".git"
if origin_url == "https://github.com/reingart/python.git":
# Fails very often...
continue
if ".googlecode.com/" in origin_url:
# Does not exist anymore
continue
data = b"0032want " + hash_to_bytehex(obj_id) + b"\n"
if swhid.object_type == ObjectType.REVISION:
for parent in stored_obj.parents:
data += b"0032have " + hash_to_bytehex(parent) + b"\n"
elif swhid.object_type == ObjectType.RELEASE:
data += b"0032have " + hash_to_bytehex(stored_obj.target) + b"\n"
data += b"0000"
data += b"0009done\n"
clone_path = get_clone_path(origin_url)
if not clone_path.is_dir():
# First, check if we can access the origin and if it still has the
# commit we want.
parsed_url = urllib.parse.urlparse(origin_url)
if parsed_url.scheme == "git":
# TODO: use the dumb git proto to check?
try:
clone(origin_url)
except subprocess.CalledProcessError:
continue
elif parsed_url.scheme in ("http", "https"):
                # This is silly, but neither requests nor dulwich properly
                # handles some connection terminations for some reason, so we
                # need this home-made HTTP client
hostname = parsed_url.netloc
context = ssl.create_default_context()
try:
with socket.create_connection((hostname, 443)) as sock:
with context.wrap_socket(
sock, server_hostname=hostname
) as ssock:
ssock.write(
b"POST "
+ parsed_url.path.encode()
+ b"/git-upload-pack HTTP/1.0\r\n"
)
ssock.write(b"Host: " + hostname.encode() + b"\r\n")
ssock.write(
b"Content-Type: application/x-git-upload-pack-request\r\n"
)
ssock.write(b"\r\n")
ssock.write(data)
response = b""
while True:
new_data = ssock.read()
if not new_data:
break
response += new_data
except (TimeoutError, socket.gaierror, ssl.SSLCertVerificationError):
# Could not connect
continue
except (ConnectionResetError, OSError):
                    # Could happen for various reasons, let's try anyway
pass
else:
(headers, body) = response.split(b"\r\n\r\n", 1)
(status_line, headers) = headers.split(b"\r\n", 1)
if b"401" in status_line or b"404" in status_line:
# Repo not available
continue
try:
clone(origin_url)
except subprocess.CalledProcessError:
continue
try:
cloned_obj = get_object_from_clone(origin_url, obj_id)
except KeyError:
# try next origin
continue
if cloned_obj is None:
return (False, "found_but_unparseable")
break
else:
return (False, "unrecoverable_no-origin")
return (True, origin_url, cloned_obj)
def try_recover_revision(
swhid, stored_obj, stored_manifest, cloned_obj, cloned_manifest
):
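    """Tries to rebuild the stored revision from data only present in the
    cloned object (gpgsig, mergetags, extra headers, truncated messages).
    Returns the bucket name on success, None otherwise."""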
obj_id = swhid.object_id
fixed_stored_obj = stored_obj
# Try adding gpgsig
if (
b"gpgsig" not in dict(stored_obj.extra_headers)
and cloned_obj.gpgsig is not None
):
fixed_stored_obj = attr.evolve(
stored_obj,
extra_headers=(
*[(k, v) for (k, v) in stored_obj.extra_headers if k != b"nonce"],
(b"gpgsig", cloned_obj.gpgsig),
*[(k, v) for (k, v) in stored_obj.extra_headers if k == b"nonce"],
),
)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return "recoverable_missing_gpgsig"
# Try adding mergetag (on top of gpgsig)
if (
b"mergetag" not in dict(stored_obj.extra_headers)
and cloned_obj.mergetag is not None
):
        # (fixed_stored_obj is deliberately not reset to stored_obj here, so a
        # gpgsig fix from the block above is kept)
mergetags = []
for mergetag in cloned_obj.mergetag:
mergetag = mergetag.as_raw_string()
assert mergetag.endswith(b"\n")
mergetags.append((b"mergetag", mergetag[0:-1]))
fixed_stored_obj = attr.evolve(
fixed_stored_obj, extra_headers=(*mergetags, *stored_obj.extra_headers,),
)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return "recoverable_missing_mergetag_and_maybe_gpgsig"
# Try adding a magic string at the end of the message
if stored_obj.message and stored_obj.message.endswith(b"--HG--\nbranch : "):
# Probably https://github.com/GWBasic/ObjectCloud.git
assert cloned_obj.message.startswith(stored_obj.message)
fixed_stored_obj = attr.evolve(stored_obj, message=cloned_obj.message)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return "recoverable_hg_branch_nullbytes_truncated"
# Try copying extra headers (including gpgsig)
extra_headers = cloned_obj.extra
if cloned_obj.gpgsig is not None:
extra_headers = (*extra_headers, (b"gpgsig", cloned_obj.gpgsig))
fixed_stored_obj = attr.evolve(stored_obj, extra_headers=extra_headers)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return "recoverable_extra_headers"
if {b"HG:extra", b"HG:rename-source", b"HG:rename"} & set(dict(extra_headers)):
        for _ in range(4):
            fixed_stored_obj = attr.evolve(
                fixed_stored_obj, message=b"\n" + (fixed_stored_obj.message or b"")
            )
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return "recoverable_extra_headers_and_leading_newlines"
return None
def try_recover_release(
swhid, stored_obj, stored_manifest, cloned_obj, cloned_manifest
):
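    """Tries to rebuild the stored release by re-adding the cloned object's
    signature; on failure, prints a diff of the two manifests and returns None."""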
obj_id = swhid.object_id
if cloned_obj.signature is not None:
fixed_stored_obj = attr.evolve(
stored_obj, message=(stored_obj.message or b"") + cloned_obj.signature
)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return "recoverable_missing_gpgsig"
if cloned_obj.signature is not None:
fixed_stored_obj = attr.evolve(
stored_obj,
date=ZERO_TIMESTAMP,
message=(stored_obj.message or b"") + cloned_obj.signature,
)
if fixed_stored_obj.compute_hash() == obj_id:
write_fixed_object(swhid, fixed_stored_obj)
return "recoverable_missing_gpgsig_and_zero_date"
print("original", repr(cloned_manifest.split(b"\x00", 1)[1]))
print("stored ", repr(stored_manifest.split(b"\x00", 1)[1]))
print(
"\n".join(
difflib.ndiff(
cloned_manifest.split(b"\x00", 1)[1]
.decode(errors="backslashreplace")
.split("\n"),
stored_manifest.split(b"\x00", 1)[1]
.decode(errors="backslashreplace")
.split("\n"),
)
)
    )
    return None
def handle_pdb(sig, frame):
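    """SIGUSR1 handler: drops into pdb for live debugging."""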
import pdb
pdb.Pdb().set_trace(frame)
if __name__ == "__main__":
signal.signal(signal.SIGUSR1, handle_pdb)
main(sys.stdin)
