diff --git a/utils/dump_revisions.py b/utils/dump_revisions.py deleted file mode 100755 index 5e5d220..0000000 --- a/utils/dump_revisions.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 - -import os -import pickle - -import psycopg2.extras - -from swh.storage import converters, db -from swh.model import identifiers - - -QUERY = ''' -select - r.id, - r.date, r.date_offset, r.date_neg_utc_offset, - r.committer_date, r.committer_date_offset, r.committer_date_neg_utc_offset, - r.type, r.directory, r.message, - a.id as author_id, a.name as author_name, a.email as author_email, - c.id as committer_id, c.name as committer_name, c.email as committer_email, - r.metadata, r.synthetic, - array(select rh.parent_id::bytea from revision_history rh where rh.id = r.id - order by rh.parent_rank) - as parents -from revision r -left join person a on a.id = r.author -left join person c on c.id = r.committer -where r.id > %s -order by r.id -limit %s -''' - - -def dump_revision(revision): - rev_id = identifiers.identifier_to_str(revision['id']) - dirs = 'revs/%s/%s' % (rev_id[0:2], rev_id[2:4]) - os.makedirs(dirs, exist_ok=True) - with open(os.path.join(dirs, rev_id), 'wb') as f: - pickle.dump(revision, f) - - -def check_revision(revision): - id = identifiers.identifier_to_str(revision['id']) - - computed_id = identifiers.revision_identifier(revision) - if id != computed_id: - dump_revision(revision) - - -if __name__ == '__main__': - swh_db = db.Db.connect('service=swh', - cursor_factory=psycopg2.extras.RealDictCursor) - - last_id = bytes.fromhex('51606a8181f7c6d0aff852106c3ec23ebc186439') - - while True: - with swh_db.transaction() as cur: - cur.execute(QUERY, (last_id, 10000)) - if not cur.rowcount > 0: - break - for db_rev in db.cursor_to_bytes(cur): - revision = converters.db_to_revision(db_rev) - check_revision(revision) - - last_id = revision['id'] diff --git a/utils/fix_revisions_from_dump.py b/utils/fix_revisions_from_dump.py deleted file mode 100755 index 1e9aea4..0000000 --- a/utils/fix_revisions_from_dump.py +++ /dev/null @@ -1,252 +0,0 @@ -#!/usr/bin/env python3 - -import copy -import itertools -import os -import pickle -import sys - -from swh.model import identifiers - - -def author_date_to_notnegutc(rev): - rev['date']['negative_utc'] = False - - -def author_date_to_negutc(rev): - rev['date']['negative_utc'] = True - - -DATE_NEGUTC_FIX = ('set author negutc', [ - (None, None), - (author_date_to_negutc, 'date_neg_utcoffset = true'), -]) - - -def committer_date_to_notnegutc(rev): - rev['committer_date']['negative_utc'] = False - - -def committer_date_to_negutc(rev): - rev['committer_date']['negative_utc'] = True - - -COMMITTER_DATE_NEGUTC_FIX = ('set committer negutc', [ - (None, None), - (committer_date_to_negutc, 'committer_date_neg_utcoffset = true'), -]) - - -def message_to_empty(rev): - rev['message'] = b'' - - -MESSAGE_EMPTY_FIX = ('empty instead of null message', [ - (None, None), - (message_to_empty, "message = ''"), -]) - - -def message_to_null(rev): - rev['message'] = None - - -MESSAGE_NULL_FIX = ('null instead of empty message', [ - (None, None), - (message_to_null, "message = NULL"), -]) - - -def message_add_nl_end(num_nl): - def fix(rev, num_nl=num_nl): - components = [rev['message'] if rev['message'] else b''] - components.extend([b'\n'] * num_nl) - rev['message'] = b''.join(components) - return fix - - -MESSAGE_ADD_NL_END_FIX = ('add newline to end of message', [ - (None, None), - (message_add_nl_end(1), "add 1 newline to end of message"), - (message_add_nl_end(2), "add 2 newlines to end of message"), - (message_add_nl_end(3), "add 3 newlines to end of message"), -]) - - -def message_add_nl_start(num_nl): - def fix(rev, num_nl=num_nl): - components = [b'\n'] * num_nl - components.append(rev['message'] if rev['message'] else b'') - rev['message'] = b''.join(components) - return fix - - -MESSAGE_ADD_NL_START_FIX = ('add newline to start of message', [ - (None, None), - (message_add_nl_start(1), "add 1 newline to start of message"), - (message_add_nl_start(2), "add 2 newlines to start of message"), - (message_add_nl_start(3), "add 3 newlines to start of message"), -]) - - -def author_name_doublespace(rev): - rev['author']['name'] = b''.join([rev['author']['name'], b' ']) - - -AUTHOR_NAME_ADD_SPC_FIX = ('author double space', [ - (None, None), - (author_name_doublespace, 'trailing space author name') -]) - - -def committer_name_doublespace(rev): - rev['committer']['name'] = b''.join([rev['committer']['name'], b' ']) - - -COMMITTER_NAME_ADD_SPC_FIX = ('committer double space', [ - (None, None), - (committer_name_doublespace, 'trailing space committer name') -]) - - -def author_name_null(rev): - rev['author']['name'] = None - - -AUTHOR_NAME_NULL_FIX = ('author name null', [ - (None, None), - (author_name_null, 'None author name') -]) - - -def author_email_null(rev): - rev['author']['email'] = None - - -AUTHOR_EMAIL_NULL_FIX = ('author email null', [ - (None, None), - (author_email_null, 'None author email') -]) - - -def committer_name_null(rev): - rev['committer']['name'] = None - - -COMMITTER_NAME_NULL_FIX = ('committer name null', [ - (None, None), - (committer_name_null, 'None committer name') -]) - - -def committer_email_null(rev): - rev['committer']['email'] = None - - -COMMITTER_EMAIL_NULL_FIX = ('committer email null', [ - (None, None), - (committer_email_null, 'None committer email') -]) - - -def author_add_spc(rev): - rev['author'] = b''.join([ - identifiers.normalize_author(rev['author']), b' ']) - - -AUTHOR_ADD_SPC_FIX = ('add trailing space to author specification', [ - (None, None), - (author_add_spc, 'add trailing space to author spec') -]) - - -def committer_add_spc(rev): - rev['committer'] = b''.join([ - identifiers.normalize_author(rev['committer']), b' ']) - - -COMMITTER_ADD_SPC_FIX = ('add trailing space to committer specification', [ - (None, None), - (committer_add_spc, 'add trailing space to committer spec') -]) - - -def fix_revision(revision): - data_fixups = [] - - id = identifiers.identifier_to_str(revision['id']) - - if revision['message'] is None: - data_fixups.append(MESSAGE_EMPTY_FIX) - - if revision['message'] == b'': - data_fixups.append(MESSAGE_NULL_FIX) - data_fixups.append(MESSAGE_ADD_NL_END_FIX) - data_fixups.append(MESSAGE_ADD_NL_START_FIX) - - if revision['date']['offset'] == 0 and \ - not revision['date']['negative_utc']: - data_fixups.append(DATE_NEGUTC_FIX) - - if revision['committer_date']['offset'] == 0 and \ - not revision['committer_date']['negative_utc']: - data_fixups.append(COMMITTER_DATE_NEGUTC_FIX) - - if not data_fixups: - computed_id = identifiers.revision_identifier(revision) - if id == computed_id: - return - - # Less credible fixups are first in the list, so they run last - data_fixups.insert(0, COMMITTER_ADD_SPC_FIX) - data_fixups.insert(0, AUTHOR_ADD_SPC_FIX) - - if revision['author']['name'] == b'': - data_fixups.insert(0, AUTHOR_NAME_NULL_FIX) - - if revision['author']['email'] == b'': - data_fixups.insert(0, AUTHOR_EMAIL_NULL_FIX) - - if revision['committer']['name'] == b'': - data_fixups.insert(0, COMMITTER_NAME_NULL_FIX) - - if revision['committer']['email'] == b'': - data_fixups.insert(0, COMMITTER_EMAIL_NULL_FIX) - - data_fixups.insert(0, COMMITTER_NAME_ADD_SPC_FIX) - data_fixups.insert(0, AUTHOR_NAME_ADD_SPC_FIX) - - data_fixup_functions = [functions for title, functions in data_fixups] - for corrections in itertools.product(*data_fixup_functions): - sql_fixups = [] - new_revision = copy.deepcopy(revision) - - for fun, sql_fixup in corrections: - if fun: - fun(new_revision) - if sql_fixup: - sql_fixups.append(sql_fixup) - - computed_id = identifiers.revision_identifier(new_revision) - if id == computed_id: - if not sql_fixups: - return - - return id, sql_fixups - else: - return id, [] - - -if __name__ == '__main__': - - for hash in sys.stdin.readlines(): - hash = hash.strip() - filename = os.path.join('revs', hash[0:2], hash[2:4], hash) - with open(filename, 'rb') as f: - revision = pickle.load(f) - - id, fixes = fix_revision(revision) - if not fixes: - print(id) - else: - print(';'.join([id, 'FIXED'] + fixes))