Use "/usr/bin/env python3" in script shebangs; fix two latent bugs

Replace the hard-coded "#!/usr/bin/python3" interpreter line with
"#!/usr/bin/env python3" in:

  * bin/swh-storage-add-dir
  * utils/dump_revisions.py
  * utils/fix_revisions_from_dump.py

Additional fixes carried by this revision of the patch:

  * bin/swh-storage-add-dir: list_content() read every file with
    open(path, 'rb').read() and never closed the handle.  Read through a
    `with` block so each handle is closed before the next file is opened.

  * utils/fix_revisions_from_dump.py: fix_revision() has two bare `return`
    paths (taken when the stored id already equals the recomputed id with
    no fixup applied), but the __main__ loop unpacked its result into two
    names unconditionally, which raises TypeError on None.  Such revisions
    are now skipped.

NOTE(review): this patch was recovered from a whitespace-collapsed copy.
Line breaks, indentation, blank lines and the layout of the SQL in QUERY
are a best-effort reconstruction sized to the original hunk lengths
(39 / 62 / 239 old lines); regenerate with `git diff` against the tree
before applying.
NOTE(review): in fix_revision() the nesting is inferred: the two
"add newline" fixups are taken to be appended unconditionally and the
trailing `else` is taken to belong to the `for` loop -- confirm.
NOTE(review): the `index` lines still carry the blob ids of the
shebang-only change.
NOTE(review): fix_revisions_from_dump.py unpickles files named on stdin;
only feed it dumps produced by dump_revisions.py.

diff --git a/bin/swh-storage-add-dir b/bin/swh-storage-add-dir
index 32627c1a..2bccc7b3 100755
--- a/bin/swh-storage-add-dir
+++ b/bin/swh-storage-add-dir
@@ -1,39 +1,40 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
 
 # Copyright (C) 2015 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import logging
 import os
 import sys
 
 from swh import storage
 from swh.core.hashutil import _hash_fname
 
 if __name__ == '__main__':
     try:
         db_connstring = sys.argv[1]
         obj_root = sys.argv[2]
         dirname = sys.argv[3]
     except IndexError:
         print('Usage: swh-storage-add-dir' +
               ' DB_CONNSTRING OBJ_STORAGE_DIR DATA_DIR')
         print('Example: swh-storage-add-dir "dbname=swh user=foo"' +
               ' /srv/softwareheritage/objects /usr/src/linux-4.2')
         sys.exit(1)
 
     logging.basicConfig(level=logging.INFO)
 
     storage = storage.Storage(db_connstring, obj_root)
 
     def list_content():
         for root, _dirs, files in os.walk(dirname):
             for name in files:
                 path = os.path.join(root, name)
                 cont = _hash_fname(path)
-                cont['data'] = open(path, 'rb').read()
+                with open(path, 'rb') as f:
+                    cont['data'] = f.read()
                 yield cont
 
     storage.content_add(list_content())
diff --git a/utils/dump_revisions.py b/utils/dump_revisions.py
index 8b3f523e..45421f34 100755
--- a/utils/dump_revisions.py
+++ b/utils/dump_revisions.py
@@ -1,62 +1,62 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
 
 import os
 import pickle
 
 import psycopg2.extras
 
 from swh.storage import converters, db
 from swh.model import identifiers
 
 
 QUERY = '''
 select
     r.id,
     r.date, r.date_offset, r.date_neg_utc_offset,
     r.committer_date, r.committer_date_offset,
     r.committer_date_neg_utc_offset,
     r.type, r.directory, r.message,
     a.id as author_id, a.name as author_name, a.email as author_email,
     c.id as committer_id, c.name as committer_name,
     c.email as committer_email,
     r.metadata, r.synthetic,
     array(select rh.parent_id::bytea from revision_history rh
           where rh.id = r.id order by rh.parent_rank) as parents
 from revision r
 left join person a on a.id = r.author
 left join person c on c.id = r.committer
 where r.id > %s
 order by r.id
 limit %s
 '''
 
 
 def dump_revision(revision):
     rev_id = identifiers.identifier_to_str(revision['id'])
     dirs = 'revs/%s/%s' % (rev_id[0:2], rev_id[2:4])
     os.makedirs(dirs, exist_ok=True)
     with open(os.path.join(dirs, rev_id), 'wb') as f:
         pickle.dump(revision, f)
 
 
 def check_revision(revision):
     id = identifiers.identifier_to_str(revision['id'])
     computed_id = identifiers.revision_identifier(revision)
     if id != computed_id:
         dump_revision(revision)
 
 
 if __name__ == '__main__':
     swh_db = db.Db.connect('service=swh',
                            cursor_factory=psycopg2.extras.RealDictCursor)
     last_id = bytes.fromhex('51606a8181f7c6d0aff852106c3ec23ebc186439')
 
     while True:
         with swh_db.transaction() as cur:
             cur.execute(QUERY, (last_id, 10000))
             if not cur.rowcount > 0:
                 break
             for db_rev in db.cursor_to_bytes(cur):
                 revision = converters.db_to_revision(db_rev)
                 check_revision(revision)
                 last_id = revision['id']
diff --git a/utils/fix_revisions_from_dump.py b/utils/fix_revisions_from_dump.py
index 1b67148c..32fd7d79 100755
--- a/utils/fix_revisions_from_dump.py
+++ b/utils/fix_revisions_from_dump.py
@@ -1,239 +1,244 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
 
 import copy
 import itertools
 import os
 import pickle
 import sys
 
 from swh.model import identifiers
 
 
 def author_date_to_notnegutc(rev):
     rev['date']['negative_utc'] = False
 
 
 def author_date_to_negutc(rev):
     rev['date']['negative_utc'] = True
 
 
 DATE_NEGUTC_FIX = ('set author negutc', [
     (None, None),
     (author_date_to_negutc, 'date_neg_utcoffset = true'),
 ])
 
 
 def committer_date_to_notnegutc(rev):
     rev['committer_date']['negative_utc'] = False
 
 
 def committer_date_to_negutc(rev):
     rev['committer_date']['negative_utc'] = True
 
 
 COMMITTER_DATE_NEGUTC_FIX = ('set committer negutc', [
     (None, None),
     (committer_date_to_negutc, 'committer_date_neg_utcoffset = true'),
 ])
 
 
 def message_to_empty(rev):
     rev['message'] = b''
 
 
 MESSAGE_EMPTY_FIX = ('empty instead of null message', [
     (None, None),
     (message_to_empty, "message = ''"),
 ])
 
 
 def message_to_null(rev):
     rev['message'] = None
 
 
 MESSAGE_NULL_FIX = ('null instead of empty message', [
     (None, None),
     (message_to_null, "message = NULL"),
 ])
 
 
 def message_add_nl_end(num_nl):
     def fix(rev, num_nl=num_nl):
         components = [rev['message'] if rev['message'] else b'']
         components.extend([b'\n'] * num_nl)
         rev['message'] = b''.join(components)
     return fix
 
 
 MESSAGE_ADD_NL_END_FIX = ('add newline to end of message', [
     (None, None),
     (message_add_nl_end(1), "add 1 newline to end of message"),
     (message_add_nl_end(2), "add 2 newlines to end of message"),
     (message_add_nl_end(3), "add 3 newlines to end of message"),
 ])
 
 
 def message_add_nl_start(num_nl):
     def fix(rev, num_nl=num_nl):
         components = [b'\n'] * num_nl
         components.append(rev['message'] if rev['message'] else b'')
         rev['message'] = b''.join(components)
     return fix
 
 
 MESSAGE_ADD_NL_START_FIX = ('add newline to start of message', [
     (None, None),
     (message_add_nl_start(1), "add 1 newline to start of message"),
     (message_add_nl_start(2), "add 2 newlines to start of message"),
     (message_add_nl_start(3), "add 3 newlines to start of message"),
 ])
 
 
 def author_name_doublespace(rev):
     rev['author']['name'] = b''.join([rev['author']['name'], b' '])
 
 
 AUTHOR_NAME_ADD_SPC_FIX = ('author double space', [
     (None, None),
     (author_name_doublespace, 'trailing space author name')
 ])
 
 
 def committer_name_doublespace(rev):
     rev['committer']['name'] = b''.join([rev['committer']['name'], b' '])
 
 
 COMMITTER_NAME_ADD_SPC_FIX = ('committer double space', [
     (None, None),
     (committer_name_doublespace, 'trailing space committer name')
 ])
 
 
 def author_name_null(rev):
     rev['author']['name'] = None
 
 
 AUTHOR_NAME_NULL_FIX = ('author name null', [
     (None, None),
     (author_name_null, 'None author name')
 ])
 
 
 def author_email_null(rev):
     rev['author']['email'] = None
 
 
 AUTHOR_EMAIL_NULL_FIX = ('author email null', [
     (None, None),
     (author_email_null, 'None author email')
 ])
 
 
 def committer_name_null(rev):
     rev['committer']['name'] = None
 
 
 COMMITTER_NAME_NULL_FIX = ('committer name null', [
     (None, None),
     (committer_name_null, 'None committer name')
 ])
 
 
 def committer_email_null(rev):
     rev['committer']['email'] = None
 
 
 COMMITTER_EMAIL_NULL_FIX = ('committer email null', [
     (None, None),
     (committer_email_null, 'None committer email')
 ])
 
 
 def author_add_spc(rev):
     rev['author'] = b''.join([
         identifiers.normalize_author(rev['author']), b' '])
 
 
 AUTHOR_ADD_SPC_FIX = ('add trailing space to author specification', [
     (None, None),
     (author_add_spc, 'add trailing space to author spec')
 ])
 
 
 def committer_add_spc(rev):
     rev['committer'] = b''.join([
         identifiers.normalize_author(rev['committer']), b' '])
 
 
 COMMITTER_ADD_SPC_FIX = ('add trailing space to committer specification', [
     (None, None),
     (committer_add_spc, 'add trailing space to committer spec')
 ])
 
 
 def fix_revision(revision):
     data_fixups = []
     id = identifiers.identifier_to_str(revision['id'])
 
     if revision['message'] is None:
         data_fixups.append(MESSAGE_EMPTY_FIX)
     if revision['message'] == b'':
         data_fixups.append(MESSAGE_NULL_FIX)
 
     data_fixups.append(MESSAGE_ADD_NL_END_FIX)
     data_fixups.append(MESSAGE_ADD_NL_START_FIX)
     if revision['date']['offset'] == 0 and \
        not revision['date']['negative_utc']:
         data_fixups.append(DATE_NEGUTC_FIX)
     if revision['committer_date']['offset'] == 0 and \
        not revision['committer_date']['negative_utc']:
         data_fixups.append(COMMITTER_DATE_NEGUTC_FIX)
 
     if not data_fixups:
         computed_id = identifiers.revision_identifier(revision)
         if id == computed_id:
             return
 
     # Less credible fixups are first in the list, so they run last
     data_fixups.insert(0, COMMITTER_ADD_SPC_FIX)
     data_fixups.insert(0, AUTHOR_ADD_SPC_FIX)
     if revision['author']['name'] == b'':
         data_fixups.insert(0, AUTHOR_NAME_NULL_FIX)
     if revision['author']['email'] == b'':
         data_fixups.insert(0, AUTHOR_EMAIL_NULL_FIX)
     if revision['committer']['name'] == b'':
         data_fixups.insert(0, COMMITTER_NAME_NULL_FIX)
     if revision['committer']['email'] == b'':
         data_fixups.insert(0, COMMITTER_EMAIL_NULL_FIX)
     data_fixups.insert(0, COMMITTER_NAME_ADD_SPC_FIX)
     data_fixups.insert(0, AUTHOR_NAME_ADD_SPC_FIX)
 
     data_fixup_functions = [functions for title, functions in data_fixups]
     for corrections in itertools.product(*data_fixup_functions):
         sql_fixups = []
         new_revision = copy.deepcopy(revision)
         for fun, sql_fixup in corrections:
             if fun:
                 fun(new_revision)
             if sql_fixup:
                 sql_fixups.append(sql_fixup)
         computed_id = identifiers.revision_identifier(new_revision)
         if id == computed_id:
             if not sql_fixups:
                 return
             return id, sql_fixups
     else:
         return id, []
 
 
 if __name__ == '__main__':
     for hash in sys.stdin.readlines():
         hash = hash.strip()
         filename = os.path.join('revs', hash[0:2], hash[2:4], hash)
         with open(filename, 'rb') as f:
             revision = pickle.load(f)
-        id, fixes = fix_revision(revision)
+        result = fix_revision(revision)
+        if result is None:
+            # fix_revision() returns None when the stored id already matches
+            # the recomputed one without any fixup: nothing to report.
+            continue
+        id, fixes = result
         if not fixes:
             print(id)
         else:
             print(';'.join([id, 'FIXED'] + fixes))