diff --git a/sql/anon-dump/anon-dump.sql b/sql/anon-dump/anon-dump.sql
new file mode 100644
index 0000000..f5bfb31
--- /dev/null
+++ b/sql/anon-dump/anon-dump.sql
@@ -0,0 +1,20 @@
+-- Export tables as pigz-compressed CSV files written to the current directory.
+-- Run with psql from the directory containing ./anonymize-email, which the
+-- person export is piped through.
+\timing on
+set bytea_output='escape';
+
+\copy content (sha1, sha1_git, sha256, blake2s256, length, ctime, status) to program 'pigz -c > content.csv.gz' (format csv);
+\copy skipped_content (sha1, sha1_git, sha256, blake2s256, length, ctime, status, reason) to program 'pigz -c > skipped_content.csv.gz' (format csv);
+
+\copy directory (id, dir_entries, file_entries, rev_entries) to program 'pigz -c > directory.csv.gz' (format csv);
+\copy directory_entry_dir (id, target, name, perms) to program 'pigz -c > directory_entry_dir.csv.gz' (format csv);
+\copy directory_entry_file (id, target, name, perms) to program 'pigz -c > directory_entry_file.csv.gz' (format csv);
+\copy directory_entry_rev (id, target, name, perms) to program 'pigz -c > directory_entry_rev.csv.gz' (format csv);
+
+\copy revision (id, date, date_offset, committer_date, committer_date_offset, type, directory, message, author, committer, synthetic, metadata, date_neg_utc_offset, committer_date_neg_utc_offset) to program 'pigz -c > revision.csv.gz' (format csv);
+\copy revision_history (id, parent_id, parent_rank) to program 'pigz -c > revision_history.csv.gz' (format csv);
+\copy release (id, target, date, date_offset, name, comment, author, synthetic, target_type, date_neg_utc_offset) to program 'pigz -c > release.csv.gz' (format csv);
+
+-- Emails are replaced by salted hashes before they reach the output file.
+\copy person (id, name, email) to program './anonymize-email | pigz -c > person.csv.gz' (format csv);
diff --git a/sql/anon-dump/anonymize-email b/sql/anon-dump/anonymize-email
new file mode 100755
index 0000000..b5d974d
--- /dev/null
+++ b/sql/anon-dump/anonymize-email
@@ -0,0 +1,49 @@
+#!/usr/bin/python3
+"""Anonymize the email column of a person CSV dump.
+
+Reads ``id,name,email`` rows from standard input and writes them to standard
+output with each email replaced by a salted SHA-1 hex digest.
+"""
+
+import csv
+import hashlib
+import random
+import string
+import sys
+
+
+SALT_LENGTH = 2
+CSV_DELIMITER = ','
+CSV_QUOTECHAR = '"'
+
+# Salts come from the operating system's random source rather than the
+# default Mersenne Twister generator, whose output is predictable.
+_SALT_RNG = random.SystemRandom()
+
+
+def anonymize_email(email):
+    """Return the SHA-1 hex digest of a fresh random salt followed by *email*.
+
+    The salt is SALT_LENGTH characters drawn from string.printable and is not
+    returned, so hashing the same email twice usually gives different digests.
+    """
+    salt = ''.join(_SALT_RNG.choice(string.printable)
+                   for _ in range(SALT_LENGTH))
+    # UTF-8 yields the same bytes as ASCII for ASCII input, but does not
+    # raise on addresses containing non-ASCII characters.
+    return hashlib.sha1((salt + email).encode('utf-8')).hexdigest()
+
+
+def main():
+    """Copy the CSV on stdin to stdout, anonymizing the third column."""
+    dump_in = csv.reader(sys.stdin,
+                         delimiter=CSV_DELIMITER, quotechar=CSV_QUOTECHAR)
+    dump_out = csv.writer(sys.stdout,
+                          delimiter=CSV_DELIMITER, quotechar=CSV_QUOTECHAR)
+    for (person_id, name, email) in dump_in:
+        anon_email = anonymize_email(email)
+        dump_out.writerow([person_id, name, anon_email])
+
+
+if __name__ == '__main__':
+    main()