diff --git a/vlorentz/objstorage-replay-exclusion-file/README.txt b/vlorentz/objstorage-replay-exclusion-file/README.txt
new file mode 100644
index 0000000..aa2cc37
--- /dev/null
+++ b/vlorentz/objstorage-replay-exclusion-file/README.txt
@@ -0,0 +1,7 @@
+How to use:
+
+1. get unsorted_inventory.txt.gz
+2. sort it (--temporary-directory is needed if /tmp doesn't have >100GB free):
+    pv unsorted_inventory.txt.gz | pigz -d | LANG=C sort --temporary-directory=$HOME/tmp/ --parallel=12 --buffer-size=1G --compress-program=pigz | pv --wait --line-mode | pigz > sorted_inventory.txt.gz
+3. convert hex digests to binary:
+    pv sorted_inventory.txt.gz | pigz -d | python3 hash_txt_to_bytes.py > /srv/softwareheritage/cassandra-test-0/scratch/sorted_inventory.bin
diff --git a/vlorentz/objstorage-replay-exclusion-file/hash_txt_to_bytes.py b/vlorentz/objstorage-replay-exclusion-file/hash_txt_to_bytes.py
new file mode 100644
index 0000000..2f9367d
--- /dev/null
+++ b/vlorentz/objstorage-replay-exclusion-file/hash_txt_to_bytes.py
@@ -0,0 +1,42 @@
+"""Convert hex SHA1 digests read from stdin into packed binary on stdout.
+
+Input is text with one digest per line.  Every line made of exactly 40
+lowercase hexadecimal characters is written to stdout as its 20 raw bytes,
+back to back and in input order.  Any other line is skipped: it is reported
+on stderr, except for 2-character lines, which are dropped silently.
+Processing runs until end of input; a blank line is reported and skipped
+like any other malformed line instead of ending the conversion.
+"""
+
+import sys
+
+SHA1_CHARS = frozenset('0123456789abcdef')
+SHA1_HEX_LENGTH = 40
+
+
+def main():
+    """Stream digests from stdin to stdout, converting each one to bytes."""
+    write = sys.stdout.buffer.write
+    # Iterating the stream ends cleanly at EOF.  readline() returns '' there,
+    # which made the previous `line[-1]` check raise IndexError once all input
+    # had been consumed.
+    for line in sys.stdin:
+        # A line holds at most one trailing newline; the last line of the
+        # input may have none at all.
+        digest = line.rstrip('\n')
+        if len(digest) != SHA1_HEX_LENGTH or not SHA1_CHARS.issuperset(digest):
+            # 2-character lines are ignored without a message (presumably
+            # non-digest inventory entries -- TODO confirm with the author).
+            if len(digest) != 2:
+                print('Rejecting: %r' % line, file=sys.stderr)
+            continue
+        try:
+            write(bytes.fromhex(digest))
+        except Exception:
+            # Name the offending line before propagating the failure.
+            print(repr(line), file=sys.stderr)
+            raise
+
+
+if __name__ == '__main__':
+    main()