diff --git a/ardumont/send-batch-sha1s.sh b/ardumont/send-batch-sha1s.sh
new file mode 100755
index 0000000..e029559
--- /dev/null
+++ b/ardumont/send-batch-sha1s.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+#
+# Send the lines of CONTENTS_FILE to the swh.indexer.producer module, one
+# slice of NUMBER_CONTENTS lines per iteration, sleeping SLEEP_TIME seconds
+# between iterations. The number of completed iterations is stored in
+# POSITION_FILE so that a restarted run resumes after the last slice sent.
+
+# time to wait for scheduling the next batch
+SLEEP_TIME=200
+# number of contents to process as a batch
+BATCH_SIZE=1000
+# total number of contents to send in one iteration
+NUMBER_CONTENTS=100000
+# gzip-compressed list to read, one entry per line
+CONTENTS_FILE=/srv/storage/space/lists/azure-rehash/contents-sha1-to-rehash.txt.gz
+# file recording how many iterations have already been sent
+POSITION_FILE=position_indexer
+
+# Resume from the recorded iteration count, or start from the beginning.
+if [[ -f "$POSITION_FILE" ]]; then
+  START=$(cat "$POSITION_FILE")
+else
+  START=0
+fi
+if ! [[ "$START" =~ ^[0-9]+$ ]]; then
+  echo "Invalid iteration count in $POSITION_FILE: '$START'" >&2
+  exit 1
+fi
+
+while true; do
+  # number of lines already sent by the previous iterations
+  POSITION=$((NUMBER_CONTENTS * START))
+
+  echo "Sending $NUMBER_CONTENTS new contents from $POSITION"
+  # `tail -n +K` starts its output AT line K (1-based), so skipping the
+  # POSITION lines already sent means starting at line POSITION + 1. Starting
+  # at POSITION itself re-sends line NUMBER_CONTENTS in the second iteration,
+  # because GNU tail treats +0 like +1.
+  gzip -dc "$CONTENTS_FILE" \
+    | tail -n "+$((POSITION + 1))" \
+    | head -n "$NUMBER_CONTENTS" \
+    | SWH_WORKER_INSTANCE=swh_indexer_orchestrator \
+      python3 -m swh.indexer.producer --batch "$BATCH_SIZE"
+  START=$((START + 1))
+  echo "$START" > "$POSITION_FILE"
+
+  echo "Waiting for computations to be done"
+  echo
+  sleep "$SLEEP_TIME"
+done