#!/bin/bash
#
# scripts/dataset/build_dataset.sh
# Pipeline of dataset construction.
#
# Usage: ./build_dataset.sh ROOT_FOLDER
#
# Creates ROOT_FOLDER (if needed) and then runs, in order:
#   1. ./repo_lister.sh
#   2. ./repo_clone.sh ROOT_FOLDER
#   3. ./repo_arrange.sh ROOT_FOLDER
#   4. ./repo_arrange_text.sh ROOT_FOLDER
#   5. python3 ground_truth_arrange.py
#
# The helpers are referenced relative to the current directory, so the script
# must be started from the directory that contains them.
#
# Exit status: 0 on success, 2 on wrong arguments, 1 if ROOT_FOLDER cannot be
# created, otherwise the exit status of the first step that failed.

#######################################
# Run one pipeline step, framed by its progress messages.
# Arguments: $1 - message printed before the step
#            $2 - message printed after the step succeeded
#            $3.. - command (and arguments) to execute
# Outputs:   progress messages on stdout, failure notice on stderr
# Returns:   0 on success, otherwise the status of the failed command
#######################################
_run_step() {
  local start_msg=$1
  local done_msg=$2
  local status=0
  shift 2

  echo "$start_msg"
  "$@" || status=$?
  if [ "$status" -ne 0 ]; then
    # Stop here: later steps consume the output of this one.
    echo "Step failed with status $status: $*" >&2
    return "$status"
  fi
  echo "$done_msg"
}

#######################################
# Build the raw file dataset under the given root folder.
# Arguments: $1 - root folder that receives the dataset
# Outputs:   progress messages on stdout, errors on stderr
# Returns:   0 on success, 2 on usage error, non-zero on any failed step
#######################################
build_dataset() {
  # Exactly one non-empty argument is required. (The original test was
  # written "[ $# -eq 1]", which is always an error and never true.)
  if [ "$#" -ne 1 ] || [ -z "$1" ]; then
    echo "Please enter root folder correctly. Only one folder is needed." >&2
    return 2
  fi

  local root_folder=$1

  # -p keeps re-runs against an already existing folder working.
  if ! mkdir -p -- "$root_folder"; then
    echo "Cannot create root folder '$root_folder'." >&2
    return 1
  fi

  _run_step "[[Start listing repositories.]]" \
    "[[Repository lists built.]]" \
    ./repo_lister.sh || return

  _run_step "[[Start cloning repositories to '$root_folder']]" \
    "[[Clone completed.]]" \
    ./repo_clone.sh "$root_folder" || return

  _run_step "[[Start calculating ground truth.]]" \
    "[[JSON ground truth calculated.]]" \
    ./repo_arrange.sh "$root_folder" || return

  _run_step "[[Start completing ground truth with text result.]]" \
    "[[Ground truth completed.]]" \
    ./repo_arrange_text.sh "$root_folder" || return

  # NOTE(review): unlike the other steps, this one is not given the root
  # folder - confirm that ground_truth_arrange.py locates it on its own.
  _run_step "[[Start arranging files by language.]]" \
    "[[Raw file dataset built in '$root_folder'.]]" \
    python3 ground_truth_arrange.py || return
}

# Last command of the script, so its status becomes the script's exit status.
build_dataset "$@"