# Copyright (C) 2015-2016  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

#!/usr/bin/env python3
# coding: utf-8

"""Arrange a source tree into per-language folders of symlinks.

For every ``<root>/<group>/<repo>`` directory the script reads the
repository's ground truth (a JSON mapping ``language -> [files]``, or a
plain-text dump when the JSON is missing, empty or invalid) and creates
``<root>/../code_by_language/<language>/<start>-<end>/<n><ext>`` symlinks
pointing at the listed files, 1000 links per ``<start>-<end>`` folder.
"""

import os
import sys
import io
import json

# Number of links stored in each "<start>-<end>" sub-folder.
_BUCKET_SIZE = 1000


def _ensure_dir(path):
    """Create directory *path* unless it already exists."""
    os.makedirs(path, exist_ok=True)


def _load_json_ground_truth(path):
    """Return ``[(language, files), ...]`` read from the JSON file *path*.

    Returns ``None`` when the file is missing, empty or not valid JSON so
    that the caller can fall back to the plain-text ground truth.
    """
    try:
        with io.open(path) as stream:
            return list(json.load(stream).items())
    except (FileNotFoundError, json.decoder.JSONDecodeError):
        return None


def _read_text_ground_truth(path):
    """Return ``[(language, files), ...]`` parsed from the text dump *path*.

    Layout handled: a summary that ends at the first blank line, followed
    by sections made of a ``Language:`` header line and one file path per
    line, sections being separated by blank lines.
    """
    sections = []
    with io.open(path, 'r') as stream:
        # Skip the summary; it ends at the first blank line (or at EOF).
        for line in stream:
            if not line.strip():
                break

        files = None  # None means "the next non-blank line is a header".
        for line in stream:
            entry = line.strip()
            if files is None:
                language = entry.replace(':', '')
                if language:
                    files = []
                    sections.append((language, files))
            elif not entry:
                files = None
            else:
                files.append(entry)
    # Reaching EOF simply ends the last section: no trailing blank line
    # is required (the readline() based loop used to spin forever there).
    return sections


def _link_into_bucket(language_dir, index, source):
    """Symlink *source* as file number *index* below *language_dir*."""
    start = (index - 1) // _BUCKET_SIZE * _BUCKET_SIZE + 1
    end = start + _BUCKET_SIZE - 1
    bucket_dir = language_dir + '/' + str(start) + '-' + str(end)
    if index % _BUCKET_SIZE == 1:
        # First file of a bucket: its folder may not exist yet.
        _ensure_dir(bucket_dir)

    _, ext = os.path.splitext(source)
    link = bucket_dir + '/' + str(index) + ext
    if os.path.lexists(link):
        # Left over from a previous run; replace it instead of crashing.
        os.remove(link)
    # A symlink target is resolved relative to the link's own directory,
    # so a relative source path must be made absolute to stay valid.
    os.symlink(os.path.abspath(source), link)
    print('{} successfully copied.'.format(source))


def main(root):
    """Build ``<root>/../code_by_language`` from the ground truth files.

    Args:
        root: directory laid out as ``<root>/<group>/<repo>/...``.  Ground
            truth is read from ``<root>/../ground_truth/<group>/<repo>.json``
            and, as a fallback, from
            ``<root>/../ground_truth_text/<group>/<repo>``.

    Raises:
        FileNotFoundError: if a repository has neither a usable JSON
            ground truth nor a text one, or if a directory is missing.

    Entries whose name starts with ``.`` are ignored at both levels.  Files
    are numbered per language across all repositories, starting at 1.
    """
    root_ground_truth = root + '/../ground_truth'
    root_ground_truth_text = root_ground_truth + '_text'
    root_code_by_language = root + '/../code_by_language'
    counts = {}

    _ensure_dir(root_code_by_language)

    # Sorted so that the numbering is reproducible from one run to the next.
    for r in sorted(os.listdir(root)):
        if r.startswith('.'):
            continue
        for d in sorted(os.listdir(root + '/' + r)):
            if d.startswith('.'):
                continue

            sections = _load_json_ground_truth(
                root_ground_truth + '/' + r + '/' + d + '.json')
            if sections is None:
                sections = _read_text_ground_truth(
                    root_ground_truth_text + '/' + r + '/' + d)

            for language, files in sections:
                language_dir = root_code_by_language + '/' + language
                _ensure_dir(language_dir)
                for f in files:
                    counts[language] = counts.get(language, 0) + 1
                    _link_into_bucket(language_dir, counts[language],
                                      root + '/' + r + '/' + d + '/' + f)


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('Only argument acceptable is a path.')
    else:
        main(sys.argv[1])


# ---------------------------------------------------------------------------
# NOTE: the patch this file was recovered from also changed
# scripts/dataset/repo_arrange_text.sh, the script that produces the text
# ground truth read above.  Its post-patch content is kept here verbatim for
# reference only; it belongs in its own file.
#
#   #!/bin/bash
#
#   ROOT_FOLDER=$1
#   OLDIFS=$IFS
#   IFS=$'\n'
#
#   if [ $# -eq 1 ]
#   then
#       mkdir $ROOT_FOLDER/../ground_truth_text/;
#       for root in $( ls $ROOT_FOLDER -1 ); do
#           for repo in $( ls "$ROOT_FOLDER/$root" -1 ); do
#               GROUND_TRUTH=$ROOT_FOLDER/../ground_truth/$root/$repo.json;
#               if [[ -s $GROUND_TRUTH ]]; then
#                   echo "Ground truth of '$repo' exist!";
#               else
#                   mkdir "$ROOT_FOLDER/../ground_truth_text/$root"
#                   linguist "$ROOT_FOLDER/$root/$repo" --breakdown | grep -ax '.*' > $ROOT_FOLDER/../ground_truth_text/$root/$repo;
#                   echo "Done";
#               fi
#           done
#       done
#   else
#       echo "Please enter root folder correctly. One folder is needed."
#   fi
#
#   IFS=$OLDIFS
# ---------------------------------------------------------------------------