diff --git a/swh/lang-dectection/ground_truth_arrange.py b/swh/lang-dectection/ground_truth_arrange.py
index 0e91b4f..dfb205b 100644
--- a/swh/lang-dectection/ground_truth_arrange.py
+++ b/swh/lang-dectection/ground_truth_arrange.py
@@ -1,52 +1,108 @@
 # Copyright (C) 2015-2016 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 #!/usr/bin/env python3
 # coding: utf-8
 
 import os
 import sys
 import io
 import json
 
 from shutil import copyfile
 
 def main(root):
-    root_ground_truth = root + "/../ground_truth"
-    root_code_by_language = root + "/../code_by_language"
+    """Sort the source files of every repository under root by language.
+
+    For each non-hidden entry d of root, the ground truth is loaded from
+    root/../ground_truth/d (JSON object mapping a language to a list of
+    file paths) or, when that file is not valid JSON, parsed from
+    root/../ground_truth_text/d (a summary block, then one "<language>:"
+    header followed by file paths per language, blocks separated by blank
+    lines).  Each listed file root/d/<file> is copied to
+    root/../code_by_language/<language>/<n><ext>, where n is a counter kept
+    per language across all repositories.
+    """
+    root_ground_truth = root + '/../ground_truth'
+    root_ground_truth_text = root_ground_truth + '_text'
+    root_code_by_language = root + '/../code_by_language'
     counts = dict()
 
     try:
         os.mkdir(root_code_by_language)
     except FileExistsError:
         pass
 
     for d in os.listdir(root):
         if not d.startswith('.'):
-            ground_truth = io.open(root_ground_truth + "/" + d)
+            # Open before entering the try block so that the finally clause
+            # below never sees an unbound (or stale) file handle.
+            ground_truth = io.open(root_ground_truth + '/' + d)
             try:
-                j = json.load(ground_truth)
-                for k in j.keys():
-                    root_language = root_code_by_language + "/" + k
-                    try:
-                        os.mkdir(root_language)
-                    except FileExistsError:
-                        pass
-                    for f in j.get(k):
-                        counts[k] = counts.get(k, 0) + 1
-                        (_,ext) = os.path.splitext(f)
-                        new_name = str(counts[k]) + ext
-                        copy_src = root + "/" + d + "/" + f
-                        copy_des = root_language + "/" + new_name
-                        copyfile(copy_src, copy_des)
-                        print(copy_des + " successfully copied.")
-            except json.decoder.JSONDecodeError:
-                pass
+                try:
+                    j = json.load(ground_truth)
+                    for k in j.keys():
+                        root_language = root_code_by_language + '/' + k
+                        try:
+                            os.mkdir(root_language)
+                        except FileExistsError:
+                            pass
+                        for f in j.get(k):
+                            counts[k] = counts.get(k, 0) + 1
+                            (_,ext) = os.path.splitext(f)
+                            new_name = str(counts[k]) + ext
+                            copy_src = root + '/' + d + '/' + f
+                            copy_des = root_language + '/' + new_name
+                            copyfile(copy_src, copy_des)
+                            print(copy_des + ' successfully copied.')
+                except json.decoder.JSONDecodeError:
+                    # Not valid JSON: use the text ground truth instead.
+                    ground_truth.close()
+                    ground_truth = io.open(root_ground_truth_text + '/' + d, 'r', encoding="utf-8")
+                    # Echo the summary block, which ends at the first blank line.
+                    while(True):
+                        line = ground_truth.readline()
+                        if line == '\n' or line == '':
+                            break
+                        else:
+                            stripped = line.strip()
+                            print(stripped)
+
+                    # One "<language>:" header plus its files per block.
+                    while(True):
+                        line = ground_truth.readline()
+                        if line == '':
+                            break
+                        else:
+                            stripped = line.strip()
+                            language = stripped.replace(':','')
+                            root_language = root_code_by_language + '/' + language
+                            try:
+                                os.mkdir(root_language)
+                            except FileExistsError:
+                                pass
+                            while(True):
+                                line = ground_truth.readline()
+                                # Stop at EOF too: readline() keeps returning ''
+                                # there, which is not a file name.
+                                if line == '\n' or line == '':
+                                    break
+                                else:
+                                    f = line.strip()
+                                    counts[language] = counts.get(language, 0) + 1
+                                    (_,ext) = os.path.splitext(f)
+                                    new_name = str(counts[language]) + ext
+                                    copy_src = root + '/' + d + '/' + f
+                                    copy_des = root_language + '/' + new_name
+                                    copyfile(copy_src, copy_des)
+                                    print(copy_des + ' successfully copied.')
+            finally:
+                ground_truth.close()
 
 if __name__ == "__main__":
     if len(sys.argv) != 2:
         print("Only argument acceptable is a path.")
     else:
         main(sys.argv[1])
diff --git a/swh/lang-dectection/repo_arrange_text.sh b/swh/lang-dectection/repo_arrange_text.sh
new file mode 100755
index 0000000..3bdc6a0
--- /dev/null
+++ b/swh/lang-dectection/repo_arrange_text.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# This script uses linguist to write a text ground truth for every repository
+# whose ground truth file is missing or empty.
+
+ROOT_FOLDER=$1
+OLDIFS=$IFS
+IFS=$'\n'
+
+if [ $# -eq 1 ]
+then
+    mkdir -p "$ROOT_FOLDER/../ground_truth_text/";
+    for repo in $( ls -1 "$ROOT_FOLDER" );
+    do
+        GROUND_TRUTH="$ROOT_FOLDER/../ground_truth/$repo";
+        if [[ -s "$GROUND_TRUTH" ]];
+        then
+            echo "Ground truth of '$repo' exist!";
+        else
+            linguist "$ROOT_FOLDER/$repo" --breakdown > "$ROOT_FOLDER/../ground_truth_text/$repo";
+            echo "Done";
+        fi
+    done
+else
+    echo "Please enter root folder correctly. One folder is needed."
+fi
+
+IFS=$OLDIFS
diff --git a/swh/lang-dectection/repo_clone.sh b/swh/lang-dectection/repo_clone.sh
index ee02eed..d2e6a83 100755
--- a/swh/lang-dectection/repo_clone.sh
+++ b/swh/lang-dectection/repo_clone.sh
@@ -1,23 +1,25 @@
 #!/bin/bash
 
 # This script uses git clone to get head version of git repositories.
 
 ROOT_FOLDER=$1
 OLDIFS=$IFS
 IFS=$'\n'
 
 if [ $# -eq 1 ]
 then
     for i in $( cat ../../languages );
     do
         for url in $( cat "../../dataset/repo_lists/$i" );
         do
+            # Clone into "<user>><repo>"; <user> is the parent path segment of the URL.
+            USER_NAME="$(basename "$(dirname "$url")")";
             REPO_NAME="$(basename "$url" .git)";
-            git clone --depth=1 $url "$ROOT_FOLDER/$REPO_NAME";
+            git clone --depth=1 "$url" "$ROOT_FOLDER/$USER_NAME>$REPO_NAME";
         done
     done
 else
     echo "Please enter root folder correctly. One folder is needed."
 fi
 
 IFS=$OLDIFS