diff --git a/swh/lang-dectection/cloner.py b/swh/lang-dectection/cloner.py
deleted file mode 100644
index d1a7d05..0000000
--- a/swh/lang-dectection/cloner.py
+++ /dev/null
@@ -1,7 +0,0 @@
-# Copyright (C) 2015-2016 The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-
-
diff --git a/swh/lang-dectection/ground_truth_arrange.py b/swh/lang-dectection/ground_truth_arrange.py
new file mode 100644
index 0000000..0e91b4f
--- /dev/null
+++ b/swh/lang-dectection/ground_truth_arrange.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+# Copyright (C) 2015-2016 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+"""Regroup the files of cloned repositories into one folder per language."""
+
+import os
+import sys
+import io
+import json
+from shutil import copyfile
+
+
+def _mkdir_if_missing(path):
+    """Create directory `path`, doing nothing if it already exists."""
+    try:
+        os.mkdir(path)
+    except FileExistsError:
+        pass
+
+
+def main(root):
+    """Copy the files listed in each ground truth into per-language folders.
+
+    For every non-hidden entry `d` of `root`, the JSON file
+    `root/../ground_truth/d` is loaded as a mapping whose keys name the
+    language folders and whose values list file paths inside `root/d`.
+    Each listed file is copied to
+    `root/../code_by_language/<key>/<n><ext>`, where `n` is a per-key
+    counter starting at 1 and `ext` is the extension of the original file.
+
+    Ground truth files that are not valid JSON are reported on stderr and
+    skipped.
+    """
+    root_ground_truth = os.path.join(root, "..", "ground_truth")
+    root_code_by_language = os.path.join(root, "..", "code_by_language")
+    counts = dict()
+
+    _mkdir_if_missing(root_code_by_language)
+
+    for d in os.listdir(root):
+        if d.startswith('.'):
+            continue
+        path_ground_truth = os.path.join(root_ground_truth, d)
+        # The context manager closes the file even when parsing fails.
+        with io.open(path_ground_truth) as ground_truth:
+            try:
+                j = json.load(ground_truth)
+            except json.decoder.JSONDecodeError:
+                print(path_ground_truth + " is not valid JSON, skipped.",
+                      file=sys.stderr)
+                continue
+        for k, files in j.items():
+            root_language = os.path.join(root_code_by_language, k)
+            _mkdir_if_missing(root_language)
+            for f in files:
+                # The counter is shared by all entries of `root`, so the
+                # generated names are unique within a language folder.
+                counts[k] = counts.get(k, 0) + 1
+                (_, ext) = os.path.splitext(f)
+                new_name = str(counts[k]) + ext
+                copy_src = os.path.join(root, d, f)
+                copy_des = os.path.join(root_language, new_name)
+                copyfile(copy_src, copy_des)
+                print(copy_des + " successfully copied.")
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Only argument acceptable is a path.")
+    else:
+        main(sys.argv[1])
diff --git a/swh/lang-dectection/repo_arrange.sh b/swh/lang-dectection/repo_arrange.sh
new file mode 100755
index 0000000..18d2297
--- /dev/null
+++ b/swh/lang-dectection/repo_arrange.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# This script uses linguist to get ground truth of source code identity.
+# Usage: repo_arrange.sh ROOT_FOLDER
+# For every non-hidden entry of ROOT_FOLDER, the JSON output of linguist is
+# written to ROOT_FOLDER/../ground_truth/<entry name>.
+
+ROOT_FOLDER=$1
+
+if [ $# -eq 1 ]
+then
+    GROUND_TRUTH_FOLDER="$ROOT_FOLDER/../ground_truth"
+    # Skip mkdir when the folder is left over from a previous run.
+    [ -d "$GROUND_TRUTH_FOLDER" ] || mkdir "$GROUND_TRUTH_FOLDER"
+    # Iterate with a glob instead of parsing `ls`: no IFS manipulation is
+    # needed and names with glob characters or newlines are handled.
+    for repo_path in "$ROOT_FOLDER"/*
+    do
+        # An unmatched glob is left as the literal pattern; skip it.
+        [ -e "$repo_path" ] || continue
+        repo=$(basename "$repo_path")
+        echo "Generating ground truth of '$repo'"
+        linguist "$repo_path" --json > "$GROUND_TRUTH_FOLDER/$repo"
+        echo "Done"
+    done
+else
+    echo "Please enter root folder correctly. One folder is needed."
+fi
diff --git a/swh/lang-dectection/repo_clone.sh b/swh/lang-dectection/repo_clone.sh
index 4253a2f..af4b848 100755
--- a/swh/lang-dectection/repo_clone.sh
+++ b/swh/lang-dectection/repo_clone.sh
@@ -1,23 +1,23 @@
 #!/bin/bash
 
 # This script uses git clone to get head version of git repositories.
 
 ROOT_FOLDER=$1
 OLDIFS=$IFS
 IFS=$'\n'
 
 if [ $# -eq 1 ]
 then
     for i in $( cat ../../languages );
     do
         for url in $( cat "../../dataset/repo_lists/$i" );
         do
             REPO_NAME="$(basename "$url")";
-            git clone --depth=1 $url "$ROOT_FOLDER/$i/$REPO_NAME";
+            git clone --depth=1 "$url" "$ROOT_FOLDER/$REPO_NAME";
         done
     done
 else
     echo "Please enter root folder correctly. One folder is needed."
 fi
 
 IFS=$OLDIFS