diff --git a/scripts/dataset/ground_truth_arrange.py b/scripts/dataset/ground_truth_arrange.py
index 47e2dd3..65b09f0 100644
--- a/scripts/dataset/ground_truth_arrange.py
+++ b/scripts/dataset/ground_truth_arrange.py
@@ -1,107 +1,113 @@
 # Copyright (C) 2015-2016 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 #!/usr/bin/env python3
 # coding: utf-8
 
 import os
 import sys
 import io
 import json
 
 
 def main(root):
     root_ground_truth = root + '/../ground_truth'
     root_ground_truth_text = root_ground_truth + '_text'
     root_code_by_language = root + '/../code_by_language'
     counts = dict()
 
     try:
         os.mkdir(root_code_by_language)
     except FileExistsError:
         pass
 
     for r in os.listdir(root):
         if not r.startswith('.'):
             for d in os.listdir(root + '/' + r):
                 if not d.startswith('.'):
                     try:
                         ground_truth = io.open(root_ground_truth + '/' + r + '/' + d + '.json')
                         try:
                             j = json.load(ground_truth)
                             for language in j.keys():
                                 root_language = root_code_by_language + '/' + language
                                 try:
                                     os.mkdir(root_language)
                                 except FileExistsError:
                                     pass
                                 for f in j.get(language):
                                     counts[language] = counts.get(language, 0) + 1
                                     start = (counts[language] - 1) // 1000 * 1000 + 1
                                     end = start + 999
                                     root_count = root_language + '/' + str(start) + '-' + str(end)
                                     if counts[language] % 1000 == 1:
                                         try:
                                             os.mkdir(root_count)
                                         except FileExistsError:
                                             pass
                                     (_,ext) = os.path.splitext(f)
                                     new_name = str(counts[language]) + ext
                                     copy_src = root + '/' + r + '/' + d + '/' + f
                                     copy_des = root_count + '/' + new_name
-                                    os.symlink(copy_src, copy_des)
-                                    print(copy_src + ' successfully copied.')
+                                    try:
+                                        os.symlink(copy_src, copy_des)
+                                        print('{} successfully copied.'.format(copy_src))
+                                    except FileExistsError:
+                                        pass
                         except json.decoder.JSONDecodeError:
                             ground_truth.close()
                             ground_truth = io.open(root_ground_truth_text + '/' + r + '/' + d, 'r')
                             while(True):
                                 line = ground_truth.readline()
                                 if line == '\n' or line == '':
                                     break
                                 else:
                                     pass
                             while(True):
                                 line = ground_truth.readline()
                                 stripped = line.strip()
                                 if line == '':
                                     break
                                 else:
                                     stripped = line.strip()
                                     language = stripped.replace(':','')
                                     root_language = root_code_by_language + '/' + language
                                     try:
                                         os.mkdir(root_language)
                                     except FileExistsError:
                                         pass
                                     while(True):
                                         line = ground_truth.readline()
                                         if line == '\n':
                                             break
                                         else:
                                             f = line.strip()
                                             counts[language] = counts.get(language, 0) + 1
                                             start = (counts[language] - 1) // 1000 * 1000 + 1
                                             end = start + 999
                                             root_count = root_language + '/' + str(start) + '-' + str(end)
                                             if counts[language] % 1000 == 1:
                                                 try:
                                                     os.mkdir(root_count)
                                                 except FileExistsError:
                                                     pass
                                             (_,ext) = os.path.splitext(f)
                                             new_name = str(counts[language]) + ext
                                             copy_src = root + '/' + r + '/' + d + '/' + f
                                             copy_des = root_count + '/' + new_name
-                                            os.symlink(copy_src, copy_des)
-                                            print('{} successfully copied.'.format(copy_src))
+                                            try:
+                                                os.symlink(copy_src, copy_des)
+                                                print('{} successfully copied.'.format(copy_src))
+                                            except FileExistsError:
+                                                pass
                     finally:
                         ground_truth.close()
 
 
 if __name__ == '__main__':
     if len(sys.argv) != 2:
         print('Only argument acceptable is a path.')
     else:
         main(sys.argv[1])
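
Note on the arrangement scheme above: the k-th file collected for a language
is symlinked into a bucket folder named '<start>-<end>' that holds at most
1000 entries. A minimal standalone sketch of that mapping (bucket_path is a
hypothetical helper for illustration, not part of the patch):

    def bucket_path(root_language, k):
        # k is the 1-based index of the file within its language
        start = (k - 1) // 1000 * 1000 + 1
        end = start + 999
        return '{}/{}-{}'.format(root_language, start, end)

    # bucket_path('code_by_language/Python', 1)    -> '.../Python/1-1000'
    # bucket_path('code_by_language/Python', 1000) -> '.../Python/1-1000'
    # bucket_path('code_by_language/Python', 1001) -> '.../Python/1001-2000'
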
""" + diff --git a/swh/langdetect/ngram.py b/swh/langdetect/ngram.py new file mode 100644 index 0000000..9db7303 --- /dev/null +++ b/swh/langdetect/ngram.py @@ -0,0 +1,63 @@ +""" +Calculate frequencies for classes +""" + +import os,sys,nltk + +from collections import Counter +from nltk.util import ngrams +from utils.common import tokenizer, file_to_string +from utils.training import build_training_set + +class NGramTrain: + + def __init__(self, root): + self._ngram_list = {} + self._prob = {} + self._root = root + + def train(self): + root_training_set = self._root + '/../training_set' + root_model = '../../dataset/model' + + try: + if len(os.listdir(root_training_set)) == 0: + build_training_set(self._root) + except FileNotFoundError: + os.mkdir(root_training_set) + build_training_set(self._root) + + for language in os.listdir(root_training_set): + if not language.startswith('.'): + _ngram_list[language] = {} + _prob[language] = None + root_training_set_language = root_training_set + '/' + language + for f in os.listdir(root_training_set_language): + if not f.startswith('.'): + _ngrams_max(language, f) + + def _ngrams_max(self, language, filename, n=3): + ngram_lan = self._ngram_list[language] + tokens = tokenizer(file_to_string(filename)) + for i in range(1, n): + ngram_lan[i + 1] = ngram_lan.get(i + 1, []) + \ + list(ngram(tokens, i + 1, + pad_left = True, + pad_right = True, + left_pad_symbol = '$BOS$', + right_pad_symbol = '$EOS$')) + + def _kneser_key_prob(self, language): + c = Counter() + ngram_lan = self._ngram_list[language] + for key in self._ngram_lan.keys(): + c = c + Counter(self.ngram_list[key]) + freq_dist = nltk.FreqDist(c) + self._prob[language] = nltk.KneserNeyProbDist(freq_dist) + +if __name__ == '__main__': + if len(sys.argv) != 2: + print('Only argument acceptable is a path.') + else: + model = NGramTrain(sys.argv[1]) + model.train() diff --git a/swh/langdetect/utils.py b/swh/langdetect/utils.py deleted file mode 100644 index 70d84c1..0000000 --- a/swh/langdetect/utils.py +++ /dev/null @@ -1,36 +0,0 @@ -""" -Here regroup basic preprocessing methods -used in learning stage for different -approaches. - -""" - -import re - -_re_string = re.compile(r"""("(\\.|[^"\\])*"|'(\\.|[^'\\])*')""") -_re_number = re.compile(r'([\d]+)|([\d]+.[\d]+)[^A-Za-z]') - -def replace_string_and_number(text): - """ Replace strings and numbers in a file by special tokens - """ - str_replaced = re.sub(_re_string, '__str__', text) - str_num_replaced = re.sub(_re_number, '__num__', str_replaced) - return str_num_replaced - -def remove_comment(text): - # TODO: remove only inline comments and block comments - # TODO: maybe build a list of comment markers - pass - -def purify(text, lang): - # TODO: for some language like HTML, remove code other than principal language - pass - - - - - - - - - diff --git a/swh/langdetect/utils/common.py b/swh/langdetect/utils/common.py new file mode 100644 index 0000000..37648f9 --- /dev/null +++ b/swh/langdetect/utils/common.py @@ -0,0 +1,75 @@ +""" +Here regroup basic preprocessing methods +used in learning stage for different +approaches. 
+ +""" + +import re, os + +_re_string = re.compile(r"""("(\\.|[^"\\])*"|'(\\.|[^'\\])*')""") +_re_number = re.compile(r'([\d]+)|([\d]+.[\d]+)[^A-Za-z]') +_re_separator = re.compile(r'(\W)') +_not_start_with_point = lambda x: not x.startswith('.') + +def tokenizer(text): + ''' Splits text into tokens ''' + return [word for word in _re_separator.split(text) if word.strip(' \t')] + +def file_to_string(filename): + """ Read a file to a string. """ + with open(filename, 'r') as f: + data = f.read().replace('\n',' ').lower + return data + +def count_files(root_language): + all_folders = natural_sort(filter + (_not_start_with_point, + os.listdir(root_language))) + files = natural_sort(filter + (_not_start_with_point, + os.listdir(root_language + '/' + all_folders[-1]))) + (max,_) = os.path.splitext(files[-1]) + return int(max) + +def find_file(root_language, n): + '''Find the n-th file in language folder''' + if n > count_files(root_language): + return '' + else: + start = (n - 1) // 1000 * 1000 + 1 + end = start + 999 + root_count = root_language + '/' + str(start) + '-' + str(end) + files = natural_sort(filter + (_not_start_with_point, + os.listdir(root_count))) + return root_count + '/' + files[n - start] + +def replace_string_and_number(text): + """ Replace strings and numbers in a file by special tokens + """ + str_replaced = re.sub(_re_string, '__str__', text) + str_num_replaced = re.sub(_re_number, '__num__', str_replaced) + return str_num_replaced + +def natural_sort(l): + convert = lambda text: int(text) if text.isdigit() else text.lower() + alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] + return sorted(l, key = alphanum_key) + +def remove_comment(text): + # TODO: remove only inline comments and block comments + # TODO: maybe build a list of comment markers + pass + +def purify(text, lang): + # TODO: for some language like HTML, remove code other than principal language + pass + + + + + + + + diff --git a/swh/langdetect/utils/training.py b/swh/langdetect/utils/training.py new file mode 100644 index 0000000..07872ab --- /dev/null +++ b/swh/langdetect/utils/training.py @@ -0,0 +1,35 @@ +import os,random + +from utils.common import count_files, find_file + +def build_training_set(root): + root_code = root + '/../code_by_language' + root_training = root + '/../training_set' + for language in os.listdir(root_code): + if not language.startswith('.'): + root_language = root_code + '/' + language + root_training_language = root_training + '/' + language + build_language_training_set(count_files(root_language), + root_language, + root_training_language) + +def build_language_training_set(total, root_language, root_training_language): + # limit defines the size of training set + # upper defines the maximum size + + try: + os.mkdir(root_training_language) + except FileExistsError: + pass + + upper = 4000 + if total >= upper: + limit = upper // 2 + else: + limit = total // 2 + indices = random.sample(range(1, total + 1), limit) + files = map(lambda x : find_file(root_language, x), indices) + for src in files: + basename = os.path.basename(src) + des = root_training_language + '/' + basename + os.symlink(src, des)