diff --git a/swh/langdetect/ngram.py b/swh/langdetect/ngram.py
index 9db7303..d9b4a9a 100644
--- a/swh/langdetect/ngram.py
+++ b/swh/langdetect/ngram.py
@@ -1,63 +1,107 @@
 """
 Calculate frequencies for classes
 """
 
-import os,sys,nltk
+import os, sys, nltk
+from pickle import dump, load
 
 from collections import Counter
 from nltk.util import ngrams
 from utils.common import tokenizer, file_to_string
 from utils.training import build_training_set
+from nltk.probability import *
 
 
 class NGramTrain:
 
     def __init__(self, root):
-        self._ngram_list = {}
-        self._prob = {}
         self._root = root
 
     def train(self):
         root_training_set = self._root + '/../training_set'
         root_model = '../../dataset/model'
 
         try:
             if len(os.listdir(root_training_set)) == 0:
                 build_training_set(self._root)
+            try:
+                os.mkdir(root_model)
+            except FileExistsError:
+                pass
         except FileNotFoundError:
             os.mkdir(root_training_set)
             build_training_set(self._root)
 
         for language in os.listdir(root_training_set):
             if not language.startswith('.'):
-                _ngram_list[language] = {}
-                _prob[language] = None
                 root_training_set_language = root_training_set + '/' + language
 
                 for f in os.listdir(root_training_set_language):
+                    print(f)
                     if not f.startswith('.'):
-                        _ngrams_max(language, f)
+                        ngram_lan = NGramTrain.ngrams_max(root_training_set_language + '/' + f)
+                        prob_lan = self._kneser_key_prob(ngram_lan)
+                        #with open(root_model + '/' + language + '.model', 'wb') as f:
+                        #    dump(ngram_lan, f)
+                        with open(root_model + '/' + language + '.model', 'wb') as f:
+                            dump(prob_lan, f)
 
-    def _ngrams_max(self, language, filename, n=3):
-        ngram_lan = self._ngram_list[language]
+    def ngrams_max(filename, n=3):
+        ngram_lan = {}
         tokens = tokenizer(file_to_string(filename))
-        for i in range(1, n):
+        for i in range(n - 1, n):
             ngram_lan[i + 1] = ngram_lan.get(i + 1, []) + \
-                list(ngram(tokens, i + 1,
+                list(ngrams(tokens, i + 1,
                            pad_left = True,
                            pad_right = True,
                            left_pad_symbol = '$BOS$',
                            right_pad_symbol = '$EOS$'))
+        return ngram_lan
 
-    def _kneser_key_prob(self, language):
+    def _kneser_key_prob(self, ngram_lan):
         c = Counter()
-        ngram_lan = self._ngram_list[language]
-        for key in self._ngram_lan.keys():
-            c = c + Counter(self.ngram_list[key])
+        for key in ngram_lan.keys():
+            c = c + Counter(ngram_lan[key])
         freq_dist = nltk.FreqDist(c)
-        self._prob[language] = nltk.KneserNeyProbDist(freq_dist)
+        return nltk.KneserNeyProbDist(freq_dist)
+
+
+class NGramTest:
+
+    def test(filename):
+        NGramTest._guess_file_language(filename)
+
+    def _guess_file_language(filename):
+        root_model_folder = '../../dataset/model'
+        trigrams = NGramTrain.ngrams_max(filename)
+        result = []
+
+        for model in os.listdir(root_model_folder):
+            if not model.startswith('.'):
+                root_model = root_model_folder + '/' + model
+                (language, _) = os.path.splitext(model)
+                result.append((NGramTest._prob(root_model, trigrams[3]), language))
+        print(sorted(result))
+
+    def _prob(model, trigrams):
+        with open(model, 'rb') as f:
+            kneser_ney = load(f)
+        result = 0
+        for trigram in trigrams:
+            prob = kneser_ney.prob(trigram)
+            if prob != 0:
+                if result == 0:
+                    result = prob
+                else:
+                    result = result * prob
+        return result
+
 
 if __name__ == '__main__':
-    if len(sys.argv) != 2:
-        print('Only argument acceptable is a path.')
-    else:
-        model = NGramTrain(sys.argv[1])
+    if len(sys.argv) != 3:
+        print('Only acceptable arguments are an option and a path.')
+    elif sys.argv[1] == '--train':
+        model = NGramTrain(sys.argv[2])
         model.train()
+    elif sys.argv[1] == '--test':
+        NGramTest.test(sys.argv[2])
+    else:
+        print('Wrong arguments, please check your input.')
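For reference, here is a minimal sketch (not part of the patch) of the flow the new code appears to implement: pad and extract trigrams, smooth their counts with Kneser-Ney, pickle the distribution per language, then score an unknown file by multiplying the non-zero trigram probabilities. The token list and variable names below are made up for illustration; the NLTK calls are the ones already used in the diff.

import nltk
from collections import Counter
from nltk.util import ngrams

# Stand-in for tokenizer(file_to_string(path)); any token sequence works here.
tokens = ['def', 'f', '(', ')', ':', 'pass']
trigrams = list(ngrams(tokens, 3,
                       pad_left=True, pad_right=True,
                       left_pad_symbol='$BOS$', right_pad_symbol='$EOS$'))

# Training side: the distribution that NGramTrain pickles into <language>.model.
kneser_ney = nltk.KneserNeyProbDist(nltk.FreqDist(Counter(trigrams)))

# Testing side: NGramTest._prob multiplies the non-zero smoothed
# probabilities of the unknown file's trigrams under each language model.
score = 1.0
for trigram in trigrams:
    p = kneser_ney.prob(trigram)
    if p != 0:
        score *= p

With the new __main__ block, the module would be driven as `python ngram.py --train <dataset_root>` to build the per-language model pickles, and `python ngram.py --test <file>` to print the sorted (probability, language) pairs for a single file.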
diff --git a/swh/langdetect/utils/common.py b/swh/langdetect/utils/common.py
index 37648f9..52d387f 100644
--- a/swh/langdetect/utils/common.py
+++ b/swh/langdetect/utils/common.py
@@ -1,75 +1,75 @@
 """
 Here regroup basic preprocessing methods
 used in learning stage for different
 approaches.
 """
 
 import re, os
 
 _re_string = re.compile(r"""("(\\.|[^"\\])*"|'(\\.|[^'\\])*')""")
 _re_number = re.compile(r'([\d]+)|([\d]+.[\d]+)[^A-Za-z]')
 _re_separator = re.compile(r'(\W)')
 
 _not_start_with_point = lambda x: not x.startswith('.')
 
 
 def tokenizer(text):
     ''' Splits text into tokens '''
     return [word for word in _re_separator.split(text) if word.strip(' \t')]
 
 
 def file_to_string(filename):
     """ Read a file to a string. """
-    with open(filename, 'r') as f:
-        data = f.read().replace('\n',' ').lower
-    return data
+    with open(filename, 'r', errors='ignore') as f:
+        data = f.read().replace('\n',' ').lower()
+    return replace_string_and_number(data)
 
 
 def count_files(root_language):
     all_folders = natural_sort(filter
                                (_not_start_with_point,
                                 os.listdir(root_language)))
     files = natural_sort(filter
                          (_not_start_with_point,
                           os.listdir(root_language + '/' + all_folders[-1])))
     (max,_) = os.path.splitext(files[-1])
     return int(max)
 
 
 def find_file(root_language, n):
     '''Find the n-th file in language folder'''
     if n > count_files(root_language):
         return ''
     else:
         start = (n - 1) // 1000 * 1000 + 1
         end = start + 999
         root_count = root_language + '/' + str(start) + '-' + str(end)
         files = natural_sort(filter
                              (_not_start_with_point,
                               os.listdir(root_count)))
         return root_count + '/' + files[n - start]
 
 
 def replace_string_and_number(text):
     """ Replace strings and numbers in a file by special tokens """
     str_replaced = re.sub(_re_string, '__str__', text)
     str_num_replaced = re.sub(_re_number, '__num__', str_replaced)
     return str_num_replaced
 
 
 def natural_sort(l):
     convert = lambda text: int(text) if text.isdigit() else text.lower()
     alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ]
     return sorted(l, key = alphanum_key)
 
 
 def remove_comment(text):
     # TODO: remove only inline comments and block comments
     # TODO: maybe build a list of comment markers
     pass
 
 
 def purify(text, lang):
     # TODO: for some language like HTML, remove code other than principal language
     pass
diff --git a/swh/langdetect/utils/training.py b/swh/langdetect/utils/training.py
index 07872ab..a9d42df 100644
--- a/swh/langdetect/utils/training.py
+++ b/swh/langdetect/utils/training.py
@@ -1,35 +1,35 @@
 import os,random
 
 from utils.common import count_files, find_file
 
 def build_training_set(root):
     root_code = root + '/../code_by_language'
     root_training = root + '/../training_set'
 
     for language in os.listdir(root_code):
         if not language.startswith('.'):
             root_language = root_code + '/' + language
             root_training_language = root_training + '/' + language
             build_language_training_set(count_files(root_language),
                                         root_language,
                                         root_training_language)
 
 def build_language_training_set(total, root_language, root_training_language):
     # limit defines the size of training set
     # upper defines the maximum size
     try:
         os.mkdir(root_training_language)
     except FileExistsError:
         pass
-    upper = 4000
+    upper = 1000
     if total >= upper:
         limit = upper // 2
     else:
         limit = total // 2
     indices = random.sample(range(1, total + 1), limit)
     files = map(lambda x : find_file(root_language, x), indices)
 
     for src in files:
         basename = os.path.basename(src)
         des = root_training_language + '/' + basename
         os.symlink(src, des)
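One assumption worth spelling out, since both count_files() and find_file() depend on it and build_language_training_set() only changes the sample cap here: the per-language corpus is expected to be bucketed into folders of 1000 files named '<start>-<end>'. A small illustrative calculation, with an arbitrary example index:

# Bucket arithmetic used by find_file(); n = 2742 is only an example.
n = 2742
start = (n - 1) // 1000 * 1000 + 1   # 2001
end = start + 999                    # 3000
# find_file() then looks inside <root_language>/2001-3000/ for the n-th file,
# and build_language_training_set() symlinks a random half of the corpus,
# now capped at upper // 2 = 500 files per language (upper lowered to 1000).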