diff --git a/scripts/comparison.pdf b/scripts/comparison.pdf new file mode 100644 index 0000000..82e40f1 Binary files /dev/null and b/scripts/comparison.pdf differ diff --git a/scripts/draw_accuracy.py b/scripts/draw_accuracy.py index 614c34f..979645f 100644 --- a/scripts/draw_accuracy.py +++ b/scripts/draw_accuracy.py @@ -1,77 +1,138 @@ #!/bin/bash/python3 import sys from pickle import load from collections import namedtuple, Counter try: import numpy as np import matplotlib.pyplot as plt from matplotlib.ticker import MaxNLocator except ImportError: raise ImportError('Please install matplotlib') -def main(path): +def heatmap(path): with open(path, 'rb') as f: data = load(f) mat = process(data) labels = sorted(data) fig, ax = plt.subplots() fig.set_size_inches(100,100) heatmap = ax.matshow(mat, cmap='Blues') fig = plt.gcf() ax.set_frame_on(False) ax.set_yticks(np.arange(len(labels)), minor=False) ax.set_xticks(np.arange(len(labels)), minor=False) - + + ax.set_xlabel('Classification of test files') + ax.set_ylabel('Ground truth class of test files') ax.set_xticklabels(labels, minor=False) ax.set_yticklabels(labels, minor=False) ax.xaxis.tick_top() + ax.xaxis.set_label_position('top') plt.xticks(rotation=90) ax.grid(False) ''' for i in np.arange(len(mat)): for j in np.arange(len(mat[i])): ax.text(i, j, "%.1f" % (mat[i][j] * 100), color='white') ''' ax = plt.gca() for t in ax.xaxis.get_major_ticks(): t.tick1On = False t.tick2On = False for t in ax.yaxis.get_major_ticks(): t.tick1On = False t.tick2On = False fig.savefig("results.pdf", bbox_inches='tight') def process(data): ''' ''' ldata = sorted(data) length = len(ldata) out = [[0 for x in range(length)] for y in range(length)] for lang in ldata: index_lan = ldata.index(lang) ok = data[lang][0] - if data[lang][1] >= 1000 : + if data[lang][1] > 1000 : test_size = 1000 else: test_size = data[lang][1] result = [x[1] for x in data[lang][3]] counter = dict(Counter(result)) for res_lan in counter.keys(): index_res = ldata.index(res_lan) - out[index_lan][index_res] = counter[res_lan] / test_size + out[index_lan][index_res] = counter.get(res_lan, 0) / test_size return out +def get_accuracy(data): + ldata = sorted(data) + out = {} + for lang in ldata: + ok = data[lang][0] + if data[lang][1] > 1000: + test_size = 1000 + else: + test_size = data[lang][1] + result = [x[1] for x in data[lang][3]] + counter = dict(Counter(result)) + out[lang] = counter.get(lang, 0) / test_size + return out + +def compare(baseline, target): + with open(baseline, 'rb') as f: + data = load(f) + dict_base = get_accuracy(data) + + with open(target, 'rb') as f: + data = load(f) + dict_targ = get_accuracy(data) + + all_lang = sorted(list(set().union(dict_base.keys(),dict_targ.keys())))[::-1] + n = len(all_lang) + acc_base = [dict_base.get(lang, 0) for lang in all_lang] + acc_targ = [dict_targ.get(lang, 0) for lang in all_lang] + + fig, ax = plt.subplots() + fig.set_size_inches(10,200) + ind = np.arange(n) + width = 0.35 + opacity = 0.4 + rects1 = ax.barh(ind + width, acc_base, width, alpha=opacity, color='b', label='N-grams with frequency distance') + rects2 = ax.barh(ind, acc_targ, width, alpha=opacity, color='r', label='N-grams with probability') + + ax.set_xlabel('Accuracy / %') + ax.set_yticks(ind + width / 2) + ax.set_yticklabels(all_lang) + vals = ax.get_xticks() + ax.set_xticklabels(['{:3.0f}%'.format(x * 100) for x in vals]) + ax.xaxis.tick_top() + ax.legend() + + def autolabel(rects): + for rect in rects: + width = rect.get_width() + ax.text(width + 0.01, rect.get_y() + 
rect.get_height() / 2., '{0:.1f}%'.format(width * 100), ha='left', va='center') + + autolabel(rects1) + autolabel(rects2) + plt.ylim([-1,n+1]) + + fig.tight_layout() + fig.savefig("comparison.pdf", bbox_inches='tight') + if __name__ == '__main__': - if len(sys.argv) != 2: - print('Only argument acceptable is a path.') + if len(sys.argv) == 2: + heatmap(sys.argv[1]) + elif len(sys.argv) == 3: + compare(sys.argv[1],sys.argv[2]) else: - main(sys.argv[1]) + print('Please check arguments.') diff --git a/scripts/result_ngrams_frequency_distance.pdf b/scripts/results_ngrams_frequency_distance.pdf similarity index 50% rename from scripts/result_ngrams_frequency_distance.pdf rename to scripts/results_ngrams_frequency_distance.pdf index be05137..8d0908f 100644 Binary files a/scripts/result_ngrams_frequency_distance.pdf and b/scripts/results_ngrams_frequency_distance.pdf differ diff --git a/scripts/results_ngrams_prob.pdf b/scripts/results_ngrams_prob.pdf new file mode 100644 index 0000000..b56ce47 Binary files /dev/null and b/scripts/results_ngrams_prob.pdf differ diff --git a/swh/langdetect/cnn.py b/swh/langdetect/cnn.py new file mode 100644 index 0000000..81788c6 --- /dev/null +++ b/swh/langdetect/cnn.py @@ -0,0 +1,247 @@ + +import os, sys, subprocess, time +import kenlm + +from itertools import islice +from pickle import dump, load +from collections import Counter +from numpy import array +from utils.common import tokenizer, file_to_string, find_file, count_files +from keras.utils.vis_utils import plot_model +from keras.preprocessing.sequence import pad_sequences +from keras.models import Model +from keras.layers import Input, Dense, Flatten, Dropout, Embedding, ThresholdedReLU +from keras.layers.convolutional import Convolution1D, MaxPooling1D +from keras.layers.merge import concatenate + + +class CNN: + + def __init__(self, root): + # Root of dataset + self._root = root + + # Root of training set + self._root_training_set = os.path.join(self._root, '..', 'training_set') + + # Root of model folder + self._root_model = os.path.join(self._root, '..', 'model_cnn') + + # Root of arranged dataset + self._root_language_dataset = os.path.join(self._root, '..', 'code_by_language') + + # Path of result + self._path_result = os.path.join(self._root, '..', 'result_cnn') + + def train(self): + try: + if len(os.listdir(self._root_training_set)) == 0: + build_training_set(self._root) + try: + os.mkdir(self._root_model) + except FileExistsError: + pass + except FileNotFoundError: + os.mkdir(self._root_training_set) + build_training_set(self._root) + + languages = [x for x in os.listdir(self._root_training_set) if not x.startswith('.')] + try: + f = open(os.path.join(self._root, '..', 'model_cnn', 'texts+labels'), 'rb') + train_file_with_label = load(f) + except FileNotFoundError: + train_file_with_label = self._train_file_with_label(languages) + with open(os.path.join(self._root, '..', 'model_cnn', 'texts+labels'), 'wb') as f: + dump(train_file_with_label, f) + + length = 1000 + vocab_size = 256 + total_class = len(languages) + + model = self._get_model(length, vocab_size, total_class) + model.fit_generator(self._generator(length, total_class), steps_per_epoch=len(train_file_with_label), epochs=10) + model.save(os.path.join(self._root, '..', 'model_cnn', 'model.h5')) + + def _generator(self, length, total_class): + while True: + with open(os.path.join(self._root, '..', 'model_cnn', 'texts+labels'), 'rb') as f: + train_file_with_label = load(f) + for pair in train_file_with_label: + path, label = pair + tokens 
= [x + 1 for x in tokenizer(file_to_string(path), 'letter')] + tokens = pad_sequences([tokens], maxlen=length, padding='post') + truth = array([[0 for _ in range(total_class)]]) + truth[0][label] = 1 + yield ([tokens], truth) + + def _train_file_with_label(self, languages): + l = [] + + for language in languages: + root_training_set_language = os.path.join(self._root_training_set, language) + root_stat_language = os.path.join(self._root_model, language) + index_lang = languages.index(language) + if os.path.isfile(root_stat_language): + continue + print(language) + for f in [x for x in os.listdir(root_training_set_language) if not x.startswith('.')]: + filename = os.path.join(root_training_set_language, f) + l.append((filename, index_lang)) + + return l + + + def _get_model(self, length, vocab_size, total_class): + + input_size = length + alphabet_size = vocab_size + embedding_size = 128 + conv_layers = [(256,7,3), (256,7,3), (256,3,-1), (256,3,-1), (256,3,-1), (256,3,3)] + threshold = 1e-6 + fully_connected_layers = [1024, 1024] + dropout_p = 0.5 + optimizer = 'adam' + loss = 'categorical_crossentropy' + num_of_classes = total_class + + # Input layer + inputs = Input(shape=(input_size,), name='sent_input', dtype='int64') + # Embedding layers + x = Embedding(alphabet_size + 1, embedding_size, input_length=input_size)(inputs) + # Convolution layers + for cl in conv_layers: + x = Convolution1D(cl[0], cl[1])(x) + x = ThresholdedReLU(threshold)(x) + if cl[2] != -1: + x = MaxPooling1D(cl[2])(x) + x = Flatten()(x) + # Fully connected layers + for fl in fully_connected_layers: + x = Dense(fl)(x) + x = ThresholdedReLU(threshold)(x) + x = Dropout(dropout_p)(x) + # Output layer + predictions = Dense(num_of_classes, activation='softmax')(x) + # Build and compile model + model = Model(inputs=inputs, outputs=predictions) + model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy']) + + print(model.summary()) + + return model + + def _max_len(self, texts): + return max([len(text) for text in texts]) + + def _vocabulary_size(self, texts): + vocabulary = dict(Counter([token for text in texts for token in text])) + return len(vocabulary.keys()) + + def test(self): + try: + r = open(self._path_result, 'rb') + test_result = load(r) + r.close() + except FileNotFoundError: + test_result = {} + models = self._load_models() + + for language in [x for x in os.listdir(self._root_model) if not x.startswith('.') and x not in test_result.keys()]: + test_result[language] = self.test_class(models, language) + with open(self._path_result, 'wb') as f: + dump(test_result, f) + + def _load_models(self): + models = {} + + for model in [model + for model in os.listdir(self._root_model) + if not model.startswith('.')]: + root_model = os.path.join(self._root_model, model) + models[model] = kenlm.LanguageModel(root_model) + return models + + def _get_test_set(self, language): + root_training_language = os.path.join(self._root_training_set, language) + root_language = os.path.join(self._root_language_dataset, language) + total = count_files(root_language) + training_set = [int(os.path.splitext(x)[0]) for x in os.listdir(root_training_language) if not x.startswith('.')] + it = (find_file(root_language, x) for x in range(1, total + 1) if x not in training_set and os.path.getsize(find_file(root_language, x)) <= 1048576) + test_set = list(islice(it, 1000)) + if len(test_set) == 0: + it = (find_file(root_language, x) for x in range(1, total + 1) if x not in training_set) + test_set = list(islice(it, 1000)) + return test_set + 
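+    # _get_test_set above keeps at most 1000 files per language that were not used for
+    # training and are no larger than 1 MiB; the size filter is dropped if it would
+    # leave no test files at all.
+    # _count_size below sums os.path.getsize over those files so that speed_benchmark
+    # can report seconds per kB.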
+ def _count_size(self, files): + size = 0 + for f in files: + size += os.path.getsize(f) + return size + + def test_class(self, models, language): + test_set = self._get_test_set(language) + + ok = 0 + results = [] + count = 0 + length = len(test_set) + for test in test_set: + result = self._guess_file_language(models, test) + count += 1 + print('[{0:4d}/{1:4d}] {2}:{3} '.format(count, length, result[0][1], result[0][0]),end='\r') + results.append(result[0]) + if result[0][1] == language: + ok += 1 + + total_test = len(test_set) + accuracy = ok / len(test_set) + print('Tests for {} '.format(language)) + print('Total test files : {}'.format(total_test)) + print('Correctly classified files : {}'.format(ok)) + print('Accuracy : {}%'.format(accuracy * 100)) + return (ok, len(test_set), accuracy, results) + + def speed_benchmark(self): + language = [x for x in os.listdir(self._root_model) if not x.startswith('.')][10] + models = self._load_models() + + test_set = self._get_test_set(language) + total_size = self._count_size(test_set) + print('{} kB in total'.format(total_size / 1024)) + + t_start = time.perf_counter() + self.test_class(models, language) + t_end = time.perf_counter() + + print('{} seconds.'.format(t_end - t_start)) + print('{} seconds per kB'.format(((t_end - t_start) / total_size) * 1024)) + + def _guess_file_language(self, models, filename): + tokens = tokenizer(file_to_string(filename), 'letter') + text = ' '.join(chr(token) for token in tokens) + + result = [] + + for model_key in models.keys(): + root_model = os.path.join(self._root_model, model_key) + model = models[model_key] + score = model.score(text) + result.append((score, model_key)) + return sorted(result, reverse=True) + +if __name__ == '__main__': + if len(sys.argv) == 3 and sys.argv[1] == '--train': + n = CNN(sys.argv[2]) + n.train() + elif len(sys.argv) == 3 and sys.argv[1] == '--test': + n = NGramProb(sys.argv[2]) + n.test() + elif len(sys.argv) == 3 and sys.argv[1] == '--benchmark': + n = NGramProb(sys.argv[2]) + n.speed_benchmark() + elif len(sys.argv) == 4 and sys.argv[1] == '--test': + n = NGramProb(sys.argv[2]) + n.test_class(n.load_models(), sys.argv[3]) + else: + print('Wrong arguments, please check your input.') diff --git a/swh/langdetect/ngramdist.py b/swh/langdetect/naivebayesian.py similarity index 52% copy from swh/langdetect/ngramdist.py copy to swh/langdetect/naivebayesian.py index 46ed397..495dd86 100644 --- a/swh/langdetect/ngramdist.py +++ b/swh/langdetect/naivebayesian.py @@ -1,243 +1,221 @@ """ -Baseline approach +Naive Bayesian """ import os, sys, operator, nltk, random, time +import numpy as np +from itertools import islice from pickle import dump, load -from nltk.util import ngrams from utils.common import tokenizer, file_to_string, find_file, count_files from utils.training import build_training_set +from nltk.util import ngrams +from collections import Counter +from sklearn.naive_bayes import MultinomialNB +from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer +from sklearn.externals import joblib -class NGramDist: +class NaiveBayesian: def __init__(self, root): # Root of dataset self._root = root # Root of training set self._root_training_set = os.path.join(self._root, '..', 'training_set') # Root of model folder - self._root_model = os.path.join(self._root, '..', 'model_ngram_dist') + self._root_model = os.path.join(self._root, '..', 'model_bayesian') # Root of arranged dataset self._root_language_dataset = os.path.join(self._root, '..', 'code_by_language') # 
Path of result - self._path_result = os.path.join(self._root, '..', 'result') - + self._path_result = os.path.join(self._root, '..', 'result_bayesian') + + self._languages = [x for x in os.listdir(self._root_training_set) if not x.startswith('.')] def train(self): ''' train () generates and stores counted n-grams in '_root_model' folder ''' try: if len(os.listdir(self._root_training_set)) == 0: build_training_set(self._root) try: os.mkdir(self._root_model) except FileExistsError: pass except FileNotFoundError: os.mkdir(self._root_training_set) build_training_set(self._root) ''' Calculate frequencies of generated n-grams then store them into a sorted list of (ngram, count) ''' - for language in os.listdir(self._root_training_set): - if not language.startswith('.'): - root_training_set_language = os.path.join(self._root_training_set, language) - root_stat_language = os.path.join(self._root_model, language) - if os.path.isfile(root_stat_language): - continue - statistics = {} - for f in os.listdir(root_training_set_language): - print(f) - if not f.startswith('.'): - filename = os.path.join(root_training_set_language, f) - tokens = tokenizer(file_to_string(filename), 'letter') - generated_ngrams = self._generate_ngrams(tokens, 3) - self._count_ngrams(statistics, generated_ngrams) - with open(root_stat_language, 'wb') as f: - dump(self._sort_by_value(statistics), f) - def _generate_ngrams(self, tokens, n): - ''' - :param tokens: generated tokens from a string. - :param n: maximum n of n-grams - :type tokens: list - :type n: int - :return: generated 1-grams, ... , n-grams - :rtype: list - ''' - generated_ngrams = [] - - for i in range(1, n+1): - igrams = ngrams(tokens, i, - pad_left=True, - pad_right=True, - left_pad_symbol = '$BOF$', - right_pad_symbol = '$EOF$') - for igram in igrams: - generated_ngrams.append(''.join(igram)) - - return generated_ngrams + clf = MultinomialNB() + cv = HashingVectorizer(analyzer='char', ngram_range=(1, 3), n_features=2**17, alternate_sign=False) + + indices = list(range(len(self._languages))) + + for language in self._languages: + root_training_set_language = os.path.join(self._root_training_set, language) + root_stat_language = os.path.join(self._root_model, 'classifier') + index_lang = self._languages.index(language) + texts = [] + for f in os.listdir(root_training_set_language): + if not f.startswith('.'): + print(f) + filename = os.path.join(root_training_set_language, f) + tokens = tokenizer(file_to_string(filename), 'letter') + text = ''.join([chr(token) for token in tokens]) + texts.append(text) + counts = cv.fit_transform(texts) + tf = TfidfTransformer(use_idf=False).fit(counts) + normalised = tf.transform(counts) + clf.partial_fit(normalised, np.array([index_lang for _ in texts]), indices) - def _count_ngrams(self, statistics, ngrams): - ''' - :param statistics: shared dictionary for statistics - :param ngrams: n-grams to be accumulated into statistics - ''' - for ngram in ngrams: - statistics[ngram] = statistics.get(ngram, 0) + 1 + with open(root_stat_language + '.clf', 'wb') as f: + joblib.dump(clf, f) + with open(root_stat_language + '.hv', 'wb') as f: + joblib.dump(cv, f) + def test(self): - test_result = {} - models = self._load_models() + try: + r = open(self._path_result, 'rb') + test_result = load(r) + r.close() + except FileNotFoundError: + test_result = {} + + with open(os.path.join(self._root_model, 'classifier'), 'rb') as f: + clf, cv = load(f) - for language in [x for x in os.listdir(self._root_language_dataset) if not x.startswith('.')]: - 
test_result[language] = self.test_class(models, language) - with open(self._path_result, 'wb') as f: - dump(test_result, f) + for language in [x for x in os.listdir(self._root_training_set) if not x.startswith('.') and x not in test_result.keys()]: + test_result[language] = self.test_class((clf, cv), language) + with open(self._path_result, 'wb') as f: + dump(test_result, f) def speed_benchmark(self): - language = random.choice([x for x in os.listdir(self._root_language_dataset) if not x.startswith('.')]) + language = [x for x in os.listdir(self._root_training_set) if not x.startswith('.')][10] models = self._load_models() test_set = self._get_test_set(language) total_size = self._count_size(test_set) print('{} kB in total'.format(total_size / 1024)) t_start = time.perf_counter() self.test_class(models, language) t_end = time.perf_counter() print('{} seconds.'.format(t_end - t_start)) print('{} seconds per kB'.format(((t_end - t_start) / total_size) * 1024)) - def _load_models(self): - models = {} - - for model in [model - for model in os.listdir(self._root_model) - if not model.startswith('.')]: - root_model = os.path.join(self._root_model, model) - with open(root_model, 'rb') as sorted_file: - models[model] = self._list_to_dict(load(sorted_file)) - - return models - - def _list_to_dict(self, model): - model_ngrams = [x[0] for x in model] - model_dict = {} - index = 0 - for ngram in model_ngrams: - index += 1 - model_dict[ngram] = index - return model_dict - def _get_test_set(self, language): root_training_language = os.path.join(self._root_training_set, language) root_language = os.path.join(self._root_language_dataset, language) total = count_files(root_language) training_set = [int(os.path.splitext(x)[0]) for x in os.listdir(root_training_language) if not x.startswith('.')] - test_set = [find_file(root_language, x) for x in range(1, total + 1) if x not in training_set][:1000] + it = (find_file(root_language, x) for x in range(1, total + 1) if x not in training_set and os.path.getsize(find_file(root_language, x)) <= 1048576) + test_set = list(islice(it, 1000)) + if len(test_set) == 0: + it = (find_file(root_language, x) for x in range(1, total + 1) if x not in training_set) + test_set = list(islice(it, 1000)) return test_set def _count_size(self, files): size = 0 for f in files: size += os.path.getsize(f) return size - def test_class(self, models, language): + def test_class(self, clf, language): test_set = self._get_test_set(language) + index_lang = self._languages.index(language) ok = 0 results = [] + count = 0 + length = len(test_set) for test in test_set: - result = self._guess_file_language(models, test) - print('{} '.format(result[0]),end='\r') + result = self._guess_file_language(clf, test) + count += 1 + print('[{0:4d}/{1:4d}] {2}:{3} '.format(count, length, result[0][1], result[0][0]),end='\r') results.append(result[0]) if result[0][1] == language: ok += 1 total_test = len(test_set) accuracy = ok / len(test_set) print('Tests for {} '.format(language)) print('Total test files : {}'.format(total_test)) print('Correctly classified files : {}'.format(ok)) print('Accuracy : {}%'.format(accuracy * 100)) return (ok, len(test_set), accuracy, results) - def test_single(self, models, filename): - self._guess_file_language(models, filename) - - def _guess_file_language(self, models, filename): + def test_single(self, filename): + self._guess_file_language(clf, filename) + def _guess_file_language(self, cc, filename): + clf = cc[0] + cv = cc[1] tokens = tokenizer(file_to_string(filename), 
'letter') - generated_ngrams = self._generate_ngrams(tokens, 3) - - statistics = {} - self._count_ngrams(statistics, generated_ngrams) - - test_profile = self._list_to_dict(self._sort_by_value(statistics)) + text = ''.join([chr(token) for token in tokens]) + counts = cv.fit_transform([text]) + tf = TfidfTransformer(use_idf=False).fit(counts) + normalised = tf.transform(counts) - result = [] + result = clf.predict_log_proba(normalised) - for model in models.keys(): - root_model = os.path.join(self._root_model, model) - model_profile = models[model] - distance = self._distance(model_profile, test_profile) - result.append((distance, model)) - - return sorted(result) + result = [(val, self._languages[idx]) for idx, val in enumerate(result[0])] + + return sorted(result, reverse=True) def _sort_by_value(self, statistics): statistics_sorted = sorted(statistics.items(), key = operator.itemgetter(1), reverse = True)[:500] return statistics_sorted def _distance(self, model_profile, test_profile): distance = 0 maximum = len(test_profile) for test_ngram in test_profile.keys(): test_rank = test_profile.get(test_ngram) model_rank = model_profile.get(test_ngram, maximum) d = abs(test_rank - model_rank) distance += d return distance ''' def _prob(model, trigrams): print('Checking {} model ...'.format(model)) with open(model, 'rb') as f: kneser_ney = load(f) result = 1 for trigram in trigrams: prob = kneser_ney.prob(trigram) result = result * prob return result ''' if __name__ == '__main__': if len(sys.argv) == 3 and sys.argv[1] == '--train': - n = NGramDist(sys.argv[2]) + n = NaiveBayesian(sys.argv[2]) n.train() elif len(sys.argv) == 3 and sys.argv[1] == '--test': - n = NGramDist(sys.argv[2]) + n = NaiveBayesian(sys.argv[2]) n.test() elif len(sys.argv) == 3 and sys.argv[1] == '--benchmark': - n = NGramDist(sys.argv[2]) + n = NaiveBayesian(sys.argv[2]) n.speed_benchmark() elif len(sys.argv) == 4 and sys.argv[1] == '--test': - n = NGramDist(sys.argv[2]) + n = NaiveBayesian(sys.argv[2]) n.test_class(n.load_models(), sys.argv[3]) else: print('Wrong arguments, please check your input.') diff --git a/swh/langdetect/ngramdist.py b/swh/langdetect/ngramdist.py index 46ed397..06449dd 100644 --- a/swh/langdetect/ngramdist.py +++ b/swh/langdetect/ngramdist.py @@ -1,243 +1,256 @@ """ Baseline approach """ import os, sys, operator, nltk, random, time +from itertools import islice from pickle import dump, load from nltk.util import ngrams from utils.common import tokenizer, file_to_string, find_file, count_files from utils.training import build_training_set class NGramDist: def __init__(self, root): # Root of dataset self._root = root # Root of training set self._root_training_set = os.path.join(self._root, '..', 'training_set') # Root of model folder self._root_model = os.path.join(self._root, '..', 'model_ngram_dist') # Root of arranged dataset self._root_language_dataset = os.path.join(self._root, '..', 'code_by_language') # Path of result - self._path_result = os.path.join(self._root, '..', 'result') + self._path_result = os.path.join(self._root, '..', 'result_freq') def train(self): ''' train () generates and stores counted n-grams in '_root_model' folder ''' try: if len(os.listdir(self._root_training_set)) == 0: build_training_set(self._root) try: os.mkdir(self._root_model) except FileExistsError: pass except FileNotFoundError: os.mkdir(self._root_training_set) build_training_set(self._root) ''' Calculate frequencies of generated n-grams then store them into a sorted list of (ngram, count) ''' for language in 
os.listdir(self._root_training_set): if not language.startswith('.'): root_training_set_language = os.path.join(self._root_training_set, language) root_stat_language = os.path.join(self._root_model, language) if os.path.isfile(root_stat_language): continue statistics = {} for f in os.listdir(root_training_set_language): print(f) if not f.startswith('.'): filename = os.path.join(root_training_set_language, f) tokens = tokenizer(file_to_string(filename), 'letter') - generated_ngrams = self._generate_ngrams(tokens, 3) + generated_ngrams = self._generate_ngrams([chr(token) for token in tokens], 3) self._count_ngrams(statistics, generated_ngrams) with open(root_stat_language, 'wb') as f: dump(self._sort_by_value(statistics), f) def _generate_ngrams(self, tokens, n): ''' :param tokens: generated tokens from a string. :param n: maximum n of n-grams :type tokens: list :type n: int :return: generated 1-grams, ... , n-grams :rtype: list ''' generated_ngrams = [] for i in range(1, n+1): igrams = ngrams(tokens, i, pad_left=True, pad_right=True, left_pad_symbol = '$BOF$', right_pad_symbol = '$EOF$') for igram in igrams: generated_ngrams.append(''.join(igram)) return generated_ngrams def _count_ngrams(self, statistics, ngrams): ''' :param statistics: shared dictionary for statistics :param ngrams: n-grams to be accumulated into statistics ''' for ngram in ngrams: statistics[ngram] = statistics.get(ngram, 0) + 1 def test(self): - test_result = {} + try: + r = open(self._path_result, 'rb') + test_result = load(r) + r.close() + except FileNotFoundError: + test_result = {} models = self._load_models() - for language in [x for x in os.listdir(self._root_language_dataset) if not x.startswith('.')]: + for language in [x for x in os.listdir(self._root_model) if not x.startswith('.') and x not in test_result.keys()]: test_result[language] = self.test_class(models, language) - with open(self._path_result, 'wb') as f: - dump(test_result, f) + with open(self._path_result, 'wb') as f: + dump(test_result, f) def speed_benchmark(self): - language = random.choice([x for x in os.listdir(self._root_language_dataset) if not x.startswith('.')]) + language = [x for x in os.listdir(self._root_model) if not x.startswith('.')][10] models = self._load_models() test_set = self._get_test_set(language) total_size = self._count_size(test_set) print('{} kB in total'.format(total_size / 1024)) t_start = time.perf_counter() self.test_class(models, language) t_end = time.perf_counter() print('{} seconds.'.format(t_end - t_start)) print('{} seconds per kB'.format(((t_end - t_start) / total_size) * 1024)) def _load_models(self): models = {} for model in [model for model in os.listdir(self._root_model) if not model.startswith('.')]: root_model = os.path.join(self._root_model, model) with open(root_model, 'rb') as sorted_file: models[model] = self._list_to_dict(load(sorted_file)) return models def _list_to_dict(self, model): model_ngrams = [x[0] for x in model] model_dict = {} index = 0 for ngram in model_ngrams: index += 1 model_dict[ngram] = index return model_dict def _get_test_set(self, language): root_training_language = os.path.join(self._root_training_set, language) root_language = os.path.join(self._root_language_dataset, language) total = count_files(root_language) training_set = [int(os.path.splitext(x)[0]) for x in os.listdir(root_training_language) if not x.startswith('.')] - test_set = [find_file(root_language, x) for x in range(1, total + 1) if x not in training_set][:1000] + it = (find_file(root_language, x) for x in 
range(1, total + 1) if x not in training_set and os.path.getsize(find_file(root_language, x)) <= 1048576) + test_set = list(islice(it, 1000)) + if len(test_set) == 0: + it = (find_file(root_language, x) for x in range(1, total + 1) if x not in training_set) + test_set = list(islice(it, 1000)) return test_set def _count_size(self, files): size = 0 for f in files: size += os.path.getsize(f) return size def test_class(self, models, language): test_set = self._get_test_set(language) ok = 0 results = [] + count = 0 + length = len(test_set) for test in test_set: result = self._guess_file_language(models, test) - print('{} '.format(result[0]),end='\r') + count += 1 + print('[{0:4d}/{1:4d}] {2}:{3} '.format(count, length, result[0][1], result[0][0]),end='\r') results.append(result[0]) if result[0][1] == language: ok += 1 total_test = len(test_set) accuracy = ok / len(test_set) print('Tests for {} '.format(language)) print('Total test files : {}'.format(total_test)) print('Correctly classified files : {}'.format(ok)) print('Accuracy : {}%'.format(accuracy * 100)) return (ok, len(test_set), accuracy, results) def test_single(self, models, filename): self._guess_file_language(models, filename) def _guess_file_language(self, models, filename): tokens = tokenizer(file_to_string(filename), 'letter') - generated_ngrams = self._generate_ngrams(tokens, 3) + generated_ngrams = self._generate_ngrams([chr(token) for token in tokens], 3) statistics = {} self._count_ngrams(statistics, generated_ngrams) test_profile = self._list_to_dict(self._sort_by_value(statistics)) result = [] for model in models.keys(): root_model = os.path.join(self._root_model, model) model_profile = models[model] distance = self._distance(model_profile, test_profile) result.append((distance, model)) return sorted(result) def _sort_by_value(self, statistics): statistics_sorted = sorted(statistics.items(), key = operator.itemgetter(1), reverse = True)[:500] return statistics_sorted def _distance(self, model_profile, test_profile): distance = 0 maximum = len(test_profile) for test_ngram in test_profile.keys(): test_rank = test_profile.get(test_ngram) model_rank = model_profile.get(test_ngram, maximum) d = abs(test_rank - model_rank) distance += d return distance ''' def _prob(model, trigrams): print('Checking {} model ...'.format(model)) with open(model, 'rb') as f: kneser_ney = load(f) result = 1 for trigram in trigrams: prob = kneser_ney.prob(trigram) result = result * prob return result ''' if __name__ == '__main__': if len(sys.argv) == 3 and sys.argv[1] == '--train': n = NGramDist(sys.argv[2]) n.train() elif len(sys.argv) == 3 and sys.argv[1] == '--test': n = NGramDist(sys.argv[2]) n.test() elif len(sys.argv) == 3 and sys.argv[1] == '--benchmark': n = NGramDist(sys.argv[2]) n.speed_benchmark() elif len(sys.argv) == 4 and sys.argv[1] == '--test': n = NGramDist(sys.argv[2]) n.test_class(n.load_models(), sys.argv[3]) else: print('Wrong arguments, please check your input.') diff --git a/swh/langdetect/ngramprob.py b/swh/langdetect/ngramprob.py index ba8b5a3..7cf5e47 100644 --- a/swh/langdetect/ngramprob.py +++ b/swh/langdetect/ngramprob.py @@ -1,137 +1,169 @@ -import os, sys, subprocess +import os, sys, subprocess, time import kenlm +from itertools import islice from pickle import dump, load from utils.common import tokenizer, file_to_string, find_file, count_files class NGramProb: def __init__(self, root): # Root of dataset self._root = root # Root of training set self._root_training_set = os.path.join(self._root, '..', 'training_set') # 
Root of model folder self._root_model = os.path.join(self._root, '..', 'model_ngram_prob') # Root of arranged dataset self._root_language_dataset = os.path.join(self._root, '..', 'code_by_language') # Path of result self._path_result = os.path.join(self._root, '..', 'result_prob') def train(self): try: if len(os.listdir(self._root_training_set)) == 0: build_training_set(self._root) try: os.mkdir(self._root_model) except FileExistsError: pass except FileNotFoundError: os.mkdir(self._root_training_set) build_training_set(self._root) for language in [x for x in os.listdir(self._root_training_set) if not x.startswith('.')]: root_training_set_language = os.path.join(self._root_training_set, language) texts = [] root_stat_language = os.path.join(self._root_model, language) if os.path.isfile(root_stat_language): continue for f in [x for x in os.listdir(root_training_set_language) if not x.startswith('.')]: filename = os.path.join(root_training_set_language, f) tokens = tokenizer(file_to_string(filename), 'letter') - texts.append(' '.join(tokens)) - + texts.append((' '.join(chr(token) for token in tokens))) train_text = ' '.join(texts) - command = ['../../bin/lmplz', '-o', '5', '--discount_fallback'] + command = ['../../bin/lmplz', '-o', '3', '-T', '/tmp', '--discount_fallback'] with open(root_stat_language, 'wb') as f: proc = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=f) proc.communicate(train_text.encode()) if os.path.getsize(root_stat_language) == 0: os.remove(root_stat_language) # st = os.stat(root_stat_language) # os.chmod(root_stat_language, st.st_mode | stat.S_IEXEC) def test(self): - with open(self._path_result, 'rb') as r : + try: + r = open(self._path_result, 'rb') test_result = load(r) + r.close() + except FileNotFoundError: + test_result = {} models = self._load_models() - for language in [x for x in os.listdir(self._root_language_dataset) if not x.startswith('.') and x not in test_result.keys()]: + for language in [x for x in os.listdir(self._root_model) if not x.startswith('.') and x not in test_result.keys()]: test_result[language] = self.test_class(models, language) with open(self._path_result, 'wb') as f: dump(test_result, f) def _load_models(self): models = {} for model in [model for model in os.listdir(self._root_model) if not model.startswith('.')]: root_model = os.path.join(self._root_model, model) models[model] = kenlm.LanguageModel(root_model) return models def _get_test_set(self, language): root_training_language = os.path.join(self._root_training_set, language) root_language = os.path.join(self._root_language_dataset, language) total = count_files(root_language) training_set = [int(os.path.splitext(x)[0]) for x in os.listdir(root_training_language) if not x.startswith('.')] - test_set = [find_file(root_language, x) for x in range(1, total + 1) if x not in training_set][:1000] + it = (find_file(root_language, x) for x in range(1, total + 1) if x not in training_set and os.path.getsize(find_file(root_language, x)) <= 1048576) + test_set = list(islice(it, 1000)) + if len(test_set) == 0: + it = (find_file(root_language, x) for x in range(1, total + 1) if x not in training_set) + test_set = list(islice(it, 1000)) return test_set + def _count_size(self, files): + size = 0 + for f in files: + size += os.path.getsize(f) + return size + def test_class(self, models, language): test_set = self._get_test_set(language) ok = 0 results = [] + count = 0 + length = len(test_set) for test in test_set: result = self._guess_file_language(models, test) - print('{} 
'.format(result[0]),end='\r') + count += 1 + print('[{0:4d}/{1:4d}] {2}:{3} '.format(count, length, result[0][1], result[0][0]),end='\r') results.append(result[0]) if result[0][1] == language: ok += 1 total_test = len(test_set) accuracy = ok / len(test_set) print('Tests for {} '.format(language)) print('Total test files : {}'.format(total_test)) print('Correctly classified files : {}'.format(ok)) print('Accuracy : {}%'.format(accuracy * 100)) return (ok, len(test_set), accuracy, results) + def speed_benchmark(self): + language = [x for x in os.listdir(self._root_model) if not x.startswith('.')][10] + models = self._load_models() + + test_set = self._get_test_set(language) + total_size = self._count_size(test_set) + print('{} kB in total'.format(total_size / 1024)) + + t_start = time.perf_counter() + self.test_class(models, language) + t_end = time.perf_counter() + + print('{} seconds.'.format(t_end - t_start)) + print('{} seconds per kB'.format(((t_end - t_start) / total_size) * 1024)) + def _guess_file_language(self, models, filename): tokens = tokenizer(file_to_string(filename), 'letter') - text = ' '.join(tokens) + text = ' '.join(chr(token) for token in tokens) result = [] for model_key in models.keys(): root_model = os.path.join(self._root_model, model_key) model = models[model_key] score = model.score(text) result.append((score, model_key)) return sorted(result, reverse=True) if __name__ == '__main__': if len(sys.argv) == 3 and sys.argv[1] == '--train': n = NGramProb(sys.argv[2]) n.train() elif len(sys.argv) == 3 and sys.argv[1] == '--test': n = NGramProb(sys.argv[2]) n.test() elif len(sys.argv) == 3 and sys.argv[1] == '--benchmark': n = NGramProb(sys.argv[2]) n.speed_benchmark() elif len(sys.argv) == 4 and sys.argv[1] == '--test': n = NGramProb(sys.argv[2]) n.test_class(n.load_models(), sys.argv[3]) else: print('Wrong arguments, please check your input.') diff --git a/swh/langdetect/utils/common.py b/swh/langdetect/utils/common.py index 72d3604..65864f1 100644 --- a/swh/langdetect/utils/common.py +++ b/swh/langdetect/utils/common.py @@ -1,79 +1,79 @@ """ Here regroup basic preprocessing methods used in learning stage for different approaches. """ import re, os _re_string = re.compile(r"""("(\\.|[^"\\])*"|'(\\.|[^'\\])*')""") _re_number = re.compile(r'([\d]+)|([\d]+.[\d]+)[^A-Za-z]') _re_separator = re.compile(r'(\W)') _not_start_with_point = lambda x: not x.startswith('.') def tokenizer(text, re_name): ''' Splits text into tokens ''' if re_name == 'letter': return list(text) elif re_name == 'word': return [word for word in _re_separator.split(text) if word.strip(' \t')] def file_to_string(filename): """ Read a file to a string. 
""" - with open(filename, 'r', errors='ignore') as f: - data = f.read().lower() + with open(filename, 'rb') as f: + data = f.read() return replace_string_and_number(data) def count_files(root_language): all_folders = natural_sort(filter (_not_start_with_point, os.listdir(root_language))) files = natural_sort(filter (_not_start_with_point, os.listdir(root_language + '/' + all_folders[-1]))) (max,_) = os.path.splitext(files[-1]) return int(max) def find_file(root_language, n): '''Find the n-th file in language folder''' if n > count_files(root_language): return '' else: start = (n - 1) // 1000 * 1000 + 1 end = start + 999 root_count = root_language + '/' + str(start) + '-' + str(end) files = natural_sort(filter (_not_start_with_point, os.listdir(root_count))) return root_count + '/' + files[n - start] def replace_string_and_number(text): """ Replace strings and numbers in a file by special tokens """ # str_replaced = re.sub(_re_string, '__str__', text) # str_num_replaced = re.sub(_re_number, '__num__', str_replaced) str_num_replaced = text return str_num_replaced def natural_sort(l): convert = lambda text: int(text) if text.isdigit() else text.lower() alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] return sorted(l, key = alphanum_key) def remove_comment(text): # TODO: remove only inline comments and block comments # TODO: maybe build a list of comment markers pass def purify(text, lang): # TODO: for some language like HTML, remove code other than principal language pass