diff --git a/scripts/comparison.pdf b/scripts/comparison.pdf new file mode 100644 index 0000000..82e40f1 Binary files /dev/null and b/scripts/comparison.pdf differ diff --git a/scripts/draw_accuracy.py b/scripts/draw_accuracy.py index 614c34f..979645f 100644 --- a/scripts/draw_accuracy.py +++ b/scripts/draw_accuracy.py @@ -1,77 +1,138 @@ #!/bin/bash/python3 import sys from pickle import load from collections import namedtuple, Counter try: import numpy as np import matplotlib.pyplot as plt from matplotlib.ticker import MaxNLocator except ImportError: raise ImportError('Please install matplotlib') -def main(path): +def heatmap(path): with open(path, 'rb') as f: data = load(f) mat = process(data) labels = sorted(data) fig, ax = plt.subplots() fig.set_size_inches(100,100) heatmap = ax.matshow(mat, cmap='Blues') fig = plt.gcf() ax.set_frame_on(False) ax.set_yticks(np.arange(len(labels)), minor=False) ax.set_xticks(np.arange(len(labels)), minor=False) - + + ax.set_xlabel('Classification of test files') + ax.set_ylabel('Ground truth class of test files') ax.set_xticklabels(labels, minor=False) ax.set_yticklabels(labels, minor=False) ax.xaxis.tick_top() + ax.xaxis.set_label_position('top') plt.xticks(rotation=90) ax.grid(False) ''' for i in np.arange(len(mat)): for j in np.arange(len(mat[i])): ax.text(i, j, "%.1f" % (mat[i][j] * 100), color='white') ''' ax = plt.gca() for t in ax.xaxis.get_major_ticks(): t.tick1On = False t.tick2On = False for t in ax.yaxis.get_major_ticks(): t.tick1On = False t.tick2On = False fig.savefig("results.pdf", bbox_inches='tight') def process(data): ''' ''' ldata = sorted(data) length = len(ldata) out = [[0 for x in range(length)] for y in range(length)] for lang in ldata: index_lan = ldata.index(lang) ok = data[lang][0] - if data[lang][1] >= 1000 : + if data[lang][1] > 1000 : test_size = 1000 else: test_size = data[lang][1] result = [x[1] for x in data[lang][3]] counter = dict(Counter(result)) for res_lan in counter.keys(): index_res = ldata.index(res_lan) - out[index_lan][index_res] = counter[res_lan] / test_size + out[index_lan][index_res] = counter.get(res_lan, 0) / test_size return out +def get_accuracy(data): + ldata = sorted(data) + out = {} + for lang in ldata: + ok = data[lang][0] + if data[lang][1] > 1000: + test_size = 1000 + else: + test_size = data[lang][1] + result = [x[1] for x in data[lang][3]] + counter = dict(Counter(result)) + out[lang] = counter.get(lang, 0) / test_size + return out + +def compare(baseline, target): + with open(baseline, 'rb') as f: + data = load(f) + dict_base = get_accuracy(data) + + with open(target, 'rb') as f: + data = load(f) + dict_targ = get_accuracy(data) + + all_lang = sorted(list(set().union(dict_base.keys(),dict_targ.keys())))[::-1] + n = len(all_lang) + acc_base = [dict_base.get(lang, 0) for lang in all_lang] + acc_targ = [dict_targ.get(lang, 0) for lang in all_lang] + + fig, ax = plt.subplots() + fig.set_size_inches(10,200) + ind = np.arange(n) + width = 0.35 + opacity = 0.4 + rects1 = ax.barh(ind + width, acc_base, width, alpha=opacity, color='b', label='N-grams with frequency distance') + rects2 = ax.barh(ind, acc_targ, width, alpha=opacity, color='r', label='N-grams with probability') + + ax.set_xlabel('Accuracy / %') + ax.set_yticks(ind + width / 2) + ax.set_yticklabels(all_lang) + vals = ax.get_xticks() + ax.set_xticklabels(['{:3.0f}%'.format(x * 100) for x in vals]) + ax.xaxis.tick_top() + ax.legend() + + def autolabel(rects): + for rect in rects: + width = rect.get_width() + ax.text(width + 0.01, rect.get_y() + 
rect.get_height() / 2., '{0:.1f}%'.format(width * 100), ha='left', va='center') + + autolabel(rects1) + autolabel(rects2) + plt.ylim([-1,n+1]) + + fig.tight_layout() + fig.savefig("comparison.pdf", bbox_inches='tight') + if __name__ == '__main__': - if len(sys.argv) != 2: - print('Only argument acceptable is a path.') + if len(sys.argv) == 2: + heatmap(sys.argv[1]) + elif len(sys.argv) == 3: + compare(sys.argv[1],sys.argv[2]) else: - main(sys.argv[1]) + print('Please check arguments.') diff --git a/scripts/result_ngrams_frequency_distance.pdf b/scripts/results_ngrams_frequency_distance.pdf similarity index 50% rename from scripts/result_ngrams_frequency_distance.pdf rename to scripts/results_ngrams_frequency_distance.pdf index be05137..8d0908f 100644 Binary files a/scripts/result_ngrams_frequency_distance.pdf and b/scripts/results_ngrams_frequency_distance.pdf differ diff --git a/scripts/results_ngrams_prob.pdf b/scripts/results_ngrams_prob.pdf new file mode 100644 index 0000000..b56ce47 Binary files /dev/null and b/scripts/results_ngrams_prob.pdf differ diff --git a/swh/langdetect/cnn.py b/swh/langdetect/cnn.py new file mode 100644 index 0000000..81788c6 --- /dev/null +++ b/swh/langdetect/cnn.py @@ -0,0 +1,247 @@ + +import os, sys, subprocess, time +import kenlm + +from itertools import islice +from pickle import dump, load +from collections import Counter +from numpy import array +from utils.common import tokenizer, file_to_string, find_file, count_files +from keras.utils.vis_utils import plot_model +from keras.preprocessing.sequence import pad_sequences +from keras.models import Model +from keras.layers import Input, Dense, Flatten, Dropout, Embedding, ThresholdedReLU +from keras.layers.convolutional import Convolution1D, MaxPooling1D +from keras.layers.merge import concatenate + + +class CNN: + + def __init__(self, root): + # Root of dataset + self._root = root + + # Root of training set + self._root_training_set = os.path.join(self._root, '..', 'training_set') + + # Root of model folder + self._root_model = os.path.join(self._root, '..', 'model_cnn') + + # Root of arranged dataset + self._root_language_dataset = os.path.join(self._root, '..', 'code_by_language') + + # Path of result + self._path_result = os.path.join(self._root, '..', 'result_cnn') + + def train(self): + try: + if len(os.listdir(self._root_training_set)) == 0: + build_training_set(self._root) + try: + os.mkdir(self._root_model) + except FileExistsError: + pass + except FileNotFoundError: + os.mkdir(self._root_training_set) + build_training_set(self._root) + + languages = [x for x in os.listdir(self._root_training_set) if not x.startswith('.')] + try: + f = open(os.path.join(self._root, '..', 'model_cnn', 'texts+labels'), 'rb') + train_file_with_label = load(f) + except FileNotFoundError: + train_file_with_label = self._train_file_with_label(languages) + with open(os.path.join(self._root, '..', 'model_cnn', 'texts+labels'), 'wb') as f: + dump(train_file_with_label, f) + + length = 1000 + vocab_size = 256 + total_class = len(languages) + + model = self._get_model(length, vocab_size, total_class) + model.fit_generator(self._generator(length, total_class), steps_per_epoch=len(train_file_with_label), epochs=10) + model.save(os.path.join(self._root, '..', 'model_cnn', 'model.h5')) + + def _generator(self, length, total_class): + while True: + with open(os.path.join(self._root, '..', 'model_cnn', 'texts+labels'), 'rb') as f: + train_file_with_label = load(f) + for pair in train_file_with_label: + path, label = pair + tokens 
= [x + 1 for x in tokenizer(file_to_string(path), 'letter')] + tokens = pad_sequences([tokens], maxlen=length, padding='post') + truth = array([[0 for _ in range(total_class)]]) + truth[0][label] = 1 + yield ([tokens], truth) + + def _train_file_with_label(self, languages): + l = [] + + for language in languages: + root_training_set_language = os.path.join(self._root_training_set, language) + root_stat_language = os.path.join(self._root_model, language) + index_lang = languages.index(language) + if os.path.isfile(root_stat_language): + continue + print(language) + for f in [x for x in os.listdir(root_training_set_language) if not x.startswith('.')]: + filename = os.path.join(root_training_set_language, f) + l.append((filename, index_lang)) + + return l + + + def _get_model(self, length, vocab_size, total_class): + + input_size = length + alphabet_size = vocab_size + embedding_size = 128 + conv_layers = [(256,7,3), (256,7,3), (256,3,-1), (256,3,-1), (256,3,-1), (256,3,3)] + threshold = 1e-6 + fully_connected_layers = [1024, 1024] + dropout_p = 0.5 + optimizer = 'adam' + loss = 'categorical_crossentropy' + num_of_classes = total_class + + # Input layer + inputs = Input(shape=(input_size,), name='sent_input', dtype='int64') + # Embedding layers + x = Embedding(alphabet_size + 1, embedding_size, input_length=input_size)(inputs) + # Convolution layers + for cl in conv_layers: + x = Convolution1D(cl[0], cl[1])(x) + x = ThresholdedReLU(threshold)(x) + if cl[2] != -1: + x = MaxPooling1D(cl[2])(x) + x = Flatten()(x) + # Fully connected layers + for fl in fully_connected_layers: + x = Dense(fl)(x) + x = ThresholdedReLU(threshold)(x) + x = Dropout(dropout_p)(x) + # Output layer + predictions = Dense(num_of_classes, activation='softmax')(x) + # Build and compile model + model = Model(inputs=inputs, outputs=predictions) + model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy']) + + print(model.summary()) + + return model + + def _max_len(self, texts): + return max([len(text) for text in texts]) + + def _vocabulary_size(self, texts): + vocabulary = dict(Counter([token for text in texts for token in text])) + return len(vocabulary.keys()) + + def test(self): + try: + r = open(self._path_result, 'rb') + test_result = load(r) + r.close() + except FileNotFoundError: + test_result = {} + models = self._load_models() + + for language in [x for x in os.listdir(self._root_model) if not x.startswith('.') and x not in test_result.keys()]: + test_result[language] = self.test_class(models, language) + with open(self._path_result, 'wb') as f: + dump(test_result, f) + + def _load_models(self): + models = {} + + for model in [model + for model in os.listdir(self._root_model) + if not model.startswith('.')]: + root_model = os.path.join(self._root_model, model) + models[model] = kenlm.LanguageModel(root_model) + return models + + def _get_test_set(self, language): + root_training_language = os.path.join(self._root_training_set, language) + root_language = os.path.join(self._root_language_dataset, language) + total = count_files(root_language) + training_set = [int(os.path.splitext(x)[0]) for x in os.listdir(root_training_language) if not x.startswith('.')] + it = (find_file(root_language, x) for x in range(1, total + 1) if x not in training_set and os.path.getsize(find_file(root_language, x)) <= 1048576) + test_set = list(islice(it, 1000)) + if len(test_set) == 0: + it = (find_file(root_language, x) for x in range(1, total + 1) if x not in training_set) + test_set = list(islice(it, 1000)) + return test_set + 
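+    # _get_test_set above keeps at most 1000 files per language that were not used for
+    # training and are no larger than 1 MiB; the size filter is dropped if it would
+    # leave no test files at all.
+    # _count_size below sums os.path.getsize over those files so that speed_benchmark
+    # can report seconds per kB.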
+ def _count_size(self, files): + size = 0 + for f in files: + size += os.path.getsize(f) + return size + + def test_class(self, models, language): + test_set = self._get_test_set(language) + + ok = 0 + results = [] + count = 0 + length = len(test_set) + for test in test_set: + result = self._guess_file_language(models, test) + count += 1 + print('[{0:4d}/{1:4d}] {2}:{3} '.format(count, length, result[0][1], result[0][0]),end='\r') + results.append(result[0]) + if result[0][1] == language: + ok += 1 + + total_test = len(test_set) + accuracy = ok / len(test_set) + print('Tests for {} '.format(language)) + print('Total test files : {}'.format(total_test)) + print('Correctly classified files : {}'.format(ok)) + print('Accuracy : {}%'.format(accuracy * 100)) + return (ok, len(test_set), accuracy, results) + + def speed_benchmark(self): + language = [x for x in os.listdir(self._root_model) if not x.startswith('.')][10] + models = self._load_models() + + test_set = self._get_test_set(language) + total_size = self._count_size(test_set) + print('{} kB in total'.format(total_size / 1024)) + + t_start = time.perf_counter() + self.test_class(models, language) + t_end = time.perf_counter() + + print('{} seconds.'.format(t_end - t_start)) + print('{} seconds per kB'.format(((t_end - t_start) / total_size) * 1024)) + + def _guess_file_language(self, models, filename): + tokens = tokenizer(file_to_string(filename), 'letter') + text = ' '.join(chr(token) for token in tokens) + + result = [] + + for model_key in models.keys(): + root_model = os.path.join(self._root_model, model_key) + model = models[model_key] + score = model.score(text) + result.append((score, model_key)) + return sorted(result, reverse=True) + +if __name__ == '__main__': + if len(sys.argv) == 3 and sys.argv[1] == '--train': + n = CNN(sys.argv[2]) + n.train() + elif len(sys.argv) == 3 and sys.argv[1] == '--test': + n = NGramProb(sys.argv[2]) + n.test() + elif len(sys.argv) == 3 and sys.argv[1] == '--benchmark': + n = NGramProb(sys.argv[2]) + n.speed_benchmark() + elif len(sys.argv) == 4 and sys.argv[1] == '--test': + n = NGramProb(sys.argv[2]) + n.test_class(n.load_models(), sys.argv[3]) + else: + print('Wrong arguments, please check your input.') diff --git a/swh/langdetect/ngramdist.py b/swh/langdetect/naivebayesian.py similarity index 52% copy from swh/langdetect/ngramdist.py copy to swh/langdetect/naivebayesian.py index 46ed397..495dd86 100644 --- a/swh/langdetect/ngramdist.py +++ b/swh/langdetect/naivebayesian.py @@ -1,243 +1,221 @@ """ -Baseline approach +Naive Bayesian """ import os, sys, operator, nltk, random, time +import numpy as np +from itertools import islice from pickle import dump, load -from nltk.util import ngrams from utils.common import tokenizer, file_to_string, find_file, count_files from utils.training import build_training_set +from nltk.util import ngrams +from collections import Counter +from sklearn.naive_bayes import MultinomialNB +from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer +from sklearn.externals import joblib -class NGramDist: +class NaiveBayesian: def __init__(self, root): # Root of dataset self._root = root # Root of training set self._root_training_set = os.path.join(self._root, '..', 'training_set') # Root of model folder - self._root_model = os.path.join(self._root, '..', 'model_ngram_dist') + self._root_model = os.path.join(self._root, '..', 'model_bayesian') # Root of arranged dataset self._root_language_dataset = os.path.join(self._root, '..', 'code_by_language') # 
Path of result - self._path_result = os.path.join(self._root, '..', 'result') - + self._path_result = os.path.join(self._root, '..', 'result_bayesian') + + self._languages = [x for x in os.listdir(self._root_training_set) if not x.startswith('.')] def train(self): ''' train () generates and stores counted n-grams in '_root_model' folder ''' try: if len(os.listdir(self._root_training_set)) == 0: build_training_set(self._root) try: os.mkdir(self._root_model) except FileExistsError: pass except FileNotFoundError: os.mkdir(self._root_training_set) build_training_set(self._root) ''' Calculate frequencies of generated n-grams then store them into a sorted list of (ngram, count) ''' - for language in os.listdir(self._root_training_set): - if not language.startswith('.'): - root_training_set_language = os.path.join(self._root_training_set, language) - root_stat_language = os.path.join(self._root_model, language) - if os.path.isfile(root_stat_language): - continue - statistics = {} - for f in os.listdir(root_training_set_language): - print(f) - if not f.startswith('.'): - filename = os.path.join(root_training_set_language, f) - tokens = tokenizer(file_to_string(filename), 'letter') - generated_ngrams = self._generate_ngrams(tokens, 3) - self._count_ngrams(statistics, generated_ngrams) - with open(root_stat_language, 'wb') as f: - dump(self._sort_by_value(statistics), f) - def _generate_ngrams(self, tokens, n): - ''' - :param tokens: generated tokens from a string. - :param n: maximum n of n-grams - :type tokens: list - :type n: int - :return: generated 1-grams, ... , n-grams - :rtype: list - ''' - generated_ngrams = [] - - for i in range(1, n+1): - igrams = ngrams(tokens, i, - pad_left=True, - pad_right=True, - left_pad_symbol = '$BOF$', - right_pad_symbol = '$EOF$') - for igram in igrams: - generated_ngrams.append(''.join(igram)) - - return generated_ngrams + clf = MultinomialNB() + cv = HashingVectorizer(analyzer='char', ngram_range=(1, 3), n_features=2**17, alternate_sign=False) + + indices = list(range(len(self._languages))) + + for language in self._languages: + root_training_set_language = os.path.join(self._root_training_set, language) + root_stat_language = os.path.join(self._root_model, 'classifier') + index_lang = self._languages.index(language) + texts = [] + for f in os.listdir(root_training_set_language): + if not f.startswith('.'): + print(f) + filename = os.path.join(root_training_set_language, f) + tokens = tokenizer(file_to_string(filename), 'letter') + text = ''.join([chr(token) for token in tokens]) + texts.append(text) + counts = cv.fit_transform(texts) + tf = TfidfTransformer(use_idf=False).fit(counts) + normalised = tf.transform(counts) + clf.partial_fit(normalised, np.array([index_lang for _ in texts]), indices) - def _count_ngrams(self, statistics, ngrams): - ''' - :param statistics: shared dictionary for statistics - :param ngrams: n-grams to be accumulated into statistics - ''' - for ngram in ngrams: - statistics[ngram] = statistics.get(ngram, 0) + 1 + with open(root_stat_language + '.clf', 'wb') as f: + joblib.dump(clf, f) + with open(root_stat_language + '.hv', 'wb') as f: + joblib.dump(cv, f) + def test(self): - test_result = {} - models = self._load_models() + try: + r = open(self._path_result, 'rb') + test_result = load(r) + r.close() + except FileNotFoundError: + test_result = {} + + with open(os.path.join(self._root_model, 'classifier'), 'rb') as f: + clf, cv = load(f) - for language in [x for x in os.listdir(self._root_language_dataset) if not x.startswith('.')]: - 
test_result[language] = self.test_class(models, language) - with open(self._path_result, 'wb') as f: - dump(test_result, f) + for language in [x for x in os.listdir(self._root_training_set) if not x.startswith('.') and x not in test_result.keys()]: + test_result[language] = self.test_class((clf, cv), language) + with open(self._path_result, 'wb') as f: + dump(test_result, f) def speed_benchmark(self): - language = random.choice([x for x in os.listdir(self._root_language_dataset) if not x.startswith('.')]) + language = [x for x in os.listdir(self._root_training_set) if not x.startswith('.')][10] models = self._load_models() test_set = self._get_test_set(language) total_size = self._count_size(test_set) print('{} kB in total'.format(total_size / 1024)) t_start = time.perf_counter() self.test_class(models, language) t_end = time.perf_counter() print('{} seconds.'.format(t_end - t_start)) print('{} seconds per kB'.format(((t_end - t_start) / total_size) * 1024)) - def _load_models(self): - models = {} - - for model in [model - for model in os.listdir(self._root_model) - if not model.startswith('.')]: - root_model = os.path.join(self._root_model, model) - with open(root_model, 'rb') as sorted_file: - models[model] = self._list_to_dict(load(sorted_file)) - - return models - - def _list_to_dict(self, model): - model_ngrams = [x[0] for x in model] - model_dict = {} - index = 0 - for ngram in model_ngrams: - index += 1 - model_dict[ngram] = index - return model_dict - def _get_test_set(self, language): root_training_language = os.path.join(self._root_training_set, language) root_language = os.path.join(self._root_language_dataset, language) total = count_files(root_language) training_set = [int(os.path.splitext(x)[0]) for x in os.listdir(root_training_language) if not x.startswith('.')] - test_set = [find_file(root_language, x) for x in range(1, total + 1) if x not in training_set][:1000] + it = (find_file(root_language, x) for x in range(1, total + 1) if x not in training_set and os.path.getsize(find_file(root_language, x)) <= 1048576) + test_set = list(islice(it, 1000)) + if len(test_set) == 0: + it = (find_file(root_language, x) for x in range(1, total + 1) if x not in training_set) + test_set = list(islice(it, 1000)) return test_set def _count_size(self, files): size = 0 for f in files: size += os.path.getsize(f) return size - def test_class(self, models, language): + def test_class(self, clf, language): test_set = self._get_test_set(language) + index_lang = self._languages.index(language) ok = 0 results = [] + count = 0 + length = len(test_set) for test in test_set: - result = self._guess_file_language(models, test) - print('{} '.format(result[0]),end='\r') + result = self._guess_file_language(clf, test) + count += 1 + print('[{0:4d}/{1:4d}] {2}:{3} '.format(count, length, result[0][1], result[0][0]),end='\r') results.append(result[0]) if result[0][1] == language: ok += 1 total_test = len(test_set) accuracy = ok / len(test_set) print('Tests for {} '.format(language)) print('Total test files : {}'.format(total_test)) print('Correctly classified files : {}'.format(ok)) print('Accuracy : {}%'.format(accuracy * 100)) return (ok, len(test_set), accuracy, results) - def test_single(self, models, filename): - self._guess_file_language(models, filename) - - def _guess_file_language(self, models, filename): + def test_single(self, filename): + self._guess_file_language(clf, filename) + def _guess_file_language(self, cc, filename): + clf = cc[0] + cv = cc[1] tokens = tokenizer(file_to_string(filename), 
'letter') - generated_ngrams = self._generate_ngrams(tokens, 3) - - statistics = {} - self._count_ngrams(statistics, generated_ngrams) - - test_profile = self._list_to_dict(self._sort_by_value(statistics)) + text = ''.join([chr(token) for token in tokens]) + counts = cv.fit_transform([text]) + tf = TfidfTransformer(use_idf=False).fit(counts) + normalised = tf.transform(counts) - result = [] + result = clf.predict_log_proba(normalised) - for model in models.keys(): - root_model = os.path.join(self._root_model, model) - model_profile = models[model] - distance = self._distance(model_profile, test_profile) - result.append((distance, model)) - - return sorted(result) + result = [(val, self._languages[idx]) for idx, val in enumerate(result[0])] + + return sorted(result, reverse=True) def _sort_by_value(self, statistics): statistics_sorted = sorted(statistics.items(), key = operator.itemgetter(1), reverse = True)[:500] return statistics_sorted def _distance(self, model_profile, test_profile): distance = 0 maximum = len(test_profile) for test_ngram in test_profile.keys(): test_rank = test_profile.get(test_ngram) model_rank = model_profile.get(test_ngram, maximum) d = abs(test_rank - model_rank) distance += d return distance ''' def _prob(model, trigrams): print('Checking {} model ...'.format(model)) with open(model, 'rb') as f: kneser_ney = load(f) result = 1 for trigram in trigrams: prob = kneser_ney.prob(trigram) result = result * prob return result ''' if __name__ == '__main__': if len(sys.argv) == 3 and sys.argv[1] == '--train': - n = NGramDist(sys.argv[2]) + n = NaiveBayesian(sys.argv[2]) n.train() elif len(sys.argv) == 3 and sys.argv[1] == '--test': - n = NGramDist(sys.argv[2]) + n = NaiveBayesian(sys.argv[2]) n.test() elif len(sys.argv) == 3 and sys.argv[1] == '--benchmark': - n = NGramDist(sys.argv[2]) + n = NaiveBayesian(sys.argv[2]) n.speed_benchmark() elif len(sys.argv) == 4 and sys.argv[1] == '--test': - n = NGramDist(sys.argv[2]) + n = NaiveBayesian(sys.argv[2]) n.test_class(n.load_models(), sys.argv[3]) else: print('Wrong arguments, please check your input.') diff --git a/swh/langdetect/ngramdist.py b/swh/langdetect/ngramdist.py index 46ed397..06449dd 100644 --- a/swh/langdetect/ngramdist.py +++ b/swh/langdetect/ngramdist.py @@ -1,243 +1,256 @@ """ Baseline approach """ import os, sys, operator, nltk, random, time +from itertools import islice from pickle import dump, load from nltk.util import ngrams from utils.common import tokenizer, file_to_string, find_file, count_files from utils.training import build_training_set class NGramDist: def __init__(self, root): # Root of dataset self._root = root # Root of training set self._root_training_set = os.path.join(self._root, '..', 'training_set') # Root of model folder self._root_model = os.path.join(self._root, '..', 'model_ngram_dist') # Root of arranged dataset self._root_language_dataset = os.path.join(self._root, '..', 'code_by_language') # Path of result - self._path_result = os.path.join(self._root, '..', 'result') + self._path_result = os.path.join(self._root, '..', 'result_freq') def train(self): ''' train () generates and stores counted n-grams in '_root_model' folder ''' try: if len(os.listdir(self._root_training_set)) == 0: build_training_set(self._root) try: os.mkdir(self._root_model) except FileExistsError: pass except FileNotFoundError: os.mkdir(self._root_training_set) build_training_set(self._root) ''' Calculate frequencies of generated n-grams then store them into a sorted list of (ngram, count) ''' for language in 
os.listdir(self._root_training_set): if not language.startswith('.'): root_training_set_language = os.path.join(self._root_training_set, language) root_stat_language = os.path.join(self._root_model, language) if os.path.isfile(root_stat_language): continue statistics = {} for f in os.listdir(root_training_set_language): print(f) if not f.startswith('.'): filename = os.path.join(root_training_set_language, f) tokens = tokenizer(file_to_string(filename), 'letter') - generated_ngrams = self._generate_ngrams(tokens, 3) + generated_ngrams = self._generate_ngrams([chr(token) for token in tokens], 3) self._count_ngrams(statistics, generated_ngrams) with open(root_stat_language, 'wb') as f: dump(self._sort_by_value(statistics), f) def _generate_ngrams(self, tokens, n): ''' :param tokens: generated tokens from a string. :param n: maximum n of n-grams :type tokens: list :type n: int :return: generated 1-grams, ... , n-grams :rtype: list ''' generated_ngrams = [] for i in range(1, n+1): igrams = ngrams(tokens, i, pad_left=True, pad_right=True, left_pad_symbol = '$BOF$', right_pad_symbol = '$EOF$') for igram in igrams: generated_ngrams.append(''.join(igram)) return generated_ngrams def _count_ngrams(self, statistics, ngrams): ''' :param statistics: shared dictionary for statistics :param ngrams: n-grams to be accumulated into statistics ''' for ngram in ngrams: statistics[ngram] = statistics.get(ngram, 0) + 1 def test(self): - test_result = {} + try: + r = open(self._path_result, 'rb') + test_result = load(r) + r.close() + except FileNotFoundError: + test_result = {} models = self._load_models() - for language in [x for x in os.listdir(self._root_language_dataset) if not x.startswith('.')]: + for language in [x for x in os.listdir(self._root_model) if not x.startswith('.') and x not in test_result.keys()]: test_result[language] = self.test_class(models, language) - with open(self._path_result, 'wb') as f: - dump(test_result, f) + with open(self._path_result, 'wb') as f: + dump(test_result, f) def speed_benchmark(self): - language = random.choice([x for x in os.listdir(self._root_language_dataset) if not x.startswith('.')]) + language = [x for x in os.listdir(self._root_model) if not x.startswith('.')][10] models = self._load_models() test_set = self._get_test_set(language) total_size = self._count_size(test_set) print('{} kB in total'.format(total_size / 1024)) t_start = time.perf_counter() self.test_class(models, language) t_end = time.perf_counter() print('{} seconds.'.format(t_end - t_start)) print('{} seconds per kB'.format(((t_end - t_start) / total_size) * 1024)) def _load_models(self): models = {} for model in [model for model in os.listdir(self._root_model) if not model.startswith('.')]: root_model = os.path.join(self._root_model, model) with open(root_model, 'rb') as sorted_file: models[model] = self._list_to_dict(load(sorted_file)) return models def _list_to_dict(self, model): model_ngrams = [x[0] for x in model] model_dict = {} index = 0 for ngram in model_ngrams: index += 1 model_dict[ngram] = index return model_dict def _get_test_set(self, language): root_training_language = os.path.join(self._root_training_set, language) root_language = os.path.join(self._root_language_dataset, language) total = count_files(root_language) training_set = [int(os.path.splitext(x)[0]) for x in os.listdir(root_training_language) if not x.startswith('.')] - test_set = [find_file(root_language, x) for x in range(1, total + 1) if x not in training_set][:1000] + it = (find_file(root_language, x) for x in 
range(1, total + 1) if x not in training_set and os.path.getsize(find_file(root_language, x)) <= 1048576) + test_set = list(islice(it, 1000)) + if len(test_set) == 0: + it = (find_file(root_language, x) for x in range(1, total + 1) if x not in training_set) + test_set = list(islice(it, 1000)) return test_set def _count_size(self, files): size = 0 for f in files: size += os.path.getsize(f) return size def test_class(self, models, language): test_set = self._get_test_set(language) ok = 0 results = [] + count = 0 + length = len(test_set) for test in test_set: result = self._guess_file_language(models, test) - print('{} '.format(result[0]),end='\r') + count += 1 + print('[{0:4d}/{1:4d}] {2}:{3} '.format(count, length, result[0][1], result[0][0]),end='\r') results.append(result[0]) if result[0][1] == language: ok += 1 total_test = len(test_set) accuracy = ok / len(test_set) print('Tests for {} '.format(language)) print('Total test files : {}'.format(total_test)) print('Correctly classified files : {}'.format(ok)) print('Accuracy : {}%'.format(accuracy * 100)) return (ok, len(test_set), accuracy, results) def test_single(self, models, filename): self._guess_file_language(models, filename) def _guess_file_language(self, models, filename): tokens = tokenizer(file_to_string(filename), 'letter') - generated_ngrams = self._generate_ngrams(tokens, 3) + generated_ngrams = self._generate_ngrams([chr(token) for token in tokens], 3) statistics = {} self._count_ngrams(statistics, generated_ngrams) test_profile = self._list_to_dict(self._sort_by_value(statistics)) result = [] for model in models.keys(): root_model = os.path.join(self._root_model, model) model_profile = models[model] distance = self._distance(model_profile, test_profile) result.append((distance, model)) return sorted(result) def _sort_by_value(self, statistics): statistics_sorted = sorted(statistics.items(), key = operator.itemgetter(1), reverse = True)[:500] return statistics_sorted def _distance(self, model_profile, test_profile): distance = 0 maximum = len(test_profile) for test_ngram in test_profile.keys(): test_rank = test_profile.get(test_ngram) model_rank = model_profile.get(test_ngram, maximum) d = abs(test_rank - model_rank) distance += d return distance ''' def _prob(model, trigrams): print('Checking {} model ...'.format(model)) with open(model, 'rb') as f: kneser_ney = load(f) result = 1 for trigram in trigrams: prob = kneser_ney.prob(trigram) result = result * prob return result ''' if __name__ == '__main__': if len(sys.argv) == 3 and sys.argv[1] == '--train': n = NGramDist(sys.argv[2]) n.train() elif len(sys.argv) == 3 and sys.argv[1] == '--test': n = NGramDist(sys.argv[2]) n.test() elif len(sys.argv) == 3 and sys.argv[1] == '--benchmark': n = NGramDist(sys.argv[2]) n.speed_benchmark() elif len(sys.argv) == 4 and sys.argv[1] == '--test': n = NGramDist(sys.argv[2]) n.test_class(n.load_models(), sys.argv[3]) else: print('Wrong arguments, please check your input.') diff --git a/swh/langdetect/ngramprob.py b/swh/langdetect/ngramprob.py index ba8b5a3..7cf5e47 100644 --- a/swh/langdetect/ngramprob.py +++ b/swh/langdetect/ngramprob.py @@ -1,137 +1,169 @@ -import os, sys, subprocess +import os, sys, subprocess, time import kenlm +from itertools import islice from pickle import dump, load from utils.common import tokenizer, file_to_string, find_file, count_files class NGramProb: def __init__(self, root): # Root of dataset self._root = root # Root of training set self._root_training_set = os.path.join(self._root, '..', 'training_set') # 
Root of model folder self._root_model = os.path.join(self._root, '..', 'model_ngram_prob') # Root of arranged dataset self._root_language_dataset = os.path.join(self._root, '..', 'code_by_language') # Path of result self._path_result = os.path.join(self._root, '..', 'result_prob') def train(self): try: if len(os.listdir(self._root_training_set)) == 0: build_training_set(self._root) try: os.mkdir(self._root_model) except FileExistsError: pass except FileNotFoundError: os.mkdir(self._root_training_set) build_training_set(self._root) for language in [x for x in os.listdir(self._root_training_set) if not x.startswith('.')]: root_training_set_language = os.path.join(self._root_training_set, language) texts = [] root_stat_language = os.path.join(self._root_model, language) if os.path.isfile(root_stat_language): continue for f in [x for x in os.listdir(root_training_set_language) if not x.startswith('.')]: filename = os.path.join(root_training_set_language, f) tokens = tokenizer(file_to_string(filename), 'letter') - texts.append(' '.join(tokens)) - + texts.append((' '.join(chr(token) for token in tokens))) train_text = ' '.join(texts) - command = ['../../bin/lmplz', '-o', '5', '--discount_fallback'] + command = ['../../bin/lmplz', '-o', '3', '-T', '/tmp', '--discount_fallback'] with open(root_stat_language, 'wb') as f: proc = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=f) proc.communicate(train_text.encode()) if os.path.getsize(root_stat_language) == 0: os.remove(root_stat_language) # st = os.stat(root_stat_language) # os.chmod(root_stat_language, st.st_mode | stat.S_IEXEC) def test(self): - with open(self._path_result, 'rb') as r : + try: + r = open(self._path_result, 'rb') test_result = load(r) + r.close() + except FileNotFoundError: + test_result = {} models = self._load_models() - for language in [x for x in os.listdir(self._root_language_dataset) if not x.startswith('.') and x not in test_result.keys()]: + for language in [x for x in os.listdir(self._root_model) if not x.startswith('.') and x not in test_result.keys()]: test_result[language] = self.test_class(models, language) with open(self._path_result, 'wb') as f: dump(test_result, f) def _load_models(self): models = {} for model in [model for model in os.listdir(self._root_model) if not model.startswith('.')]: root_model = os.path.join(self._root_model, model) models[model] = kenlm.LanguageModel(root_model) return models def _get_test_set(self, language): root_training_language = os.path.join(self._root_training_set, language) root_language = os.path.join(self._root_language_dataset, language) total = count_files(root_language) training_set = [int(os.path.splitext(x)[0]) for x in os.listdir(root_training_language) if not x.startswith('.')] - test_set = [find_file(root_language, x) for x in range(1, total + 1) if x not in training_set][:1000] + it = (find_file(root_language, x) for x in range(1, total + 1) if x not in training_set and os.path.getsize(find_file(root_language, x)) <= 1048576) + test_set = list(islice(it, 1000)) + if len(test_set) == 0: + it = (find_file(root_language, x) for x in range(1, total + 1) if x not in training_set) + test_set = list(islice(it, 1000)) return test_set + def _count_size(self, files): + size = 0 + for f in files: + size += os.path.getsize(f) + return size + def test_class(self, models, language): test_set = self._get_test_set(language) ok = 0 results = [] + count = 0 + length = len(test_set) for test in test_set: result = self._guess_file_language(models, test) - print('{} 
'.format(result[0]),end='\r') + count += 1 + print('[{0:4d}/{1:4d}] {2}:{3} '.format(count, length, result[0][1], result[0][0]),end='\r') results.append(result[0]) if result[0][1] == language: ok += 1 total_test = len(test_set) accuracy = ok / len(test_set) print('Tests for {} '.format(language)) print('Total test files : {}'.format(total_test)) print('Correctly classified files : {}'.format(ok)) print('Accuracy : {}%'.format(accuracy * 100)) return (ok, len(test_set), accuracy, results) + def speed_benchmark(self): + language = [x for x in os.listdir(self._root_model) if not x.startswith('.')][10] + models = self._load_models() + + test_set = self._get_test_set(language) + total_size = self._count_size(test_set) + print('{} kB in total'.format(total_size / 1024)) + + t_start = time.perf_counter() + self.test_class(models, language) + t_end = time.perf_counter() + + print('{} seconds.'.format(t_end - t_start)) + print('{} seconds per kB'.format(((t_end - t_start) / total_size) * 1024)) + def _guess_file_language(self, models, filename): tokens = tokenizer(file_to_string(filename), 'letter') - text = ' '.join(tokens) + text = ' '.join(chr(token) for token in tokens) result = [] for model_key in models.keys(): root_model = os.path.join(self._root_model, model_key) model = models[model_key] score = model.score(text) result.append((score, model_key)) return sorted(result, reverse=True) if __name__ == '__main__': if len(sys.argv) == 3 and sys.argv[1] == '--train': n = NGramProb(sys.argv[2]) n.train() elif len(sys.argv) == 3 and sys.argv[1] == '--test': n = NGramProb(sys.argv[2]) n.test() elif len(sys.argv) == 3 and sys.argv[1] == '--benchmark': n = NGramProb(sys.argv[2]) n.speed_benchmark() elif len(sys.argv) == 4 and sys.argv[1] == '--test': n = NGramProb(sys.argv[2]) n.test_class(n.load_models(), sys.argv[3]) else: print('Wrong arguments, please check your input.') diff --git a/swh/langdetect/utils/common.py b/swh/langdetect/utils/common.py index 72d3604..65864f1 100644 --- a/swh/langdetect/utils/common.py +++ b/swh/langdetect/utils/common.py @@ -1,79 +1,79 @@ """ Here regroup basic preprocessing methods used in learning stage for different approaches. """ import re, os _re_string = re.compile(r"""("(\\.|[^"\\])*"|'(\\.|[^'\\])*')""") _re_number = re.compile(r'([\d]+)|([\d]+.[\d]+)[^A-Za-z]') _re_separator = re.compile(r'(\W)') _not_start_with_point = lambda x: not x.startswith('.') def tokenizer(text, re_name): ''' Splits text into tokens ''' if re_name == 'letter': return list(text) elif re_name == 'word': return [word for word in _re_separator.split(text) if word.strip(' \t')] def file_to_string(filename): """ Read a file to a string. 
""" - with open(filename, 'r', errors='ignore') as f: - data = f.read().lower() + with open(filename, 'rb') as f: + data = f.read() return replace_string_and_number(data) def count_files(root_language): all_folders = natural_sort(filter (_not_start_with_point, os.listdir(root_language))) files = natural_sort(filter (_not_start_with_point, os.listdir(root_language + '/' + all_folders[-1]))) (max,_) = os.path.splitext(files[-1]) return int(max) def find_file(root_language, n): '''Find the n-th file in language folder''' if n > count_files(root_language): return '' else: start = (n - 1) // 1000 * 1000 + 1 end = start + 999 root_count = root_language + '/' + str(start) + '-' + str(end) files = natural_sort(filter (_not_start_with_point, os.listdir(root_count))) return root_count + '/' + files[n - start] def replace_string_and_number(text): """ Replace strings and numbers in a file by special tokens """ # str_replaced = re.sub(_re_string, '__str__', text) # str_num_replaced = re.sub(_re_number, '__num__', str_replaced) str_num_replaced = text return str_num_replaced def natural_sort(l): convert = lambda text: int(text) if text.isdigit() else text.lower() alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] return sorted(l, key = alphanum_key) def remove_comment(text): # TODO: remove only inline comments and block comments # TODO: maybe build a list of comment markers pass def purify(text, lang): # TODO: for some language like HTML, remove code other than principal language pass