diff --git a/swh/langdetect/ngram.py b/swh/langdetect/ngram.py
index 9db7303..d9b4a9a 100644
--- a/swh/langdetect/ngram.py
+++ b/swh/langdetect/ngram.py
@@ -1,63 +1,107 @@
 """
 Calculate frequencies for classes
 """
 
-import os,sys,nltk
+import os, sys, nltk
+from pickle import dump, load
 
 from collections import Counter
 from nltk.util import ngrams
 from utils.common import tokenizer, file_to_string
 from utils.training import build_training_set
+from nltk.probability import *
 
 
 class NGramTrain:
 
     def __init__(self, root):
-        self._ngram_list = {}
-        self._prob = {}
         self._root = root
 
     def train(self):
         root_training_set = self._root + '/../training_set'
         root_model = '../../dataset/model'
 
         try:
             if len(os.listdir(root_training_set)) == 0:
                 build_training_set(self._root)
+            try:
+                os.mkdir(root_model)
+            except FileExistsError:
+                pass
         except FileNotFoundError:
             os.mkdir(root_training_set)
             build_training_set(self._root)
 
         for language in os.listdir(root_training_set):
             if not language.startswith('.'):
-                _ngram_list[language] = {}
-                _prob[language] = None
                 root_training_set_language = root_training_set + '/' + language
 
                 for f in os.listdir(root_training_set_language):
+                    print(f)
                     if not f.startswith('.'):
-                        _ngrams_max(language, f)
+                        ngram_lan = NGramTrain.ngrams_max(root_training_set_language + '/' + f)
+                        prob_lan = self._kneser_key_prob(ngram_lan)
+                        #with open(root_model + '/' + language + '.model', 'wb') as f:
+                        #    dump(ngram_lan, f)
+                        with open(root_model + '/' + language + '.model', 'wb') as f:
+                            dump(prob_lan, f)
 
-    def _ngrams_max(self, language, filename, n=3):
-        ngram_lan = self._ngram_list[language]
+    def ngrams_max(filename, n=3):
+        ngram_lan = {}
         tokens = tokenizer(file_to_string(filename))
-        for i in range(1, n):
+        for i in range(n - 1, n):
             ngram_lan[i + 1] = ngram_lan.get(i + 1, []) + \
-                list(ngram(tokens, i + 1,
+                list(ngrams(tokens, i + 1,
                            pad_left = True,
                            pad_right = True,
                            left_pad_symbol = '$BOS$',
                            right_pad_symbol = '$EOS$'))
+        return ngram_lan
 
-    def _kneser_key_prob(self, language):
+    def _kneser_key_prob(self, ngram_lan):
         c = Counter()
-        ngram_lan = self._ngram_list[language]
-        for key in self._ngram_lan.keys():
-            c = c + Counter(self.ngram_list[key])
+        for key in ngram_lan.keys():
+            c = c + Counter(ngram_lan[key])
         freq_dist = nltk.FreqDist(c)
-        self._prob[language] = nltk.KneserNeyProbDist(freq_dist)
+        return nltk.KneserNeyProbDist(freq_dist)
+
+
+class NGramTest:
+
+    def test(filename):
+        NGramTest._guess_file_language(filename)
+
+    def _guess_file_language(filename):
+        root_model_folder = '../../dataset/model'
+        trigrams = NGramTrain.ngrams_max(filename)
+        result = []
+
+        for model in os.listdir(root_model_folder):
+            if not model.startswith('.'):
+                root_model = root_model_folder + '/' + model
+                (language, _) = os.path.splitext(model)
+                result.append((NGramTest._prob(root_model, trigrams[3]), language))
+        print(sorted(result))
+
+    def _prob(model, trigrams):
+        with open(model, 'rb') as f:
+            kneser_ney = load(f)
+        result = 0
+        for trigram in trigrams:
+            prob = kneser_ney.prob(trigram)
+            if prob != 0:
+                if result == 0:
+                    result = prob
+                else:
+                    result = result * prob
+        return result
+
 
 if __name__ == '__main__':
-    if len(sys.argv) != 2:
-        print('Only argument acceptable is a path.')
-    else:
-        model = NGramTrain(sys.argv[1])
+    if len(sys.argv) != 3:
+        print('Only acceptable arguments are an option and a path.')
+    elif sys.argv[1] == '--train':
+        model = NGramTrain(sys.argv[2])
         model.train()
+    elif sys.argv[1] == '--test':
+        NGramTest.test(sys.argv[2])
+    else:
+        print('Wrong arguments, please check your input.')
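For reference, here is a minimal sketch (not part of the patch) of the flow the new code appears to implement: pad and extract trigrams, smooth their counts with Kneser-Ney, pickle the distribution per language, then score an unknown file by multiplying the non-zero trigram probabilities. The token list and variable names below are made up for illustration; the NLTK calls are the ones already used in the diff.

import nltk
from collections import Counter
from nltk.util import ngrams

# Stand-in for tokenizer(file_to_string(path)); any token sequence works here.
tokens = ['def', 'f', '(', ')', ':', 'pass']
trigrams = list(ngrams(tokens, 3,
                       pad_left=True, pad_right=True,
                       left_pad_symbol='$BOS$', right_pad_symbol='$EOS$'))

# Training side: the distribution that NGramTrain pickles into <language>.model.
kneser_ney = nltk.KneserNeyProbDist(nltk.FreqDist(Counter(trigrams)))

# Testing side: NGramTest._prob multiplies the non-zero smoothed
# probabilities of the unknown file's trigrams under each language model.
score = 1.0
for trigram in trigrams:
    p = kneser_ney.prob(trigram)
    if p != 0:
        score *= p

With the new __main__ block, the module would be driven as `python ngram.py --train <dataset_root>` to build the per-language model pickles, and `python ngram.py --test <file>` to print the sorted (probability, language) pairs for a single file.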
diff --git a/swh/langdetect/utils/common.py b/swh/langdetect/utils/common.py
index 37648f9..52d387f 100644
--- a/swh/langdetect/utils/common.py
+++ b/swh/langdetect/utils/common.py
@@ -1,75 +1,75 @@
 """
 Here regroup basic preprocessing methods
 used in learning stage for different
 approaches.
 """
 
 import re, os
 
 _re_string = re.compile(r"""("(\\.|[^"\\])*"|'(\\.|[^'\\])*')""")
 _re_number = re.compile(r'([\d]+)|([\d]+.[\d]+)[^A-Za-z]')
 _re_separator = re.compile(r'(\W)')
 
 _not_start_with_point = lambda x: not x.startswith('.')
 
 
 def tokenizer(text):
     ''' Splits text into tokens '''
     return [word for word in _re_separator.split(text) if word.strip(' \t')]
 
 
 def file_to_string(filename):
     """ Read a file to a string. """
-    with open(filename, 'r') as f:
-        data = f.read().replace('\n',' ').lower
-    return data
+    with open(filename, 'r', errors='ignore') as f:
+        data = f.read().replace('\n',' ').lower()
+    return replace_string_and_number(data)
 
 
 def count_files(root_language):
     all_folders = natural_sort(filter
                                (_not_start_with_point,
                                 os.listdir(root_language)))
     files = natural_sort(filter
                          (_not_start_with_point,
                           os.listdir(root_language + '/' + all_folders[-1])))
     (max,_) = os.path.splitext(files[-1])
     return int(max)
 
 
 def find_file(root_language, n):
     '''Find the n-th file in language folder'''
     if n > count_files(root_language):
         return ''
     else:
         start = (n - 1) // 1000 * 1000 + 1
         end = start + 999
         root_count = root_language + '/' + str(start) + '-' + str(end)
         files = natural_sort(filter
                              (_not_start_with_point,
                               os.listdir(root_count)))
         return root_count + '/' + files[n - start]
 
 
 def replace_string_and_number(text):
     """ Replace strings and numbers in a file by special tokens """
     str_replaced = re.sub(_re_string, '__str__', text)
     str_num_replaced = re.sub(_re_number, '__num__', str_replaced)
     return str_num_replaced
 
 
 def natural_sort(l):
     convert = lambda text: int(text) if text.isdigit() else text.lower()
     alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ]
     return sorted(l, key = alphanum_key)
 
 
 def remove_comment(text):
     # TODO: remove only inline comments and block comments
     # TODO: maybe build a list of comment markers
     pass
 
 
 def purify(text, lang):
     # TODO: for some language like HTML, remove code other than principal language
     pass
diff --git a/swh/langdetect/utils/training.py b/swh/langdetect/utils/training.py
index 07872ab..a9d42df 100644
--- a/swh/langdetect/utils/training.py
+++ b/swh/langdetect/utils/training.py
@@ -1,35 +1,35 @@
 import os,random
 
 from utils.common import count_files, find_file
 
 def build_training_set(root):
     root_code = root + '/../code_by_language'
     root_training = root + '/../training_set'
 
     for language in os.listdir(root_code):
         if not language.startswith('.'):
             root_language = root_code + '/' + language
             root_training_language = root_training + '/' + language
             build_language_training_set(count_files(root_language),
                                         root_language,
                                         root_training_language)
 
 def build_language_training_set(total, root_language, root_training_language):
     # limit defines the size of training set
     # upper defines the maximum size
     try:
         os.mkdir(root_training_language)
     except FileExistsError:
         pass
-    upper = 4000
+    upper = 1000
     if total >= upper:
         limit = upper // 2
     else:
         limit = total // 2
     indices = random.sample(range(1, total + 1), limit)
     files = map(lambda x : find_file(root_language, x), indices)
 
     for src in files:
         basename = os.path.basename(src)
         des = root_training_language + '/' + basename
         os.symlink(src, des)
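One assumption worth spelling out, since both count_files() and find_file() depend on it and build_language_training_set() only changes the sample cap here: the per-language corpus is expected to be bucketed into folders of 1000 files named '<start>-<end>'. A small illustrative calculation, with an arbitrary example index:

# Bucket arithmetic used by find_file(); n = 2742 is only an example.
n = 2742
start = (n - 1) // 1000 * 1000 + 1   # 2001
end = start + 999                    # 3000
# find_file() then looks inside <root_language>/2001-3000/ for the n-th file,
# and build_language_training_set() symlinks a random half of the corpus,
# now capped at upper // 2 = 500 files per language (upper lowered to 1000).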