diff --git a/swh/langdetect/cnn.py b/swh/langdetect/cnn.py
index 9383fa3..b1bca09 100644
--- a/swh/langdetect/cnn.py
+++ b/swh/langdetect/cnn.py
@@ -1,266 +1,240 @@
 import os
 import sys
 import subprocess
 import time
 import random
 import csv
 import numpy as np
-import tensorflow as tf
+import warnings
+with warnings.catch_warnings():
+    warnings.filterwarnings("ignore")
+    import tensorflow as tf
 import json
 import argparse
 
 from ast import literal_eval
-from itertools import islice
 from pickle import dump
 from pickle import load
-from collections import Counter
 from numpy import array
 from utils.common import tokenizer
 from utils.common import file_to_string
-from utils.common import find_file
-from utils.common import count_files
-from keras.utils.vis_utils import plot_model
 from keras.preprocessing.sequence import pad_sequences
 from keras.models import Model
 from keras.models import Sequential
+from keras.models import load_model
 from keras.layers import Input
 from keras.layers import Dense
 from keras.layers import Flatten
 from keras.layers import Dropout
 from keras.layers import ThresholdedReLU
 from keras.layers import Activation
 from keras.layers import Lambda
 from keras.layers import Embedding
 from keras.layers.convolutional import Convolution1D
 from keras.layers.convolutional import MaxPooling1D
 from keras.layers.normalization import BatchNormalization
 from keras.layers.merge import concatenate
 from keras.utils import np_utils
 from keras.optimizers import SGD
 
 def main():
     parser = argparse.ArgumentParser(description='Training and test tool of charactor-level ConvNet text categorisation.')
     group = parser.add_mutually_exclusive_group()
     group.add_argument('-t', '--train'    , action='store_true', help='train on the dataset.')
     group.add_argument('-T', '--test'     , action='store_true', help='test on the dataset.')
     group.add_argument('-b', '--benchmark', action='store_true', help='calculate average processing time on the dataset.')
     parser.add_argument("path", type=str, help="Path of dataset.")
     args = parser.parse_args()
     
     n = CNN(args.path)
     
     if args.train:
         n.train()
     elif args.test:
         n.test()
     elif args.benchmark:
         print('Currently unavailable.')
         # n.speed_benchmark()
     else:
         print('Currently unavailable.')
 
 class CNN:
 
     def __init__(self, path):
         
         self._path = path
 
         # Root of model folder
         self._root_model = os.path.join(os.path.dirname(path), 'model_cnn')
         try:
             os.mkdir(self._root_model)
         except:
             pass
 
         # Path of result
         self._path_result = os.path.join(os.path.dirname(path), 'result_cnn')
-        try:
-            os.mkdir(self._root_model)
-        except:
-            pass
-
+        
         with open('static_data/languages.json', 'r') as f:
             self._languages = json.load(f)
 
+        self._path_test_csv = path
+
+        self._input_size = 1024
+        self._vocab_size = 256
+        self._num_of_classes = len(self._languages)
+        self._batch_size = 128
+
     def file_len(self, fname):
         with open(fname) as f:
             count = 0
             for l in f:
                 count += 1
             return count
 
     def train(self):
 
-        length = 1024
-        vocab_size = 256
-        total_class = len(self._languages)
-        batch_size = 128
-        sample_size = self.file_len(self._path)
-
-        model = self._get_model(length, vocab_size, total_class)
-        model.fit_generator(self._generator(length, total_class, batch_size), steps_per_epoch=sample_size / batch_size / 100, epochs=2)
+        model = self._get_model()
+        model.fit_generator(self._generator(self._input_size, self._num_of_classes, self._batch_size), steps_per_epoch=self.file_len(self._path) / self._batch_size / 100, epochs=2)
         model.save(os.path.join(self._root_model, 'model.h5'))
 
     def _generator(self, length, total_class, batch_size=128):
         counter = 0
         while True:
             with open(self._path, newline='') as csvfile:
                 r = csv.reader(csvfile, delimiter=' ', quotechar='|')
                 for pair in r:
                     if counter == 0:
                         X = np.empty((0, length))
                         Y = np.empty((0, total_class))
                     label, string = pair
                     label = int(label)
                     string = literal_eval(string)
                     tokens = [x + 1 for x in tokenizer(string, 'letter')]
                     X = np.append(X, pad_sequences([tokens], maxlen=length), axis=0)
                     label = array(np_utils.to_categorical([label], total_class))
                     Y = np.append(Y, label, axis=0)
                     
                     counter += 1
                     if counter == batch_size:
                         counter = 0
                         yield(X,Y)
                         
-    def _get_model(self, length, vocab_size, total_class):
+    def _get_model(self):
 
-        input_size = length
-        alphabet_size = vocab_size
+        input_size = self._input_size
+        alphabet_size = self._vocab_size
         embedding_size = 256
         conv_layers = [(256,7,3), (256,7,3), (256,3,-1), (256,3,-1), (256,3,-1), (256,3,3)]
         threshold = 1e-6
         fully_connected_layers = [1024, 1024]
         dropout_p = 0.2
         optimizer = 'adam'
         loss = 'categorical_crossentropy'
-        num_of_classes = total_class
+        num_of_classes = self._num_of_classes
         
         # Input layer
         inputs = Input(shape=(input_size,), name='sent_input', dtype='int64')
         # Embedding layers
         x = Embedding(alphabet_size + 1, embedding_size, input_length=input_size)(inputs)
         # Convolution layers
         for cl in conv_layers:
             x = Convolution1D(cl[0], cl[1])(x)
             x = ThresholdedReLU(threshold)(x)
             if cl[2] != -1:
                 x = MaxPooling1D(cl[2])(x)
         x = Flatten()(x)
         # Fully connected layers
         for fl in fully_connected_layers:
             x = Dense(fl)(x)
             x = ThresholdedReLU(threshold)(x)
             x = Dropout(dropout_p)(x)
         # Output layer
         predictions = Dense(num_of_classes, activation='softmax')(x)
         # Build and compile model
         model = Model(inputs=inputs, outputs=predictions)
         model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
         
         print(model.summary())
         
         return model
 
     def _max_len(self, texts):
         return max([len(text) for text in texts])
-                
-    def _vocabulary_size(self, texts):
-        vocabulary = dict(Counter([token for text in texts for token in text]))
-        return len(vocabulary.keys())
 
     def test(self):
+        csv.field_size_limit(sys.maxsize)
         try:
             r = open(self._path_result, 'rb')
             test_result = load(r)
             r.close()
         except FileNotFoundError:
             test_result = {}
-        models = self._load_models()
+            
+        model = self._load_model()
         
-        for language in [x for x in os.listdir(self._root_model) if not x.startswith('.') and x not in test_result.keys()]:
-            test_result[language] = self.test_class(models, language)
+        for language in [x for x in self._languages if x not in test_result.keys()]:
+            test_result[language] = self.test_class(model, language)
             with open(self._path_result, 'wb') as f:
                 dump(test_result, f)
             
-    def _load_models(self):
-        models = {}
-        
-        for model in [model
-                      for model in os.listdir(self._root_model)
-                      if not model.startswith('.')]:
-            root_model = os.path.join(self._root_model, model)
-            models[model] = kenlm.LanguageModel(root_model)
-        return models
+    def _load_model(self):
+        model = load_model(os.path.join(self._root_model, 'model.h5'))
 
-    def _get_test_set(self, language):
-        root_training_language = os.path.join(self._root_training_set, language)
-        root_language = os.path.join(self._root_language_dataset, language)
-        total = count_files(root_language)
-        training_set = [int(os.path.splitext(x)[0]) for x in os.listdir(root_training_language) if not x.startswith('.')]
-        it = (find_file(root_language, x) for x in range(1, total + 1) if x not in training_set and os.path.getsize(find_file(root_language, x)) <= 1048576)
-        test_set = list(islice(it, 1000))
-        if len(test_set) == 0:
-            it = (find_file(root_language, x) for x in range(1, total + 1) if x not in training_set)
-            test_set = list(islice(it, 1000))
-        return test_set
+        return model
     
     def _count_size(self, files):
         size = 0
         for f in files:
             size += os.path.getsize(f)
         return size
     
-    def test_class(self, models, language):
-        test_set = self._get_test_set(language)
-
+    def test_class(self, model, language):
         ok = 0
         results = []
         count = 0
-        length = len(test_set)
-        for test in test_set:
-            result = self._guess_file_language(models, test)
-            count += 1
-            print('[{0:4d}/{1:4d}] {2}:{3}       '.format(count, length, result[0][1], result[0][0]),end='\r')
-            results.append(result[0])
-            if result[0][1] == language:
-                ok += 1
+        total_test = self.file_len(os.path.join(self._path_test_csv, language + '.csv'))
+                          
+        with open(os.path.join(self._path_test_csv, language + '.csv'), newline='') as csvfile:
+            r = csv.reader(csvfile, delimiter=' ', quotechar='|')
+            for pair in r:
+                label, string = pair
+                label = int(label)
+                string = literal_eval(string)
+                tokens = [x + 1 for x in tokenizer(string, 'letter')]
+                result = self._guess_file_language(model, tokens)
+                count += 1
+                print('[{0:4d}/{1:4d}] {2}:{3}       '.format(count, total_test, result[0][1], result[0][0]),end='\r')
+                results.append(result[0])
+                if result[0][1] == language:
+                    ok += 1
 
-        total_test = len(test_set)
-        accuracy = ok / len(test_set)
+        accuracy = ok / total_test if total_test else 0.0
         print('Tests for {}                   '.format(language))
         print('Total test files           : {}'.format(total_test))
         print('Correctly classified files : {}'.format(ok))
         print('Accuracy                   : {}%'.format(accuracy * 100))
-        return (ok, len(test_set), accuracy, results)
+        return (ok, total_test, accuracy, results)
 
     def speed_benchmark(self):
-        language = [x for x in os.listdir(self._root_model) if not x.startswith('.')][10]
-        models = self._load_models()
+        language = self._languages[10]
+        model = self._load_model()
 
-        test_set = self._get_test_set(language)
+        test_set = [os.path.join(self._path_test_csv, language + '.csv')]
         total_size = self._count_size(test_set)
         print('{} kB in total'.format(total_size / 1024))
         
         t_start = time.perf_counter()
-        self.test_class(models, language)
+        self.test_class(model, language)
         t_end = time.perf_counter()
         
         print('{} seconds.'.format(t_end - t_start))
-        print('{} seconds per kB'.format(((t_end - t_start) / total_size) * 1024))
+        print('{} seconds per KiB'.format(((t_end - t_start) / total_size) * 1024))
 
-    def _guess_file_language(self, models, filename):
-        tokens = tokenizer(file_to_string(filename), 'letter')
-        text = ' '.join(chr(token) for token in tokens)
-
-        result = []
-        
-        for model_key in models.keys():
-            root_model = os.path.join(self._root_model, model_key)
-            model = models[model_key]
-            score = model.score(text)
-            result.append((score, model_key))
+    def _guess_file_language(self, model, tokens):
+        X = pad_sequences([tokens], maxlen=self._input_size)
+        result = list(model.predict(X))[0]
+        result = [(s, self._languages[i]) for i, s in enumerate(result)]
         return sorted(result, reverse=True)
 
 if __name__ == '__main__':
     main()
diff --git a/swh/langdetect/utils/common.py b/swh/langdetect/utils/common.py
index 1a9b5ad..65864f1 100644
--- a/swh/langdetect/utils/common.py
+++ b/swh/langdetect/utils/common.py
@@ -1,92 +1,79 @@
 """
 Here regroup basic preprocessing methods
 used in learning stage for different 
 approaches.
 
 """
 
 import re, os
 
 _re_string = re.compile(r"""("(\\.|[^"\\])*"|'(\\.|[^'\\])*')""")
 _re_number = re.compile(r'([\d]+)|([\d]+.[\d]+)[^A-Za-z]')
 _re_separator = re.compile(r'(\W)')
 _not_start_with_point = lambda x: not x.startswith('.')
 
-def train_files_with_label(languages, root_training_set, root_csv_training_set):
-    with open(root_csv_training_set, 'w', newline='') as csvfile:
-        setwriter = csv.writer(csvfile, delimiter=' ',
-                               quotechar='|', quoting=csv.QUOTE_MINIMAL)
-        for language in languages:
-            root_training_set_language = os.path.join(root_training_set, language)
-            index_lang = languages.index(language)
-            print(language)
-            for f in [x for x in os.listdir(root_training_set_language) if not x.startswith('.')]:
-                filename = os.path.join(root_training_set_language, f)
-                tokens = file_to_string(filename)[-maxsize:] # 10240
-                setwriter.writerow([index_lang, tokens])
-
 def tokenizer(text, re_name):
     ''' Splits text into tokens '''
     if re_name == 'letter':
         return list(text)
     elif re_name == 'word':
         return [word for word in _re_separator.split(text) if word.strip(' \t')]
 
 def file_to_string(filename):
     """ Read a file to a string. """
     with open(filename, 'rb') as f:
         data = f.read()
     return replace_string_and_number(data)
 
 def count_files(root_language):    
     all_folders = natural_sort(filter
                                (_not_start_with_point,
                                 os.listdir(root_language)))
     files = natural_sort(filter
                          (_not_start_with_point,
                           os.listdir(root_language + '/' + all_folders[-1])))
     (max,_) = os.path.splitext(files[-1])
     return int(max)
 
 def find_file(root_language, n):
     '''Find the n-th file in language folder'''
     if n > count_files(root_language):
         return ''
     else:
         start = (n - 1) // 1000 * 1000 + 1
         end = start + 999
         root_count = root_language + '/' + str(start) + '-' + str(end)
         files = natural_sort(filter
                              (_not_start_with_point,
                               os.listdir(root_count)))
         return root_count + '/' + files[n - start]
 
 def replace_string_and_number(text):
     """ Replace strings and numbers in a file by special tokens 
     """
     # str_replaced = re.sub(_re_string, '__str__', text)
     # str_num_replaced = re.sub(_re_number, '__num__', str_replaced)
     str_num_replaced = text
     return str_num_replaced
 
 def natural_sort(l): 
     convert = lambda text: int(text) if text.isdigit() else text.lower() 
     alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] 
     return sorted(l, key = alphanum_key)
 
 def remove_comment(text):
     # TODO: remove only inline comments and block comments
     # TODO: maybe build a list of comment markers
     pass
 
 def purify(text, lang):
     # TODO: for some language like HTML, remove code other than principal language
     pass
 
 
 
     
     
     
     
 
diff --git a/swh/langdetect/utils/training.py b/swh/langdetect/utils/training.py
index a9d42df..d0ca69b 100644
--- a/swh/langdetect/utils/training.py
+++ b/swh/langdetect/utils/training.py
@@ -1,35 +1,105 @@
-import os,random
-
-from utils.common import count_files, find_file
-
-def build_training_set(root):
-    root_code = root + '/../code_by_language'
-    root_training = root + '/../training_set'
-    for language in os.listdir(root_code):
-        if not language.startswith('.'):
-            root_language = root_code + '/' + language
-            root_training_language = root_training + '/' + language
-            build_language_training_set(count_files(root_language),
-                                        root_language,
-                                        root_training_language)
-
-def build_language_training_set(total, root_language, root_training_language):
-    # limit defines the size of training set
-    # upper defines the maximum size
-
-    try:
-        os.mkdir(root_training_language)
-    except FileExistsError:
-        pass
-
-    upper = 1000
-    if total >= upper:
-        limit = upper // 2
-    else:
-        limit = total // 2
-    indices = random.sample(range(1, total + 1), limit)
-    files = map(lambda x : find_file(root_language, x), indices)
-    for src in files:
-        basename = os.path.basename(src)
-        des = root_training_language + '/' + basename
-        os.symlink(src, des)
+import os
+import random
+import csv
+
+from utils.common import count_files, find_file, file_to_string
+from itertools import islice
+
+
+class Dataset:
+
+    def __init__(self, root):
+        self.root_code = os.path.join(root, '..', 'code_by_language')
+        self.root_training = os.path.join(root, '..', 'training_set')
+        self.root_training_csv = os.path.join(root, '..', 'training_set_csv')
+        self.root_test = os.path.join(root, '..', 'test_set')
+        self.root_test_csv = os.path.join(root, '..', 'test_set_csv')
+        try:
+            os.mkdir(self.root_training)
+        except FileExistsError:
+            pass
+        try:
+            os.mkdir(self.root_training_csv)
+        except FileExistsError:
+            pass
+        try:
+            os.mkdir(self.root_test)
+        except FileExistsError:
+            pass
+        try:
+            os.mkdir(self.root_test_csv)
+        except FileExistsError:
+            pass
+    
+    def build_training_set(self, languages):
+        for language in languages:
+        # limit defines the size of training set
+        # upper defines the maximum size
+            root_code_language = os.path.join(self.root_code, language)
+            root_training_language = os.path.join(self.root_training, language)
+            total = count_files(root_code_language)
+            try:
+                os.mkdir(root_training_language)
+            except FileExistsError:
+                pass
+
+            upper = 1000
+            if total >= upper:
+                limit = upper // 2
+            else:
+                limit = total // 2
+            
+            indices = random.sample(range(1, total + 1), limit)
+            files = map(lambda x : find_file(root_code_language, x), indices)
+            for src in files:
+                basename = os.path.basename(src)
+                des = os.path.join(root_training_language, basename)
+                os.symlink(src, des)
+
+    def build_test_set(self, languages):
+        for language in languages:
+            root_language = os.path.join(self.root_code, language)
+            root_test_language = os.path.join(self.root_test, language)
+            try:
+                os.mkdir(root_test_language)
+            except FileExistsError:
+                pass
+
+            files = self.get_test_set(language)
+            for src in files:
+                des = os.path.join(root_test_language, os.path.basename(src))
+                os.symlink(src, des)
+
+    def train_files_with_label(self, languages, maxsize):
+        with open(os.path.join(self.root_training_csv, 'training_set.csv'), 'w', newline='') as csvfile:
+            setwriter = csv.writer(csvfile, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)
+            for language in languages:
+                root_training_language = os.path.join(self.root_training, language)
+                index_lang = languages.index(language)
+                for f in [x for x in os.listdir(root_training_language) if not x.startswith('.')]:
+                    filename = os.path.join(root_training_language, f)
+                    tokens = file_to_string(filename)[-maxsize:] # 10240
+                    setwriter.writerow([index_lang, tokens])
+
+    def get_test_set(self, language):
+        root_training_language = os.path.join(self.root_training, language)
+        root_language = os.path.join(self.root_code, language)
+        total = count_files(root_language)
+        training_set = {int(os.path.splitext(x)[0]) for x in os.listdir(root_training_language) if not x.startswith('.')}
+        it = (find_file(root_language, x) for x in range(1, total + 1) if x not in training_set and os.path.getsize(find_file(root_language, x)) <= 1048576)
+        test_set = list(islice(it, 1000))
+        if len(test_set) == 0:
+            it = (find_file(root_language, x) for x in range(1, total + 1) if x not in training_set)
+            test_set = list(islice(it, 1000))
+        return test_set
+
+    def test_files_with_label(self, languages):
+        for language in languages:
+            root_test_language = os.path.join(self.root_test, language)
+            index_lang = languages.index(language)
+            with open(os.path.join(self.root_test_csv, language + '.csv'), 'w', newline='') as csvfile:
+                setwriter = csv.writer(csvfile, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)
+                for f in [x for x in os.listdir(root_test_language) if not x.startswith('.')]:
+                    filename = os.path.join(root_test_language, f)
+                    tokens = file_to_string(filename)
+                    setwriter.writerow([index_lang, tokens])