diff --git a/requirements.txt b/requirements.txt
index cdf0b9a..3f3776f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,9 @@
 # Add here external Python modules dependencies, one per line. Module names
 # should match https://pypi.python.org/pypi names. For the full spec or
 # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html
 vcversioner
 pygithub
-pyyaml
\ No newline at end of file
+pyyaml
+tensorflow
+keras
+h5py
\ No newline at end of file
diff --git a/swh/langdetect/cnn.py b/swh/langdetect/cnn.py
index 8dbafb4..9383fa3 100644
--- a/swh/langdetect/cnn.py
+++ b/swh/langdetect/cnn.py
@@ -1,328 +1,266 @@
-
-import os, sys, subprocess, time, random
+import os
+import sys
+import subprocess
+import time
+import random
+import csv
 import numpy as np
 import tensorflow as tf
+import json
+import argparse
+from ast import literal_eval
 from itertools import islice
-from pickle import dump, load
+from pickle import dump
+from pickle import load
 from collections import Counter
 from numpy import array
-from utils.common import tokenizer, file_to_string, find_file, count_files
+from utils.common import tokenizer
+from utils.common import file_to_string
+from utils.common import find_file
+from utils.common import count_files
 from keras.utils.vis_utils import plot_model
 from keras.preprocessing.sequence import pad_sequences
-from keras.models import Model, Sequential
-from keras.layers import Input, Dense, Flatten, Dropout, Embedding, ThresholdedReLU, Activation, Lambda
-from keras.layers.convolutional import Convolution1D, MaxPooling1D
+from keras.models import Model
+from keras.models import Sequential
+from keras.layers import Input
+from keras.layers import Dense
+from keras.layers import Flatten
+from keras.layers import Dropout
+from keras.layers import ThresholdedReLU
+from keras.layers import Activation
+from keras.layers import Lambda
+from keras.layers import Embedding
+from keras.layers.convolutional import Convolution1D
+from keras.layers.convolutional import MaxPooling1D
 from keras.layers.normalization import BatchNormalization
 from keras.layers.merge import concatenate
 from keras.utils import np_utils
 from keras.optimizers import SGD
 
+def main():
+    parser = argparse.ArgumentParser(description='Training and testing tool for character-level ConvNet text categorisation.')
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument('-t', '--train', action='store_true', help='train on the dataset.')
+    group.add_argument('-T', '--test', action='store_true', help='test on the dataset.')
+    group.add_argument('-b', '--benchmark', action='store_true', help='calculate average processing time on the dataset.')
+    parser.add_argument("path", type=str, help="Path of dataset.")
+    args = parser.parse_args()
+
+    n = CNN(args.path)
+
+    if args.train:
+        n.train()
+    elif args.test:
+        n.test()
+    elif args.benchmark:
+        print('Currently unavailable.')
+        # n.speed_benchmark()
+    else:
+        print('Currently unavailable.')
 
 class CNN:
 
-    def __init__(self, root):
-        # Root of dataset
-        self._root = root
-
-        # Root of training set
-        self._root_training_set = os.path.join(self._root, '..', 'training_set')
+    def __init__(self, path):
+
+        self._path = path
 
         # Root of model folder
-        self._root_model = os.path.join(self._root, '..', 'model_cnn')
-
-        # Root of arranged dataset
-        self._root_language_dataset = os.path.join(self._root, '..', 'code_by_language')
+        self._root_model = os.path.join(os.path.dirname(path), 'model_cnn')
+        try:
+            os.mkdir(self._root_model)
+        except FileExistsError:
+            pass
 
         # Path of result
-        self._path_result = os.path.join(self._root, '..', 'result_cnn')
-
-    def train(self):
+        self._path_result = os.path.join(os.path.dirname(path), 'result_cnn')
         try:
-            if len(os.listdir(self._root_training_set)) == 0:
-                build_training_set(self._root)
-            try:
-                os.mkdir(self._root_model)
-            except FileExistsError:
-                pass
-        except FileNotFoundError:
-            os.mkdir(self._root_training_set)
-            build_training_set(self._root)
+            os.mkdir(self._root_model)
+        except FileExistsError:
+            pass
 
-        languages = [x for x in os.listdir(self._root_training_set) if not x.startswith('.')]
-        try:
-            f = open(os.path.join(self._root, '..', 'model_cnn', 'texts+labels'), 'rb')
-            train_file_with_label = load(f)
-        except FileNotFoundError:
-            train_file_with_label = self._train_file_with_label(languages)
-            with open(os.path.join(self._root, '..', 'model_cnn', 'texts+labels'), 'wb') as f:
-                dump(train_file_with_label, f)
+        with open(os.path.join(os.path.dirname(__file__), 'static_data', 'languages.json'), 'r') as f:
+            self._languages = json.load(f)
+
+    def file_len(self, fname):
+        # Count the number of samples (lines) in the CSV dataset.
+        with open(fname) as f:
+            count = 0
+            for l in f:
+                count += 1
+        return count
+
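+    # Train the character-level ConvNet on the CSV dataset at self._path:
+    # 1024-byte sequences, a 256-symbol byte vocabulary and batches of 128
+    # samples streamed by _generator().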
+    def train(self):
 
         length = 1024
         vocab_size = 256
-        total_class = len(languages)
+        total_class = len(self._languages)
         batch_size = 128
+        sample_size = self.file_len(self._path)
 
         model = self._get_model(length, vocab_size, total_class)
-        model.fit_generator(self._generator(length, total_class, batch_size), steps_per_epoch=len(train_file_with_label)/batch_size, epochs=100)
-        model.save(os.path.join(self._root, '..', 'model_cnn', 'model.h5'))
+        model.fit_generator(self._generator(length, total_class, batch_size), steps_per_epoch=sample_size / batch_size / 100, epochs=2)
+        model.save(os.path.join(self._root_model, 'model.h5'))
 
+    # Streams the CSV ("<label> <bytes literal>" rows) and yields (X, Y)
+    # batches: X holds zero-padded byte sequences of length `length`,
+    # Y the one-hot encoded language labels.
     def _generator(self, length, total_class, batch_size=128):
         counter = 0
         while True:
-            with open(os.path.join(self._root, '..', 'model_cnn', 'texts+labels'), 'rb') as f:
-                train_file_with_label = load(f)
-            train_file_with_label = [(a,b) for (a,b) in train_file_with_label if b < total_class ]
-            random.shuffle(train_file_with_label)
-            for pair in train_file_with_label:
+            with open(self._path, newline='') as csvfile:
+                r = csv.reader(csvfile, delimiter=' ', quotechar='|')
+                for pair in r:
                     if counter == 0:
                         X = np.empty((0, length))
                         Y = np.empty((0, total_class))
-                path, label = pair
-                tokens = [x + 1 for x in tokenizer(file_to_string(path), 'letter')]
+                    label, string = pair
+                    label = int(label)
+                    string = literal_eval(string)
+                    tokens = [x + 1 for x in tokenizer(string, 'letter')]
                     X = np.append(X, pad_sequences([tokens], maxlen=length), axis=0)
                     label = array(np_utils.to_categorical([label], total_class))
                     Y = np.append(Y, label, axis=0)
                     counter += 1
                     if counter == batch_size:
                         counter = 0
                         yield(X,Y)
-
-    def _train_file_with_label(self, languages):
-        l = []
-
-        for language in languages:
-            root_training_set_language = os.path.join(self._root_training_set, language)
-            root_stat_language = os.path.join(self._root_model, language)
-            index_lang = languages.index(language)
-            if os.path.isfile(root_stat_language):
-                continue
-            print(language)
-            for f in [x for x in os.listdir(root_training_set_language) if not x.startswith('.')]:
-                filename = os.path.join(root_training_set_language, f)
-                l.append((filename, index_lang))
-
-        return l
-
-    '''
-    def _get_model(self, length, vocab_size, total_class):
-        num_filters = [64, 128, 256, 512]
-        num_classes = total_class
-        sequence_max_length = length
-        num_quantized_chars = vocab_size + 1
-        embedding_size = 16
-        learning_rate = 0.001
-        top_k = 3
-
-        class ConvBlockLayer(object):
-            """
-            two layer ConvNet. Apply batch_norm and relu after each layer
-            """
-
-            def __init__(self, input_shape, num_filters):
-                self.model = Sequential()
-                # first conv layer
-                self.model.add(Convolution1D(filters=num_filters, kernel_size=3, strides=1, padding="same", input_shape=input_shape))
-                self.model.add(BatchNormalization())
-                self.model.add(Activation('relu'))
-
-                # second conv layer
-                self.model.add(Convolution1D(filters=num_filters, kernel_size=3, strides=1, padding="same"))
-                self.model.add(BatchNormalization())
-                self.model.add(Activation('relu'))
-
-            def __call__(self, inputs):
-                return self.model(inputs)
-
-        def get_conv_shape(conv):
-            return conv.get_shape().as_list()[1:]
-
-        inputs = Input(shape=(sequence_max_length, ), dtype='int32', name='inputs')
-
-        embedded_sent = Embedding(num_quantized_chars, embedding_size, input_length=sequence_max_length)(inputs)
-
-        # First conv layer
-        conv = Convolution1D(filters=64, kernel_size=3, strides=2, padding="same")(embedded_sent)
-
-        # Each ConvBlock with one MaxPooling Layer
-        for i in range(len(num_filters)):
-            conv = ConvBlockLayer(get_conv_shape(conv), num_filters[i])(conv)
-            conv = MaxPooling1D(pool_size=3, strides=2, padding="same")(conv)
-
-        # k-max pooling (Finds values and indices of the k largest entries for the last dimension)
-        def _top_k(x):
-            x = tf.transpose(x, [0, 2, 1])
-            k_max = tf.nn.top_k(x, k=top_k)
-            return tf.reshape(k_max[0], (-1, num_filters[-1] * top_k))
-        k_max = Lambda(_top_k, output_shape=(num_filters[-1] * top_k,))(conv)
-
-        # 3 fully-connected layer with dropout regularization
-        fc1 = Dropout(0.2)(Dense(128, activation='relu', kernel_initializer='he_normal')(k_max))
-        fc2 = Dropout(0.2)(Dense(128, activation='relu', kernel_initializer='he_normal')(fc1))
-        fc3 = Dense(num_classes, activation='softmax')(fc2)
-
-        # define optimizer
-        sgd = SGD(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=False)
-        model = Model(inputs=inputs, outputs=fc3)
-        model.compile(optimizer=sgd, loss='mean_squared_error', metrics=['accuracy'])
-
-        print(model.summary())
-        return model
-'''
-
-
+
+    # Character-level ConvNet: a byte embedding, six Convolution1D/ThresholdedReLU
+    # blocks (max-pooling after the first, second and last), two dense layers
+    # with dropout, and a softmax over the language classes.
     def _get_model(self, length, vocab_size, total_class):
 
         input_size = length
         alphabet_size = vocab_size
         embedding_size = 256
         conv_layers = [(256,7,3), (256,7,3), (256,3,-1), (256,3,-1), (256,3,-1), (256,3,3)]
         threshold = 1e-6
         fully_connected_layers = [1024, 1024]
         dropout_p = 0.2
         optimizer = 'adam'
         loss = 'categorical_crossentropy'
         num_of_classes = total_class
 
         # Input layer
         inputs = Input(shape=(input_size,), name='sent_input', dtype='int64')
 
         # Embedding layers
         x = Embedding(alphabet_size + 1, embedding_size, input_length=input_size)(inputs)
 
         # Convolution layers
         for cl in conv_layers:
             x = Convolution1D(cl[0], cl[1])(x)
             x = ThresholdedReLU(threshold)(x)
             if cl[2] != -1:
                 x = MaxPooling1D(cl[2])(x)
         x = Flatten()(x)
 
         # Fully connected layers
         for fl in fully_connected_layers:
             x = Dense(fl)(x)
             x = ThresholdedReLU(threshold)(x)
             x = Dropout(dropout_p)(x)
 
         # Output layer
         predictions = Dense(num_of_classes, activation='softmax')(x)
 
         # Build and compile model
         model = Model(inputs=inputs, outputs=predictions)
         model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
 
         print(model.summary())
         return model
 
     def _max_len(self, texts):
         return max([len(text) for text in texts])
 
     def _vocabulary_size(self, texts):
         vocabulary = dict(Counter([token for text in texts for token in text]))
         return len(vocabulary.keys())
 
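+    # Note: test() and the helpers below are carried over from the kenlm-based
+    # prototype; they still reference attributes (self._root_training_set,
+    # self._root_language_dataset) that this class no longer defines.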
     def test(self):
         try:
             r = open(self._path_result, 'rb')
             test_result = load(r)
             r.close()
         except FileNotFoundError:
             test_result = {}
         models = self._load_models()
         for language in [x for x in os.listdir(self._root_model) if not x.startswith('.') and x not in test_result.keys()]:
             test_result[language] = self.test_class(models, language)
             with open(self._path_result, 'wb') as f:
                 dump(test_result, f)
 
     def _load_models(self):
         models = {}
         for model in [model for model in os.listdir(self._root_model) if not model.startswith('.')]:
             root_model = os.path.join(self._root_model, model)
             models[model] = kenlm.LanguageModel(root_model)
         return models
 
     def _get_test_set(self, language):
         root_training_language = os.path.join(self._root_training_set, language)
         root_language = os.path.join(self._root_language_dataset, language)
         total = count_files(root_language)
         training_set = [int(os.path.splitext(x)[0]) for x in os.listdir(root_training_language) if not x.startswith('.')]
         it = (find_file(root_language, x) for x in range(1, total + 1) if x not in training_set and os.path.getsize(find_file(root_language, x)) <= 1048576)
         test_set = list(islice(it, 1000))
         if len(test_set) == 0:
             it = (find_file(root_language, x) for x in range(1, total + 1) if x not in training_set)
             test_set = list(islice(it, 1000))
         return test_set
 
     def _count_size(self, files):
         size = 0
         for f in files:
             size += os.path.getsize(f)
         return size
 
     def test_class(self, models, language):
         test_set = self._get_test_set(language)
         ok = 0
         results = []
         count = 0
         length = len(test_set)
         for test in test_set:
             result = self._guess_file_language(models, test)
             count += 1
             print('[{0:4d}/{1:4d}]  {2}:{3} '.format(count, length, result[0][1], result[0][0]),end='\r')
             results.append(result[0])
             if result[0][1] == language:
                 ok += 1
 
         total_test = len(test_set)
         accuracy = ok / len(test_set)
         print('Tests for {} '.format(language))
         print('Total test files : {}'.format(total_test))
         print('Correctly classified files : {}'.format(ok))
         print('Accuracy : {}%'.format(accuracy * 100))
         return (ok, len(test_set), accuracy, results)
 
     def speed_benchmark(self):
         language = [x for x in os.listdir(self._root_model) if not x.startswith('.')][10]
         models = self._load_models()
 
         test_set = self._get_test_set(language)
         total_size = self._count_size(test_set)
         print('{} kB in total'.format(total_size / 1024))
 
         t_start = time.perf_counter()
         self.test_class(models, language)
         t_end = time.perf_counter()
 
         print('{} seconds.'.format(t_end - t_start))
         print('{} seconds per kB'.format(((t_end - t_start) / total_size) * 1024))
 
     def _guess_file_language(self, models, filename):
         tokens = tokenizer(file_to_string(filename), 'letter')
         text = ' '.join(chr(token) for token in tokens)
 
         result = []
 
         for model_key in models.keys():
             root_model = os.path.join(self._root_model, model_key)
             model = models[model_key]
             score = model.score(text)
             result.append((score, model_key))
 
         return sorted(result, reverse=True)
 
 
 if __name__ == '__main__':
-    if len(sys.argv) == 3 and sys.argv[1] == '--train':
-        n = CNN(sys.argv[2])
-        n.train()
-    elif len(sys.argv) == 3 and sys.argv[1] == '--test':
-        n = NGramProb(sys.argv[2])
-        n.test()
-    elif len(sys.argv) == 3 and sys.argv[1] == '--benchmark':
-        n = NGramProb(sys.argv[2])
-        n.speed_benchmark()
-    elif len(sys.argv) == 4 and sys.argv[1] == '--test':
-        n = NGramProb(sys.argv[2])
-        n.test_class(n.load_models(), sys.argv[3])
-    else:
-        print('Wrong arguments, please check your input.')
+    main()
diff --git a/swh/langdetect/static_data/comments.json b/swh/langdetect/static_data/comments.json
deleted file mode 100644
index e69de29..0000000
diff --git a/swh/langdetect/static_data/languages.json b/swh/langdetect/static_data/languages.json
new file mode 100644
index 0000000..01d803f
--- /dev/null
+++ b/swh/langdetect/static_data/languages.json
@@ -0,0 +1 @@
+["1C Enterprise", "ABAP", "ActionScript", "Ada", "Agda", "AGS Script", "Alloy", "AMPL", "AngelScript", "ANTLR", "Apex", "API Blueprint", "APL", "AppleScript", "Arc", "ASP", "AspectJ", "Assembly", "ATS", "Augeas", "AutoHotkey", "AutoIt", "Awk", "Ballerina", "Batchfile", "Befunge", "BitBake", "BlitzBasic", "BlitzMax", "Bluespec", "Boo", "Brainfuck", "Brightscript", "Bro", "C", "C#", "C++", "Cap'n Proto", "CartoCSS", "Ceylon", "Chapel", "Charity", "ChucK", "Cirru", "Clarion", "Clean", "Click", "CLIPS", "Clojure", "CMake", "COBOL", "CoffeeScript", "ColdFusion", "Common Lisp", "Common Workflow Language", "Component Pascal", "Cool", "Coq", "Crystal", "Csound", "Csound Document", "Csound Score", "CSS", "Cuda", "CWeb", "Cycript", "D", "Dart", "DataWeave", "DIGITAL Command Language", "DM", "Dogescript", "DTrace", "Dylan", "E", "eC", "ECL", "Eiffel", "Elixir", "Elm", "Emacs Lisp", "EmberScript", "EQ", "Erlang", "F#", "Factor", "Fancy", "Fantom", "Filebench WML", "FLUX", "Forth", "Fortran", "FreeMarker", "Frege", "Game Maker Language", "GAMS", "GAP", "GDB", "GDScript", "Genie", "Genshi", "Gherkin", "GLSL", "Glyph", "Gnuplot", "Go", "Golo", "Gosu", "Grace", "Grammatical Framework", "Groovy", "Hack", "Harbour", "Haskell", "Haxe", "HCL", "HLSL", "HTML", "Hy", "HyPhy", "IDL", "Idris", "IGOR Pro", "Inform 7", "Inno Setup", "Io", "Ioke", "Isabelle", "J", "Jasmin", "Java", "JavaScript", "Jolie", "JSONiq", "Julia", "Jupyter Notebook", "Kit", "Kotlin", "KRL", "LabVIEW", "Lasso", "Lean", "Lex", "LFE", "LilyPond", "Limbo", "Liquid", "LiveScript", "LLVM", "Logos", "Logtalk", "LOLCODE", "LookML", "LoomScript", "LSL", "Lua", "M", "M4", "Makefile", "Mako", "Markdown", "Mask", "Mathematica", "Matlab", "Max", "MAXScript", "Mercury", "Meson", "Metal", "Mirah", "Modelica", "Modula-2", "Module Management System", "Monkey", "Moocode", "MoonScript", "MQL4", "MQL5", "MTML", "mupad", "NCL", "Nearley", "Nemerle", "nesC", "NetLinx", "NetLinx+ERB", "NetLogo", "NewLisp", "Nextflow", "Nim", "Nit", "Nix", "NSIS", "Nu", "Objective-C", "Objective-C++", "Objective-J", "OCaml", "Omgrofl", "ooc", "Opa", "Opal", "OpenEdge ABL", "OpenSCAD", "Ox", "Oxygene", "Oz", "P4", "Pan", "Papyrus", "Parrot", "Pascal", "PAWN", "Pep8", "Perl", "Perl 6", "PHP", "PicoLisp", "PigLatin", "Pike", "PLpgSQL", "PLSQL", "PogoScript", "Pony", "PostScript", "POV-Ray SDL", "PowerBuilder", "PowerShell", "Processing", "Prolog", "Propeller Spin", "Puppet", "PureBasic", "PureScript", "Python", "QMake", "QML", "R", "Racket", "Ragel", "RAML", "Rascal", "REALbasic", "Rebol", "Red", "Redcode", "Ren'Py", "RenderScript", "reStructuredText", "REXX", "Ring", "RMarkdown", "RobotFramework", "Roff", "Rouge", "RPC", "Ruby", "Rust", "SaltStack", "SAS", "Scala", "Scheme", "Scilab", "Self", "ShaderLab", "Shell", "ShellSession", "Shen", "Slash", "Smali", "Smalltalk", "Smarty", "SMT", "Solidity", "SourcePawn", "SQF", "SQLPL", "Squirrel", "SRecode Template", "Stan", "Standard ML", "Stata", "SuperCollider", "Swift", "SystemVerilog", "Tcl", "Tea", "Terra", "TeX", "Thrift", "TI Program", "TLA", "Turing", "TXL", "TypeScript", "Uno", "UnrealScript", "UrWeb", "Vala", "VCL", "Verilog", "VHDL", "Vim script", "Visual Basic", "Volt", "Vue", "wdl", "WebAssembly", "WebIDL", "wisp", "X10", "xBase", "XC", "Xojo", "XProc", "XQuery", "XS", "XSLT", "Xtend", "Yacc", "YAML", "Zephir", "Zimpl"]
\ No newline at end of file
diff --git a/swh/langdetect/utils/common.py b/swh/langdetect/utils/common.py
index 65864f1..1a9b5ad 100644
--- a/swh/langdetect/utils/common.py
+++ b/swh/langdetect/utils/common.py
@@ -1,79 +1,92 @@
 """
 Here regroup basic preprocessing methods used in learning stage for different approaches.
 """
 
 import re, os
+import csv
 
 _re_string = re.compile(r"""("(\\.|[^"\\])*"|'(\\.|[^'\\])*')""")
 _re_number = re.compile(r'([\d]+)|([\d]+.[\d]+)[^A-Za-z]')
 _re_separator = re.compile(r'(\W)')
 
 _not_start_with_point = lambda x: not x.startswith('.')
 
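+# Build the training CSV: one row per source file, holding the numeric label
+# (the language's index in `languages`) and the file's byte content, truncated
+# to the last `maxsize` bytes (the trailing comment suggests a 10240-byte cap;
+# `maxsize` is expected to be defined by the importing module).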
+def train_files_with_label(languages, root_training_set, root_csv_training_set):
+    with open(root_csv_training_set, 'w', newline='') as csvfile:
+        setwriter = csv.writer(csvfile, delimiter=' ',
+                               quotechar='|', quoting=csv.QUOTE_MINIMAL)
+        for language in languages:
+            root_training_set_language = os.path.join(root_training_set, language)
+            index_lang = languages.index(language)
+            print(language)
+            for f in [x for x in os.listdir(root_training_set_language) if not x.startswith('.')]:
+                filename = os.path.join(root_training_set_language, f)
+                tokens = file_to_string(filename)[-maxsize:] # 10240
+                setwriter.writerow([index_lang, tokens])
+
 def tokenizer(text, re_name):
     ''' Splits text into tokens '''
     if re_name == 'letter':
         return list(text)
     elif re_name == 'word':
         return [word for word in _re_separator.split(text) if word.strip(' \t')]
 
 def file_to_string(filename):
     """ Read a file to a string. """
     with open(filename, 'rb') as f:
         data = f.read()
     return replace_string_and_number(data)
 
 def count_files(root_language):
     all_folders = natural_sort(filter (_not_start_with_point, os.listdir(root_language)))
     files = natural_sort(filter (_not_start_with_point, os.listdir(root_language + '/' + all_folders[-1])))
     (max,_) = os.path.splitext(files[-1])
     return int(max)
 
 def find_file(root_language, n):
     '''Find the n-th file in language folder'''
     if n > count_files(root_language):
         return ''
     else:
         start = (n - 1) // 1000 * 1000 + 1
         end = start + 999
         root_count = root_language + '/' + str(start) + '-' + str(end)
         files = natural_sort(filter (_not_start_with_point, os.listdir(root_count)))
         return root_count + '/' + files[n - start]
 
 def replace_string_and_number(text):
     """ Replace strings and numbers in a file by special tokens """
     # str_replaced = re.sub(_re_string, '__str__', text)
     # str_num_replaced = re.sub(_re_number, '__num__', str_replaced)
     str_num_replaced = text
     return str_num_replaced
 
 def natural_sort(l):
     convert = lambda text: int(text) if text.isdigit() else text.lower()
     alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ]
     return sorted(l, key = alphanum_key)
 
 def remove_comment(text):
     # TODO: remove only inline comments and block comments
     # TODO: maybe build a list of comment markers
     pass
 
 def purify(text, lang):
     # TODO: for some language like HTML, remove code other than principal language
     pass