diff --git a/scripts/dataset/ground_truth_arrange.py b/scripts/dataset/ground_truth_arrange.py
index 47e2dd3..65b09f0 100644
--- a/scripts/dataset/ground_truth_arrange.py
+++ b/scripts/dataset/ground_truth_arrange.py
@@ -1,107 +1,113 @@
 # Copyright (C) 2015-2016 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 #!/usr/bin/env python3
 # coding: utf-8
 
 import os
 import sys
 import io
 import json
 
 
 def main(root):
     root_ground_truth = root + '/../ground_truth'
     root_ground_truth_text = root_ground_truth + '_text'
     root_code_by_language = root + '/../code_by_language'
     counts = dict()
 
     try:
         os.mkdir(root_code_by_language)
     except FileExistsError:
         pass
 
     for r in os.listdir(root):
         if not r.startswith('.'):
             for d in os.listdir(root + '/' + r):
                 if not d.startswith('.'):
                     try:
                         ground_truth = io.open(root_ground_truth + '/' + r + '/' + d + '.json')
                         try:
                             j = json.load(ground_truth)
                             for language in j.keys():
                                 root_language = root_code_by_language + '/' + language
                                 try:
                                     os.mkdir(root_language)
                                 except FileExistsError:
                                     pass
                                 for f in j.get(language):
                                     counts[language] = counts.get(language, 0) + 1
                                     start = (counts[language] - 1) // 1000 * 1000 + 1
                                     end = start + 999
                                     root_count = root_language + '/' + str(start) + '-' + str(end)
                                     if counts[language] % 1000 == 1:
                                         try:
                                             os.mkdir(root_count)
                                         except FileExistsError:
                                             pass
                                     (_,ext) = os.path.splitext(f)
                                     new_name = str(counts[language]) + ext
                                     copy_src = root + '/' + r + '/' + d + '/' + f
                                     copy_des = root_count + '/' + new_name
-                                    os.symlink(copy_src, copy_des)
-                                    print(copy_src + ' successfully copied.')
+                                    try:
+                                        os.symlink(copy_src, copy_des)
+                                        print('{} successfully copied.'.format(copy_src))
+                                    except FileExistsError:
+                                        pass
                         except json.decoder.JSONDecodeError:
                             ground_truth.close()
                             ground_truth = io.open(root_ground_truth_text + '/' + r + '/' + d, 'r')
                             while(True):
                                 line = ground_truth.readline()
                                 if line == '\n' or line == '':
                                     break
                                 else:
                                     pass
                             while(True):
                                 line = ground_truth.readline()
                                 stripped = line.strip()
                                 if line == '':
                                     break
                                 else:
                                     stripped = line.strip()
                                     language = stripped.replace(':','')
                                     root_language = root_code_by_language + '/' + language
                                     try:
                                         os.mkdir(root_language)
                                     except FileExistsError:
                                         pass
                                     while(True):
                                         line = ground_truth.readline()
                                         if line == '\n':
                                             break
                                         else:
                                             f = line.strip()
                                             counts[language] = counts.get(language, 0) + 1
                                             start = (counts[language] - 1) // 1000 * 1000 + 1
                                             end = start + 999
                                             root_count = root_language + '/' + str(start) + '-' + str(end)
                                             if counts[language] % 1000 == 1:
                                                 try:
                                                     os.mkdir(root_count)
                                                 except FileExistsError:
                                                     pass
                                             (_,ext) = os.path.splitext(f)
                                             new_name = str(counts[language]) + ext
                                             copy_src = root + '/' + r + '/' + d + '/' + f
                                             copy_des = root_count + '/' + new_name
-                                            os.symlink(copy_src, copy_des)
-                                            print('{} successfully copied.'.format(copy_src))
+                                            try:
+                                                os.symlink(copy_src, copy_des)
+                                                print('{} successfully copied.'.format(copy_src))
+                                            except FileExistsError:
+                                                pass
                     finally:
                         ground_truth.close()
 
 
 if __name__ == '__main__':
     if len(sys.argv) != 2:
         print('Only argument acceptable is a path.')
     else:
         main(sys.argv[1])
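
Note on the arrangement scheme above: the k-th file collected for a language
is symlinked into a bucket folder named '<start>-<end>' that holds at most
1000 entries. A minimal standalone sketch of that mapping (bucket_path is a
hypothetical helper for illustration, not part of the patch):

    def bucket_path(root_language, k):
        # k is the 1-based index of the file within its language
        start = (k - 1) // 1000 * 1000 + 1
        end = start + 999
        return '{}/{}-{}'.format(root_language, start, end)

    # bucket_path('code_by_language/Python', 1)    -> '.../Python/1-1000'
    # bucket_path('code_by_language/Python', 1000) -> '.../Python/1-1000'
    # bucket_path('code_by_language/Python', 1001) -> '.../Python/1001-2000'
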
""" + diff --git a/swh/langdetect/ngram.py b/swh/langdetect/ngram.py new file mode 100644 index 0000000..9db7303 --- /dev/null +++ b/swh/langdetect/ngram.py @@ -0,0 +1,63 @@ +""" +Calculate frequencies for classes +""" + +import os,sys,nltk + +from collections import Counter +from nltk.util import ngrams +from utils.common import tokenizer, file_to_string +from utils.training import build_training_set + +class NGramTrain: + + def __init__(self, root): + self._ngram_list = {} + self._prob = {} + self._root = root + + def train(self): + root_training_set = self._root + '/../training_set' + root_model = '../../dataset/model' + + try: + if len(os.listdir(root_training_set)) == 0: + build_training_set(self._root) + except FileNotFoundError: + os.mkdir(root_training_set) + build_training_set(self._root) + + for language in os.listdir(root_training_set): + if not language.startswith('.'): + _ngram_list[language] = {} + _prob[language] = None + root_training_set_language = root_training_set + '/' + language + for f in os.listdir(root_training_set_language): + if not f.startswith('.'): + _ngrams_max(language, f) + + def _ngrams_max(self, language, filename, n=3): + ngram_lan = self._ngram_list[language] + tokens = tokenizer(file_to_string(filename)) + for i in range(1, n): + ngram_lan[i + 1] = ngram_lan.get(i + 1, []) + \ + list(ngram(tokens, i + 1, + pad_left = True, + pad_right = True, + left_pad_symbol = '$BOS$', + right_pad_symbol = '$EOS$')) + + def _kneser_key_prob(self, language): + c = Counter() + ngram_lan = self._ngram_list[language] + for key in self._ngram_lan.keys(): + c = c + Counter(self.ngram_list[key]) + freq_dist = nltk.FreqDist(c) + self._prob[language] = nltk.KneserNeyProbDist(freq_dist) + +if __name__ == '__main__': + if len(sys.argv) != 2: + print('Only argument acceptable is a path.') + else: + model = NGramTrain(sys.argv[1]) + model.train() diff --git a/swh/langdetect/utils.py b/swh/langdetect/utils.py deleted file mode 100644 index 70d84c1..0000000 --- a/swh/langdetect/utils.py +++ /dev/null @@ -1,36 +0,0 @@ -""" -Here regroup basic preprocessing methods -used in learning stage for different -approaches. - -""" - -import re - -_re_string = re.compile(r"""("(\\.|[^"\\])*"|'(\\.|[^'\\])*')""") -_re_number = re.compile(r'([\d]+)|([\d]+.[\d]+)[^A-Za-z]') - -def replace_string_and_number(text): - """ Replace strings and numbers in a file by special tokens - """ - str_replaced = re.sub(_re_string, '__str__', text) - str_num_replaced = re.sub(_re_number, '__num__', str_replaced) - return str_num_replaced - -def remove_comment(text): - # TODO: remove only inline comments and block comments - # TODO: maybe build a list of comment markers - pass - -def purify(text, lang): - # TODO: for some language like HTML, remove code other than principal language - pass - - - - - - - - - diff --git a/swh/langdetect/utils/common.py b/swh/langdetect/utils/common.py new file mode 100644 index 0000000..37648f9 --- /dev/null +++ b/swh/langdetect/utils/common.py @@ -0,0 +1,75 @@ +""" +Here regroup basic preprocessing methods +used in learning stage for different +approaches. 
+ +""" + +import re, os + +_re_string = re.compile(r"""("(\\.|[^"\\])*"|'(\\.|[^'\\])*')""") +_re_number = re.compile(r'([\d]+)|([\d]+.[\d]+)[^A-Za-z]') +_re_separator = re.compile(r'(\W)') +_not_start_with_point = lambda x: not x.startswith('.') + +def tokenizer(text): + ''' Splits text into tokens ''' + return [word for word in _re_separator.split(text) if word.strip(' \t')] + +def file_to_string(filename): + """ Read a file to a string. """ + with open(filename, 'r') as f: + data = f.read().replace('\n',' ').lower + return data + +def count_files(root_language): + all_folders = natural_sort(filter + (_not_start_with_point, + os.listdir(root_language))) + files = natural_sort(filter + (_not_start_with_point, + os.listdir(root_language + '/' + all_folders[-1]))) + (max,_) = os.path.splitext(files[-1]) + return int(max) + +def find_file(root_language, n): + '''Find the n-th file in language folder''' + if n > count_files(root_language): + return '' + else: + start = (n - 1) // 1000 * 1000 + 1 + end = start + 999 + root_count = root_language + '/' + str(start) + '-' + str(end) + files = natural_sort(filter + (_not_start_with_point, + os.listdir(root_count))) + return root_count + '/' + files[n - start] + +def replace_string_and_number(text): + """ Replace strings and numbers in a file by special tokens + """ + str_replaced = re.sub(_re_string, '__str__', text) + str_num_replaced = re.sub(_re_number, '__num__', str_replaced) + return str_num_replaced + +def natural_sort(l): + convert = lambda text: int(text) if text.isdigit() else text.lower() + alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] + return sorted(l, key = alphanum_key) + +def remove_comment(text): + # TODO: remove only inline comments and block comments + # TODO: maybe build a list of comment markers + pass + +def purify(text, lang): + # TODO: for some language like HTML, remove code other than principal language + pass + + + + + + + + diff --git a/swh/langdetect/utils/training.py b/swh/langdetect/utils/training.py new file mode 100644 index 0000000..07872ab --- /dev/null +++ b/swh/langdetect/utils/training.py @@ -0,0 +1,35 @@ +import os,random + +from utils.common import count_files, find_file + +def build_training_set(root): + root_code = root + '/../code_by_language' + root_training = root + '/../training_set' + for language in os.listdir(root_code): + if not language.startswith('.'): + root_language = root_code + '/' + language + root_training_language = root_training + '/' + language + build_language_training_set(count_files(root_language), + root_language, + root_training_language) + +def build_language_training_set(total, root_language, root_training_language): + # limit defines the size of training set + # upper defines the maximum size + + try: + os.mkdir(root_training_language) + except FileExistsError: + pass + + upper = 4000 + if total >= upper: + limit = upper // 2 + else: + limit = total // 2 + indices = random.sample(range(1, total + 1), limit) + files = map(lambda x : find_file(root_language, x), indices) + for src in files: + basename = os.path.basename(src) + des = root_training_language + '/' + basename + os.symlink(src, des)