diff --git a/scripts/dataset/ground_truth_arrange.py b/scripts/dataset/ground_truth_arrange.py
index 65b09f0..7eb9dac 100644
--- a/scripts/dataset/ground_truth_arrange.py
+++ b/scripts/dataset/ground_truth_arrange.py
@@ -1,113 +1,123 @@
 # Copyright (C) 2015-2016 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 #!/usr/bin/env python3
 # coding: utf-8
 
 import os
 import sys
 import io
 import json
 
 def main(root):
     root_ground_truth = root + '/../ground_truth'
     root_ground_truth_text = root_ground_truth + '_text'
     root_code_by_language = root + '/../code_by_language'
     counts = dict()
     try:
         os.mkdir(root_code_by_language)
     except FileExistsError:
         pass
     for r in os.listdir(root):
         if not r.startswith('.'):
             for d in os.listdir(root + '/' + r):
                 if not d.startswith('.'):
                     try:
                         ground_truth = io.open(root_ground_truth + '/' + r + '/' + d + '.json')
                         try:
                             j = json.load(ground_truth)
                             for language in j.keys():
                                 root_language = root_code_by_language + '/' + language
                                 try:
                                     os.mkdir(root_language)
                                 except FileExistsError:
                                     pass
                                 for f in j.get(language):
+                                    copy_src = root + '/' + r + '/' + d + '/' + f
+                                    try:
+                                        if os.path.getsize(copy_src) > 10485760 :
+                                            continue
+                                    except FileNotFoundError:
+                                        continue
                                     counts[language] = counts.get(language, 0) + 1
                                     start = (counts[language] - 1) // 1000 * 1000 + 1
                                     end = start + 999
                                     root_count = root_language + '/' + str(start) + '-' + str(end)
                                     if counts[language] % 1000 == 1:
                                         try:
                                             os.mkdir(root_count)
                                         except FileExistsError:
                                             pass
                                     (_,ext) = os.path.splitext(f)
                                     new_name = str(counts[language]) + ext
-                                    copy_src = root + '/' + r + '/' + d + '/' + f
+
                                     copy_des = root_count + '/' + new_name
                                     try:
                                         os.symlink(copy_src, copy_des)
                                         print('{} successfully copied.'.format(copy_src))
                                     except FileExistsError:
                                         pass
                         except json.decoder.JSONDecodeError:
                             ground_truth.close()
                             ground_truth = io.open(root_ground_truth_text + '/' + r + '/' + d, 'r')
                             while(True):
                                 line = ground_truth.readline()
                                 if line == '\n' or line == '':
                                     break
                                 else:
                                     pass
                             while(True):
                                 line = ground_truth.readline()
                                 stripped = line.strip()
                                 if line == '':
                                     break
                                 else:
                                     stripped = line.strip()
                                     language = stripped.replace(':','')
                                     root_language = root_code_by_language + '/' + language
                                     try:
                                         os.mkdir(root_language)
                                     except FileExistsError:
                                         pass
                                     while(True):
                                         line = ground_truth.readline()
                                         if line == '\n':
                                             break
                                         else:
-                                            f = line.strip()
+                                            copy_src = root + '/' + r + '/' + d + '/' + f
+                                            try:
+                                                if os.path.getsize(copy_src) > 10485760 :
+                                                    continue
+                                            except FileNotFoundError:
+                                                continue
                                             counts[language] = counts.get(language, 0) + 1
                                             start = (counts[language] - 1) // 1000 * 1000 + 1
                                             end = start + 999
                                             root_count = root_language + '/' + str(start) + '-' + str(end)
                                             if counts[language] % 1000 == 1:
                                                 try:
                                                     os.mkdir(root_count)
                                                 except FileExistsError:
                                                     pass
                                             (_,ext) = os.path.splitext(f)
                                             new_name = str(counts[language]) + ext
-                                            copy_src = root + '/' + r + '/' + d + '/' + f
                                             copy_des = root_count + '/' + new_name
                                             try:
                                                 os.symlink(copy_src, copy_des)
                                                 print('{} successfully copied.'.format(copy_src))
                                             except FileExistsError:
                                                 pass
                     finally:
                         ground_truth.close()
 
 if __name__ == '__main__':
     if len(sys.argv) != 2:
         print('Only argument acceptable is a path.')
     else:
         main(sys.argv[1])
diff --git a/swh/langdetect/ngram.py b/swh/langdetect/ngram.py
deleted file mode 100644
index d9b4a9a..0000000
--- a/swh/langdetect/ngram.py
+++ /dev/null
@@ -1,107 +0,0 @@
-"""
-Calculate frequencies for classes
-"""
-
-import os, sys, nltk
-
-from pickle import dump, load
-from collections import Counter
-from nltk.util import ngrams
-from utils.common import tokenizer, file_to_string
-from utils.training import build_training_set
-from nltk.probability import *
-
-class NGramTrain:
-
-    def __init__(self, root):
-        self._root = root
-
-    def train(self):
-        root_training_set = self._root + '/../training_set'
-        root_model = '../../dataset/model'
-
-        try:
-            if len(os.listdir(root_training_set)) == 0:
-                build_training_set(self._root)
-            try:
-                os.mkdir(root_model)
-            except FileExistsError:
-                pass
-        except FileNotFoundError:
-            os.mkdir(root_training_set)
-            build_training_set(self._root)
-
-        for language in os.listdir(root_training_set):
-            if not language.startswith('.'):
-                root_training_set_language = root_training_set + '/' + language
-                for f in os.listdir(root_training_set_language):
-                    print(f)
-                    if not f.startswith('.'):
-                        ngram_lan = ngrams_max(root_training_set_language + '/' + f)
-                        prob_lan = self._kneser_key_prob(ngram_lan)
-                        #with open(root_model + '/' + language + '.model', 'wb') as f:
-                        #    dump(ngram_lan, f)
-                        with open(root_model + '/' + language + '.model', 'wb') as f:
-                            dump(prob_lan, f)
-
-    def ngrams_max(filename, n=3):
-        ngram_lan = {}
-        tokens = tokenizer(file_to_string(filename))
-        for i in range(n - 1, n):
-            ngram_lan[i + 1] = ngram_lan.get(i + 1, []) + \
-                list(ngrams(tokens, i + 1,
-                            pad_left = True,
-                            pad_right = True,
-                            left_pad_symbol = '$BOS$',
-                            right_pad_symbol = '$EOS$'))
-        return ngram_lan
-
-    def _kneser_key_prob(self, ngram_lan):
-        c = Counter()
-        for key in ngram_lan.keys():
-            c = c + Counter(ngram_lan[key])
-        freq_dist = nltk.FreqDist(c)
-        return nltk.KneserNeyProbDist(freq_dist)
-
-
-class NGramTest:
-
-    def test(filename):
-        NGramTest._guess_file_language(filename)
-
-    def _guess_file_language(filename):
-        root_model_folder = '../../dataset/model'
-        trigrams = NGramTrain.ngrams_max(filename)
-        result = []
-
-        for model in os.listdir(root_model_folder):
-            if not model.startswith('.'):
-                root_model = root_model_folder + '/' + model
-                (language,_) = os.path.splitext(model)
-                result.append((NGramTest._prob(root_model, trigrams[3]), language))
-        print(sorted(result))
-
-    def _prob(model, trigrams):
-        with open(model, 'rb') as f:
-            kneser_ney = load(f)
-        result = 0
-        for trigram in trigrams:
-            prob = kneser_ney.prob(trigram)
-            if prob != 0:
-                if result == 0:
-                    result = prob
-                else:
-                    result = result * prob
-        return result
-
-
-if __name__ == '__main__':
-    if len(sys.argv) != 3:
-        print('Only acceptable arguments are an option and a path.')
-    elif sys.argv[1] == '--train':
-        model = NGramTrain(sys.argv[2])
-        model.train()
-    elif sys.argv[1] == '--test':
-        NGramTest.test(sys.argv[2])
-    else:
-        print('Wrong arguments, please check your input.')
diff --git a/swh/langdetect/ngramdist.py b/swh/langdetect/ngramdist.py
new file mode 100644
index 0000000..01b3dd4
--- /dev/null
+++ b/swh/langdetect/ngramdist.py
@@ -0,0 +1,150 @@
+"""
+Baseline approach
+"""
+
+import os, sys, operator, nltk
+
+from pickle import dump, load
+from nltk.util import ngrams
+from utils.common import tokenizer, file_to_string, find_file, count_files
+from utils.training import build_training_set
+
+class NGramDist:
+
+    def __init__(self, root):
+        self._root = root
+        self._root_training_set = os.path.join(self._root, '..', 'training_set')
+        self._root_model = os.path.join(self._root, '..', 'model_ngram_dist')
+
+    def train(self):
+        try:
+            if len(os.listdir(self._root_training_set)) == 0:
+                build_training_set(self._root)
+            try:
+                os.mkdir(self._root_model)
+            except FileExistsError:
+                pass
+        except FileNotFoundError:
+            os.mkdir(self._root_training_set)
+            build_training_set(self._root)
+
+        for language in os.listdir(self._root_training_set):
+            if not language.startswith('.'):
+                root_training_set_language = os.path.join(self._root_training_set, language)
+                root_stat_language = os.path.join(self._root_model, language)
+                if os.path.isfile(root_stat_language):
+                    continue
+                else:
+                    statistics = {}
+                    for f in os.listdir(root_training_set_language):
+                        print(f)
+                        if not f.startswith('.'):
+                            filename = os.path.join(root_training_set_language, f)
+                            tokens = tokenizer(file_to_string(filename))
+                            generated_ngrams = self._generate_ngrams(tokens, 3)
+                            self._count_ngrams(statistics, generated_ngrams)
+                    with open(root_stat_language, 'wb') as f:
+                        dump(self._sort_by_value(statistics), f)
+
+    def _generate_ngrams(self, tokens, n):
+        generated_ngrams = []
+
+        for i in range(1, n+1):
+            igrams = ngrams(tokens, i,
+                            pad_left = True,
+                            pad_right = True,
+                            left_pad_symbol = '$BOF$',
+                            right_pad_symbol = '$EOF$')
+            for igram in igrams:
+                generated_ngrams.append(''.join(igram))
+
+        return generated_ngrams
+
+    def _count_ngrams(self, statistics, ngrams):
+        for ngram in ngrams:
+            statistics[ngram] = statistics.get(ngram, 0) + 1
+
+    def test_class(self, root_language):
+        language = os.path.basename(root_language)
+        root_training_language = os.path.join(self._root_training_set, language)
+
+        total = count_files(root_language)
+        training_set = [int(os.path.splitext(x)[0]) for x in os.listdir(root_training_language) if not x.startswith('.')]
+        test_set = [x for x in range(1, total + 1) if x not in training_set]
+
+        ok = 0
+        for test in test_set[:1000]:
+            result = self._guess_file_language(find_file(root_language, test))
+            print(result[0])
+            if result[0][1] == language:
+                ok += 1
+
+        print('Total test files : {}'.format(len(test_set)))
+        print('Correctly classified files : {}'.format(ok))
+        print('Accuracy : {}%'.format(ok / len(test_set)))
+
+    def test_single(self, filename):
+        self._guess_file_language(filename)
+
+    def _guess_file_language(self, filename):
+        tokens = tokenizer(file_to_string(filename))
+        generated_ngrams = self._generate_ngrams(tokens, 3)
+        statistics = {}
+        self._count_ngrams(statistics, generated_ngrams)
+        test_profile = self._sort_by_value(statistics)
+
+        result = []
+
+        for model in os.listdir(self._root_model):
+            if not model.startswith('.'):
+                root_model = os.path.join(self._root_model, model)
+                with open(root_model, 'rb') as sorted_file:
+                    model_profile = load(sorted_file)
+                    distance = self._distance(model_profile, test_profile)
+                    result.append((distance, os.path.splitext(model)[0]))
+
+        return sorted(result)
+
+    def _sort_by_value(self, statistics):
+        statistics_sorted = sorted(statistics.items(),
+                                   key = operator.itemgetter(1),
+                                   reverse = True)[:500]
+        return statistics_sorted
+
+    def _distance(self, model_profile, test_profile):
+        distance = 0
+        model_ngrams = [x[0] for x in model_profile ]
+        test_ngrams = [x[0] for x in test_profile ]
+        maximum = len(test_ngrams)
+
+        for test_ngram in test_ngrams:
+            test_rank = test_ngrams.index(test_ngram)
+            try:
+                model_rank = model_ngrams.index(test_ngram)
+            except ValueError:
+                model_rank = maximum
+            d = abs(test_rank - model_rank)
+            distance += d
+
+        return distance
+    '''
+    def _prob(model, trigrams):
+        print('Checking {} model ...'.format(model))
+        with open(model, 'rb') as f:
+            kneser_ney = load(f)
+        result = 1
+        for trigram in trigrams:
+            prob = kneser_ney.prob(trigram)
+            result = result * prob
+        return result
+    '''
+
+if __name__ == '__main__':
+    if len(sys.argv) == 3 and sys.argv[1] == '--train':
+        n = NGramDist(sys.argv[2])
+        n.train()
+    elif len(sys.argv) == 4 and sys.argv[1] == '--test':
+        n = NGramDist(sys.argv[2])
+        n.test_class(sys.argv[3])
+    else:
+        print('Wrong arguments, please check your input.')
diff --git a/swh/langdetect/utils/common.py b/swh/langdetect/utils/common.py
index 52d387f..4f89523 100644
--- a/swh/langdetect/utils/common.py
+++ b/swh/langdetect/utils/common.py
@@ -1,75 +1,77 @@
 """
 Here regroup basic preprocessing methods
 used in learning stage for different approaches.
 """
 
 import re, os
 
 _re_string = re.compile(r"""("(\\.|[^"\\])*"|'(\\.|[^'\\])*')""")
 _re_number = re.compile(r'([\d]+)|([\d]+.[\d]+)[^A-Za-z]')
 _re_separator = re.compile(r'(\W)')
 
 _not_start_with_point = lambda x: not x.startswith('.')
 
 def tokenizer(text):
     ''' Splits text into tokens '''
-    return [word for word in _re_separator.split(text) if word.strip(' \t')]
+    #return [word for word in _re_separator.split(text) if word.strip(' \t')]
+    return list(text)
 
 def file_to_string(filename):
     """ Read a file to a string. """
     with open(filename, 'r', errors='ignore') as f:
-        data = f.read().replace('\n',' ').lower()
+        data = f.read().lower()
     return replace_string_and_number(data)
 
 def count_files(root_language):
     all_folders = natural_sort(filter (_not_start_with_point, os.listdir(root_language)))
     files = natural_sort(filter (_not_start_with_point, os.listdir(root_language + '/' + all_folders[-1])))
     (max,_) = os.path.splitext(files[-1])
     return int(max)
 
 def find_file(root_language, n):
     '''Find the n-th file in language folder'''
     if n > count_files(root_language):
         return ''
     else:
         start = (n - 1) // 1000 * 1000 + 1
         end = start + 999
         root_count = root_language + '/' + str(start) + '-' + str(end)
         files = natural_sort(filter (_not_start_with_point, os.listdir(root_count)))
         return root_count + '/' + files[n - start]
 
 def replace_string_and_number(text):
     """ Replace strings and numbers in a file by special tokens """
-    str_replaced = re.sub(_re_string, '__str__', text)
-    str_num_replaced = re.sub(_re_number, '__num__', str_replaced)
+    # str_replaced = re.sub(_re_string, '__str__', text)
+    # str_num_replaced = re.sub(_re_number, '__num__', str_replaced)
+    str_num_replaced = text
     return str_num_replaced
 
 def natural_sort(l):
     convert = lambda text: int(text) if text.isdigit() else text.lower()
     alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ]
     return sorted(l, key = alphanum_key)
 
 def remove_comment(text):
     # TODO: remove only inline comments and block comments
     # TODO: maybe build a list of comment markers
     pass
 
 def purify(text, lang):
     # TODO: for some language like HTML, remove code other than principal language
     pass
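
Note (not part of the patch): the comparison in NGramDist._distance is the rank-based "out of place" measure used with sorted n-gram frequency profiles. The standalone sketch below restates the same idea; the function name and the toy profiles are invented for illustration, and a dict lookup stands in for the linear list.index search, which gives the same result as long as each profile lists an n-gram at most once.

    # Standalone illustration of the rank ("out of place") distance computed
    # by NGramDist._distance. Toy data only; real profiles are the 500 most
    # frequent n-grams of a trained model or of the file under test.

    def out_of_place_distance(model_profile, test_profile):
        """Sum of rank differences between two n-gram profiles.

        Both arguments are lists of (ngram, count) pairs sorted by decreasing
        count. An n-gram absent from the model profile is penalised with the
        maximum possible rank.
        """
        model_rank = {ngram: rank for rank, (ngram, _) in enumerate(model_profile)}
        maximum = len(test_profile)
        distance = 0
        for test_rank, (ngram, _) in enumerate(test_profile):
            distance += abs(test_rank - model_rank.get(ngram, maximum))
        return distance

    if __name__ == '__main__':
        python_profile = [('def', 42), ('imp', 30), ('sel', 25)]   # pretend model profile
        unknown_profile = [('def', 7), ('sel', 5), ('imp', 4)]     # pretend test profile
        print(out_of_place_distance(python_profile, unknown_profile))  # prints 2

The smaller the sum of rank differences, the closer the test file's profile is to the language model, which is why _guess_file_language returns the candidates sorted by ascending distance.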
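
For reference, the __main__ block of the new swh/langdetect/ngramdist.py accepts two forms of invocation; the dataset paths below are placeholders, not taken from the patch:

    python3 swh/langdetect/ngramdist.py --train /path/to/dataset/root
    python3 swh/langdetect/ngramdist.py --test /path/to/dataset/root /path/to/dataset/code_by_language/Python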