diff --git a/swh/langdetect/cnn.py b/swh/langdetect/cnn.py index 22b555b..008f01e 100644 --- a/swh/langdetect/cnn.py +++ b/swh/langdetect/cnn.py @@ -1,308 +1,309 @@ import os import sys import subprocess import time import random import csv import numpy as np import warnings with warnings.catch_warnings(): warnings.simplefilter("ignore") import tensorflow as tf import json import argparse from ast import literal_eval from pickle import dump from pickle import load from numpy import array -from .utils.common import tokenizer +from .utils.common import Tokenizer from .utils.common import file_to_string from keras.preprocessing.sequence import pad_sequences from keras.callbacks import EarlyStopping from keras.models import Model from keras.models import Sequential from keras.models import load_model from keras.layers import Input from keras.layers import Dense from keras.layers import Flatten from keras.layers import Dropout from keras.layers import ThresholdedReLU from keras.layers import Activation from keras.layers import Lambda from keras.layers import Embedding from keras.layers.convolutional import Convolution1D from keras.layers.convolutional import MaxPooling1D from keras.layers.normalization import BatchNormalization from keras.utils import np_utils from keras.optimizers import SGD from pyspark import SparkContext, SparkConf from elephas.spark_model import SparkModel # pip install flask from elephas import optimizers as elephas_optimizers from elephas.utils.rdd_utils import to_labeled_point csv.field_size_limit(sys.maxsize) conf = SparkConf().setAppName('Elephas_App').setMaster('local[8]') # Set up on cluster. sc = SparkContext(conf=conf) def main(): parser = argparse.ArgumentParser(description='Training and test tool of charactor-level ConvNet text categorisation.') subparsers = parser.add_subparsers(dest='sub_command') parser_train = subparsers.add_parser('train', help='Training on the dataset, dataset must be a *.csv file. A model will be created in the same directory.') parser_train.add_argument('-s', '--spark', type=bool, help='Training on cluster.', dest='train_spark') parser_train.add_argument('train_path', metavar='PATH', type=str, help='Path of the training dataset.') parser_train.add_argument('-ms', '--maxsize', metavar='SIZE', dest='train_maxsize', type=int, help='Set maximum input size of ConvNet, default 1024.') parser_train.add_argument('-e', '--epochs', metavar='N', dest='train_epochs', type=int, help='Number of training epochs (iterations), default 50.') parser_test = subparsers.add_parser('test', help='Test on the dataset, dataset must be a directory with *.csv dataset named by corresponding language.') parser_test.add_argument('test_root', metavar='ROOT', type=str, help='Root of the test dataset.') if len(sys.argv[1:]) == 0: parser.print_help() parser.exit() args = parser.parse_args() if args.sub_command == 'train' : maxsize = 1024 epochs = 50 if args.train_maxsize: maxsize = args.train_maxsize if args.train_epochs: epochs = args.train_epochs n = CNN(args.train_path, maxsize=maxsize, epochs=epochs) if args.train_spark: n.train_on_cluster() - elif args.train_gpu: + else: n.train() + elif args.sub_command == 'test': n = CNN(args.test_root) n.test() else: parser.parse_args('-h') class CNN: def __init__(self, path, maxsize, epochs): self._path = path # Root of model folder self._root_model = os.path.join(os.path.dirname(path), 'model_cnn') try: os.mkdir(self._root_model) except: pass # Path of result self._path_result = os.path.join(os.path.dirname(path), 'result_cnn') dir_path = os.path.dirname(os.path.abspath(__file__)) with open(os.path.join(dir_path, 'static_data', 'languages.json'), 'r') as f: self._languages = json.load(f) self._path_test_csv = path self._input_size = maxsize self._vocab_size = 256 self._num_of_classes = len(self._languages) self._batch_size = 128 self._epochs = epochs def file_len(self, fname): with open(fname) as f: count = 0 for l in f: count += 1 return count def train(self): model = self._get_model() earlystop = EarlyStopping(monitor='loss', min_delta=0, patience=5, verbose=0, mode='auto') callbacks = [earlystop] model.fit_generator( self._generator(self._input_size, self._num_of_classes, self._batch_size), steps_per_epoch=self.file_len(self._path) / self._batch_size, epochs=self._epochs, callbacks=callbacks) model.save(os.path.join(self._root_model, 'model.h5')) def train_on_cluster(self): rdd = self._get_train_rdd() model = self._get_model() adagrad = elephas_optimizers.Adagrad() spark_model = SparkModel(sc, model, optimizer=adagrad, frequency='epoch', mode='asynchronous', num_workers=2) spark_model.train(rdd, nb_epoch=self._epochs, batch_size=self._batch_size, verbose=0, categorical=True, nb_classes=self._num_of_classes) model.save(os.path.join(self._root_model, 'model.h5')) def _get_train_rdd(self): print('Prepairing RDD for training...') X_train = np.empty((0, self._input_size)) Y_train = np.empty((0, self._num_of_classes)) with open(self._path, newline='') as csvfile: r = csv.reader(csvfile, delimiter=' ', quotechar='|') for pair in r: label, string = pair label = int(label) print(label, end='\r') string = literal_eval(string) - tokens = [x + 1 for x in tokenizer(string, 'letter')] + tokens = [x + 1 for x in Tokenizer.tokenize(string, 'letter')] X_train = np.append(X_train, pad_sequences([tokens], maxlen=self._input_size), axis=0) label = array(np_utils.to_categorical([label], self._num_of_classes)) Y_train = np.append(Y_train, label, axis=0) rdd = to_labeled_point(sc, X_train, Y_train, categorical=True) def _generator(self, length, total_class, batch_size=128): counter = 0 while True: with open(self._path, newline='') as csvfile: r = csv.reader(csvfile, delimiter=' ', quotechar='|') for pair in r: if counter == 0: X = np.empty((0, length)) Y = np.empty((0, total_class)) label, string = pair label = int(label) string = literal_eval(string) - tokens = [x + 1 for x in tokenizer(string, 'letter')] + tokens = [x + 1 for x in Tokenizer.tokenize(string, 'letter')] X = np.append(X, pad_sequences([tokens], maxlen=length), axis=0) label = array(np_utils.to_categorical([label], total_class)) Y = np.append(Y, label, axis=0) counter += 1 if counter == batch_size: counter = 0 yield(X,Y) def _get_model(self): input_size = self._input_size alphabet_size = self._vocab_size embedding_size = 256 - conv_layers = [(256,7,5), (256,7,5), (256,3,-1), (256,3,-1), (256,3,-1), (256,3,5)] + conv_layers = [(256,7,3), (256,7,3), (256,3,-1), (256,3,-1), (256,3,-1), (256,3,3)] threshold = 1e-6 fully_connected_layers = [1024, 1024] dropout_p = 0.2 optimizer = 'adam' loss = 'categorical_crossentropy' num_of_classes = self._num_of_classes # Input layer inputs = Input(shape=(input_size,), name='sent_input', dtype='int64') # Embedding layers x = Embedding(alphabet_size + 1, embedding_size, input_length=input_size)(inputs) # Convolution layers for cl in conv_layers: x = Convolution1D(cl[0], cl[1])(x) x = ThresholdedReLU(threshold)(x) if cl[2] != -1: x = MaxPooling1D(cl[2])(x) x = Flatten()(x) # Fully connected layers for fl in fully_connected_layers: x = Dense(fl)(x) x = ThresholdedReLU(threshold)(x) x = Dropout(dropout_p)(x) # Output layer predictions = Dense(num_of_classes, activation='softmax')(x) # Build and compile model model = Model(inputs=inputs, outputs=predictions) model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy']) print(model.summary()) return model def _max_len(self, texts): return max([len(text) for text in texts]) def test(self): csv.field_size_limit(sys.maxsize) try: r = open(self._path_result, 'rb') test_result = load(r) r.close() except FileNotFoundError: test_result = {} model = self._load_model() for language in [x for x in self._languages if x not in test_result.keys()]: test_result[language] = self.test_class(model, language) with open(self._path_result, 'wb') as f: dump(test_result, f) def _load_model(self): model = load_model(os.path.join(self._root_model, 'model.h5')) return model def _count_size(self, files): size = 0 for f in files: size += os.path.getsize(f) return size def test_class(self, model, language): ok = 0 results = [] count = 0 total_test = self.file_len(os.path.join(self._path_test_csv, language + '.csv')) with open(os.path.join(self._path_test_csv, language + '.csv'), newline='') as csvfile: r = csv.reader(csvfile, delimiter=' ', quotechar='|') for pair in r: label, string = pair label = int(label) string = literal_eval(string) - tokens = [x + 1 for x in tokenizer(string, 'letter')] + tokens = [x + 1 for x in Tokenizer.tokenize(string, 'letter')] result = self._guess_file_language(model, tokens) count += 1 print('[{0:4d}/{1:4d}] {2}:{3} '.format(count, total_test, result[0][1], result[0][0]),end='\r') results.append(result[0]) if result[0][1] == language: ok += 1 accuracy = ok / total_test print('Tests for {} '.format(language)) print('Total test files : {}'.format(total_test)) print('Correctly classified files : {}'.format(ok)) print('Accuracy : {}%'.format(accuracy * 100)) return (ok, total_test, accuracy, results) def speed_benchmark(self): language = self._languages[10] model = self._load_model() test_set = self._get_test_set(language) total_size = self._count_size(test_set) print('{} kB in total'.format(total_size / 1024)) t_start = time.perf_counter() self.test_class(model, language) t_end = time.perf_counter() print('{} seconds.'.format(t_end - t_start)) print('{} seconds per KiB'.format(((t_end - t_start) / total_size) * 1024)) def _guess_file_language(self, model, tokens): X = pad_sequences([tokens], maxlen=self._input_size) result = list(model.predict(X))[0] result = [(s, self._languages[i]) for i, s in enumerate(result)] return sorted(result, reverse=True) if __name__ == '__main__': main() diff --git a/swh/langdetect/cnn_w.py b/swh/langdetect/cnn_w.py index 3ae2a36..34167c8 100644 --- a/swh/langdetect/cnn_w.py +++ b/swh/langdetect/cnn_w.py @@ -1,301 +1,301 @@ import os import sys import subprocess import time import random import csv import numpy as np import warnings with warnings.catch_warnings(): warnings.simplefilter("ignore") import tensorflow as tf import json import argparse from ast import literal_eval from pickle import dump from pickle import load from numpy import array -from .utils.common import tokenizer +from .utils.common import Tokenizer from .utils.common import file_to_string from keras.preprocessing.sequence import pad_sequences from keras.callbacks import EarlyStopping from keras.models import Model from keras.models import Sequential from keras.models import load_model from keras.layers import Input from keras.layers import Dense from keras.layers import Flatten from keras.layers import Merge from keras.layers import Dropout from keras.layers import ThresholdedReLU from keras.layers import Activation from keras.layers import Lambda from keras.layers import Embedding from keras.layers.convolutional import Convolution1D from keras.layers.convolutional import MaxPooling1D from keras.layers.normalization import BatchNormalization from keras.layers import Concatenate from keras.utils import np_utils from keras.optimizers import SGD from collections import Counter csv.field_size_limit(sys.maxsize) from keras import backend as K K.set_session(K.tf.Session(config=K.tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1))) def main(): parser = argparse.ArgumentParser(description='Training and test tool of charactor-level ConvNet text categorisation.') subparsers = parser.add_subparsers(dest='sub_command') parser_train = subparsers.add_parser('train', help='Training on the dataset, dataset must be a *.csv file. A model will be created in the same directory.') parser_train.add_argument('train_path', metavar='PATH', type=str, help='Path of the training dataset.') parser_train.add_argument('-ms', '--maxsize', metavar='SIZE', dest='train_maxsize', type=int, help='Set maximum input size of ConvNet, default 1024.') parser_train.add_argument('-e', '--epochs', metavar='N', dest='train_epochs', type=int, help='Number of training epochs (iterations), default 50.') parser_test = subparsers.add_parser('test', help='Test on the dataset, dataset must be a directory with *.csv dataset named by corresponding language.') parser_test.add_argument('test_root', metavar='ROOT', type=str, help='Root of the test dataset.') if len(sys.argv[1:]) == 0: parser.print_help() parser.exit() args = parser.parse_args() if args.sub_command == "train": if args.train_maxsize: if args.train_epochs: n = CNNword(args.train_path, maxsize=args.train_maxsize, epochs=args.train_epochs) n.train() else: n = CNNword(args.train_path, maxsize=args.train_maxsize) n.train() else: if args.train_epochs: n = CNNword(args.train_path, epochs=args.train_epochs) n.train() else: n = CNNword(args.train_path) n.train() elif args.sub_command == "test": n = CNNword(args.test_root) print(args.test_root) n.test() else: parser.parse_args('-h') class CNNword: def __init__(self, path, maxsize=768, epochs=30): self._path = path # Root of model folder self._root_model = os.path.join(os.path.dirname(path), 'model_cnn_word') try: os.mkdir(self._root_model) except: pass # Path of result self._path_result = os.path.join(os.path.dirname(path), 'result_cnn_word') dir_path = os.path.dirname(os.path.abspath(__file__)) with open(os.path.join(dir_path, 'static_data', 'languages.json'), 'r') as f: self._languages = json.load(f) self._path_test_csv = path self._path_vocab = os.path.join(self._root_model, 'vocab') self._input_size = maxsize self._vocab_size = 20001 self._num_of_classes = len(self._languages) self._batch_size = 64 self._epochs = epochs if not os.path.isfile(self._path_vocab): self._learn_vocab(self._input_size, self._num_of_classes) with open(self._path_vocab, 'rb') as f: c = load(f) l = c.most_common(20000) print(l) self._indexer = dict((v[0], i + 1) for i, v in enumerate(l)) self._oov_index = len(self._indexer) + 1 def file_len(self, fname): with open(fname) as f: count = 0 for l in f: count += 1 return count def train(self): model = self._get_model() earlystop = EarlyStopping(monitor='loss', min_delta=0, patience=2, verbose=0, mode='auto') callbacks = [earlystop] model.fit_generator( self._generator(self._input_size, self._num_of_classes, self._batch_size), steps_per_epoch=self.file_len(self._path) / self._batch_size, epochs=self._epochs, callbacks=callbacks) model.save(os.path.join(self._root_model, 'model.h5')) def _learn_vocab(self, length, total_class): c = Counter() with open(self._path, newline='') as csvfile: r = csv.reader(csvfile, delimiter=' ', quotechar='|') for pair in r: label, string = pair label = int(label) print(label, end='\r') string = literal_eval(string) - tokens = tokenizer(string, 'word') + tokens = Tokenizer.tokenize(string, 'word') c.update(tokens) with open(self._path_vocab, 'wb') as f: dump(c, f) def _generator(self, length, total_class, batch_size=64): counter = 0 while True: with open(self._path, newline='') as csvfile: r = csv.reader(csvfile, delimiter=' ', quotechar='|') for pair in r: if counter == 0: X = np.empty((0, length)) Y = np.empty((0, total_class)) label, string = pair label = int(label) string = literal_eval(string) - tokens = [self._indexer.get(x, self._oov_index) for x in tokenizer(string, 'word')] + tokens = [self._indexer.get(x, self._oov_index) for x in Tokenizer.tokenize(string, 'word')] X = np.append(X, pad_sequences([tokens], maxlen=length), axis=0) label = array(np_utils.to_categorical([label], total_class)) Y = np.append(Y, label, axis=0) counter += 1 if counter == batch_size: counter = 0 yield(X,Y) def _get_model(self): input_size = self._input_size vocab_size = self._vocab_size embedding_size = 128 optimizer = 'adam' loss = 'categorical_crossentropy' num_of_classes = self._num_of_classes embedding_layer = Embedding(vocab_size + 1, embedding_size, input_length=input_size, ) # applying a more complex convolutional approach convs = [] filter_sizes = [3,4,5] sequence_input = Input(shape=(input_size,), dtype='int64') embedded_sequences = embedding_layer(sequence_input) for fsz in filter_sizes: l_conv = Convolution1D(filters=10, kernel_size=fsz, activation='relu')(embedded_sequences) l_pool = MaxPooling1D(5)(l_conv) convs.append(l_pool) l_merge = Concatenate(axis=1)(convs) l_conv1= Convolution1D(128, 5, activation='relu')(l_merge) l_pool1 = MaxPooling1D(5)(l_conv1) l_conv2 = Convolution1D(128, 5, activation='relu')(l_pool1) l_pool2 = MaxPooling1D(5)(l_conv2) l_flat = Flatten()(l_pool2) l_dense = Dense(512, activation='relu')(l_flat) preds = Dense(num_of_classes, activation='softmax')(l_dense) model = Model(sequence_input, preds) model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy']) print(model.summary()) return model def _max_len(self, texts): return max([len(text) for text in texts]) def test(self): csv.field_size_limit(sys.maxsize) try: r = open(self._path_result, 'rb') test_result = load(r) r.close() except FileNotFoundError: test_result = {} model = self._load_model() for language in [x for x in self._languages if x not in test_result.keys()]: test_result[language] = self.test_class(model, language) with open(self._path_result, 'wb') as f: dump(test_result, f) def _load_model(self): model = load_model(os.path.join(self._root_model, 'model.h5')) return model def _count_size(self, files): size = 0 for f in files: size += os.path.getsize(f) return size def test_class(self, model, language): ok = 0 results = [] count = 0 total_test = self.file_len(os.path.join(self._path_test_csv, language + '.csv')) with open(os.path.join(self._path_test_csv, language + '.csv'), newline='') as csvfile: r = csv.reader(csvfile, delimiter=' ', quotechar='|') for pair in r: label, string = pair label = int(label) string = literal_eval(string) - tokens = [self._indexer.get(x, self._oov_index) for x in tokenizer(string, 'word')] + tokens = [self._indexer.get(x, self._oov_index) for x in Tokenizer.tokenize(string, 'word')] result = self._guess_file_language(model, tokens) count += 1 print('[{0:4d}/{1:4d}] {2}:{3} '.format(count, total_test, result[0][1], result[0][0]),end='\r') results.append(result[0]) if result[0][1] == language: ok += 1 accuracy = ok / total_test print('Tests for {} '.format(language)) print('Total test files : {}'.format(total_test)) print('Correctly classified files : {}'.format(ok)) print('Accuracy : {}%'.format(accuracy * 100)) return (ok, total_test, accuracy, results) def speed_benchmark(self): language = self._languages[10] model = self._load_model() test_set = self._get_test_set(language) total_size = self._count_size(test_set) print('{} kB in total'.format(total_size / 1024)) t_start = time.perf_counter() self.test_class(model, language) t_end = time.perf_counter() print('{} seconds.'.format(t_end - t_start)) print('{} seconds per KiB'.format(((t_end - t_start) / total_size) * 1024)) def _guess_file_language(self, model, tokens): X = pad_sequences([tokens], maxlen=self._input_size) result = list(model.predict(X))[0] result = [(s, self._languages[i]) for i, s in enumerate(result)] return sorted(result, reverse=True) if __name__ == '__main__': main() diff --git a/swh/langdetect/unsupervised.py b/swh/langdetect/unsupervised.py index fc322c2..c87b001 100644 --- a/swh/langdetect/unsupervised.py +++ b/swh/langdetect/unsupervised.py @@ -1,255 +1,267 @@ """ Naive Bayesian """ import os import sys import operator import nltk import random import time import numpy as np import csv import argparse import json +import matplotlib.pyplot as plt +import matplotlib as mpl + from ast import literal_eval from itertools import islice from pickle import dump, load from .utils.common import tokenizer, file_to_string, find_file, count_files from nltk.util import ngrams from collections import Counter from sklearn.naive_bayes import MultinomialNB from sklearn.feature_extraction.text import HashingVectorizer from sklearn.metrics.pairwise import cosine_similarity from sklearn.externals import joblib -from sklearn.cluster import KMeans +from sklearn.cluster import KMeans, MiniBatchKMeans +from sklearn.metrics.pairwise import cosine_similarity +from scipy.cluster.hierarchy import ward, dendrogram +from sklearn.manifold import MDS csv.field_size_limit(sys.maxsize) def main(): parser = argparse.ArgumentParser(description='Training and test tool of multinumial naive bayesian.') subparsers = parser.add_subparsers(dest='sub_command') parser_train = subparsers.add_parser('train', help='Training on the dataset, dataset must be a *.csv file. A model will be created in the same directory.') parser_train.add_argument('train_path', metavar='PATH', type=str, help='Path of the training dataset.') # parser_train.add_argument('-n', '--ngrams', metavar='N', dest='train_maxsize', type=int, help='Set maximum input size of ConvNet, default 5.') parser_test = subparsers.add_parser('test', help='Test on the dataset, dataset must be a directory with *.csv dataset named by corresponding language.') parser_test.add_argument('test_root', metavar='ROOT', type=str, help='Root of the test dataset.') if len(sys.argv[1:]) == 0: parser.print_help() parser.exit() args = parser.parse_args() if args.sub_command == 'train' : n = Unsupervised(args.train_path) - # n.train() - n.clustering() + n.train() + # n.clustering() + # n.graph() elif args.sub_command == 'test': n = Unsupervised(args.test_root) n.test() else: parser.parse_args('-h') class Unsupervised: def __init__(self, path): self._path = path # Root of model folder self._root_model = os.path.join(os.path.dirname(path), 'model_unsupervised') try: os.mkdir(self._root_model) except: pass # Path of result self._path_result = os.path.join(os.path.dirname(path), 'result_unsupervised') dir_path = os.path.dirname(os.path.abspath(__file__)) with open(os.path.join(dir_path, 'static_data', 'languages.json'), 'r') as f: self._languages = json.load(f) self._path_test_csv = path self._num_of_classes = len(self._languages) - def train(self): - ''' - train () generates and stores counted n-grams in '_root_model' folder - ''' - - ''' - Calculate frequencies of generated n-grams then store - them into a sorted list of (ngram, count) - ''' - + def train(self): cv = HashingVectorizer(analyzer='char', ngram_range=(1, 4), n_features=2**16, alternate_sign=False) - - indices = list(range(len(self._languages))) texts = [] + label = 0 + string = '' with open(self._path, newline='') as csvfile: r = csv.reader(csvfile, delimiter=' ', quotechar='|') for pair in r: - label, string = pair - label = int(label) - print(label, end='\r') - - string = literal_eval(string) - tokens = tokenizer(string, 'letter')[-10248:] + label_new, string_new = pair + if not int(label_new) == label: + if not os.path.isfile(os.path.join(self._root_model, 'counts{}.pkl'.format(label))): + counts = cv.fit_transform(texts) + self.clustering(counts, 1, label) + self.graph(label) + texts = [] + label = int(label_new) + string = literal_eval(string_new) + tokens = tokenizer(string, 'letter') text = ''.join([chr(token) for token in tokens]) texts.append(text) - - #counts = cv.fit_transform(texts) with open(os.path.join(self._root_model, 'classifier.cv'), 'wb') as f: joblib.dump(cv, f) - with open(os.path.join(self._root_model, 'classifier.counts'), 'wb') as f: - joblib.dump(counts, f) - def clustering(self): - with open(os.path.join(self._root_model, 'classifier.cv'), 'rb') as f: - cv = joblib.load(f) - with open(os.path.join(self._root_model, 'classifier.counts'), 'rb') as f: - counts = joblib.load(f) - - num_clusters = 323 - + def clustering(self, counts, num_clusters, label): km = KMeans(n_clusters=num_clusters) km.fit(counts) - - with open(os.path.join(self._root_model, 'cluster.pkl'), 'wb') as f: + + with open(os.path.join(self._root_model, 'counts{}.pkl'.format(label)), 'wb') as f: + joblib.dump(counts, f) + with open(os.path.join(self._root_model, 'cluster{}.pkl'.format(label)), 'wb') as f: joblib.dump(km, f) + def graph(self, label): + with open(os.path.join(self._root_model, 'counts{}.pkl'.format(label)), 'rb') as f: + counts = joblib.load(f) + dist = 1 - cosine_similarity(counts) + linkage_matrix = ward(dist) + + fig, ax = plt.subplots(figsize=(15, 40)) + titles = list(range(1,counts.shape[0]+1)) + ax = dendrogram(linkage_matrix, orientation="right", labels=titles) + + plt.tick_params(axis= 'x', + which='both', + bottom='off', + top='off', + labelbottom='off') + + plt.tight_layout() + plt.savefig(os.path.join(self._root_model, '{}_cluster.pdf'.format(self._languages[label]))) + def test(self): try: r = open(self._path_result, 'rb') test_result = load(r) r.close() except FileNotFoundError: test_result = {} with open(os.path.join(self._root_model, 'classifier.clf'), 'rb') as f: clf = joblib.load(f) with open(os.path.join(self._root_model, 'classifier.hv'), 'rb') as f: cv = joblib.load(f) for language in [x for x in self._languages if x not in test_result.keys()]: test_result[language] = self.test_class((clf, cv), language) with open(self._path_result, 'wb') as f: dump(test_result, f) def speed_benchmark(self): language = [x for x in os.listdir(self._root_training_set) if not x.startswith('.')][10] models = self._load_models() test_set = self._get_test_set(language) total_size = self._count_size(test_set) print('{} kB in total'.format(total_size / 1024)) t_start = time.perf_counter() self.test_class(models, language) t_end = time.perf_counter() print('{} seconds.'.format(t_end - t_start)) print('{} seconds per kB'.format(((t_end - t_start) / total_size) * 1024)) def _get_test_set(self, language): root_training_language = os.path.join(self._root_training_set, language) root_language = os.path.join(self._root_language_dataset, language) total = count_files(root_language) training_set = [int(os.path.splitext(x)[0]) for x in os.listdir(root_training_language) if not x.startswith('.')] it = (find_file(root_language, x) for x in range(1, total + 1) if x not in training_set and os.path.getsize(find_file(root_language, x)) <= 1048576) test_set = list(islice(it, 1000)) if len(test_set) == 0: it = (find_file(root_language, x) for x in range(1, total + 1) if x not in training_set) test_set = list(islice(it, 1000)) return test_set def _count_size(self, files): size = 0 for f in files: size += os.path.getsize(f) return size def test_class(self, clf, language): ok = 0 results = [] count = 0 total_test = self.file_len(os.path.join(self._path_test_csv, language + '.csv')) with open(os.path.join(self._path_test_csv, language + '.csv'), newline='') as csvfile: r = csv.reader(csvfile, delimiter=' ', quotechar='|') for pair in r: label, string = pair label = int(label) string = literal_eval(string) result = self._guess_file_language(clf, string) count += 1 print('[{0:4d}/{1:4d}] {2}:{3} '.format(count, total_test, result[0][1], result[0][0]),end='\r') results.append(result[0]) if result[0][1] == language: ok += 1 accuracy = ok / total_test print('Tests for {} '.format(language)) print('Total test files : {}'.format(total_test)) print('Correctly classified files : {}'.format(ok)) print('Accuracy : {}%'.format(accuracy * 100)) return (ok, total_test, accuracy, results) def test_single(self, filename): self._guess_file_language(clf, filename) def file_len(self, fname): with open(fname) as f: count = 0 for l in f: count += 1 return count def _guess_file_language(self, cc, string): clf = cc[0] cv = cc[1] tokens = tokenizer(string, 'letter') text = ''.join([chr(token) for token in tokens]) counts = cv.fit_transform([text]) tf = TfidfTransformer().fit(counts) normalised = tf.transform(counts) result = clf.predict_log_proba(normalised) result = [(val, self._languages[idx]) for idx, val in enumerate(result[0])] return sorted(result, reverse=True) def _distance(self, model_profile, test_profile): distance = 0 maximum = len(test_profile) for test_ngram in test_profile.keys(): test_rank = test_profile.get(test_ngram) model_rank = model_profile.get(test_ngram, maximum) d = abs(test_rank - model_rank) distance += d return distance ''' def _prob(model, trigrams): print('Checking {} model ...'.format(model)) with open(model, 'rb') as f: kneser_ney = load(f) result = 1 for trigram in trigrams: prob = kneser_ney.prob(trigram) result = result * prob return result ''' if __name__ == '__main__': main() diff --git a/swh/langdetect/utils/common.py b/swh/langdetect/utils/common.py index 0b46574..0817cfd 100644 --- a/swh/langdetect/utils/common.py +++ b/swh/langdetect/utils/common.py @@ -1,156 +1,175 @@ """ Here regroup basic preprocessing methods used in learning stage for different approaches. """ import re, os, time -_re_string = re.compile(b"""("(\\\\.|[^"\\\\])*"|'(\\\\.|[^'\\\\])*')""") -_re_number = re.compile(b'\d+(\.\d+)?') -_re_separator = re.compile(b'([\x20-\x30\x3a-\x40\x5b-\x5e\x60\x7b-\x7e\t\n])') _not_start_with_point = lambda x: not x.startswith('.') -def tokenizer(text, re_name): - ''' Splits text into tokens ''' - if re_name == 'letter': - return list(text) - elif re_name == 'word': - text_replaced = replace_string_and_number(text) - return [word for word in _re_separator.split(text_replaced) if word.strip(b' ')] +class Tokenizer(): + + separator = re.compile( + b'([\x20-\x2f\x3a-\x40\x5b-\x5e\x60\x7b-\x7e\t\n]|\d+\.\d+|\d+|\d+\.\d+[eE][+-]?\d+)') + + def is_number(n): + try: + float(n) + except ValueError: + return False + return True + + def tokenize(text, re_name): + ''' Splits text into tokens ''' + if re_name == 'letter': + return list(text) + elif re_name == 'word': + pretokens = [x for x in Tokenizer.separator.split(text.lower()) if x and x.strip(b'\n')] + tokens = [] + for x in pretokens : + if Tokenizer.is_number(x): + tokens.append(b'') + elif x.isspace(): + tokens.append(b' ') + else: + tokens.append(x) + return tokens + def file_to_string(filename): """ Read a file to a string. """ with open(filename, 'rb') as f: data = f.read() - return replace_string_and_number(data) + return data def count_files(root_language): all_folders = natural_sort(filter (_not_start_with_point, os.listdir(root_language))) files = natural_sort(filter (_not_start_with_point, os.listdir(root_language + '/' + all_folders[-1]))) (max,_) = os.path.splitext(files[-1]) return int(max) def find_file(root_language, n): '''Find the n-th file in language folder''' if n > count_files(root_language): return '' else: start = (n - 1) // 1000 * 1000 + 1 end = start + 999 root_count = root_language + '/' + str(start) + '-' + str(end) files = natural_sort(filter (_not_start_with_point, os.listdir(root_count))) return root_count + '/' + files[n - start] -def replace_string_and_number(text): +'''def replace_string_and_number(text): """ Replace strings and numbers in a file by special tokens """ str_replaced = _re_string.sub(b'"__str__"', text) str_num_replaced = _re_number.sub(b'__num__', str_replaced) #str_num_replaced = text return str_num_replaced +''' def natural_sort(l): convert = lambda text: int(text) if text.isdigit() else text.lower() alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] return sorted(l, key = alphanum_key) def remove_comment(binary_text, language): splited_text = binary_text.splitlines() text = b'\n'.join(splited_text) regexp = get_regexp(language) if not regexp: return binary_text return regexp.sub(b'\n', text) def get_regexp(language): re_inline = get_inline(language) re_block = get_block(language) rs = [] if re_inline: rs.append(re_inline) if re_block: rs.append(re_block) if rs == []: return None return re.compile(b'|'.join(rs), re.DOTALL) def get_inline(language): r_base = b'[^\\n]*(?:\\n|$)' if language in ['Ada', 'Eiffel', 'VHDL', 'AppleScript', 'Haskell', 'Lua', 'PLSQL']: r = b'(--)' + r_base elif language in ['C', 'C++', 'C#', 'D', 'JavaScript', 'ActionScript', 'Java', 'Rust']: r = b'(//)' + r_base elif language == 'Xojo': r = b'(' + b'|'.join([b'//', b"\'"]) + b')' + r_base elif language in ['R', 'Tcl', 'Awk', 'Perl', 'Perl 6', 'Ruby', 'Python']: r = b'(#)' + r_base elif language in ['COBOL']: r = b'(\\*>)' + r_base elif language in ['Matlab']: r = b'(%)' + r_base else: return None return b'(' + r + b')' def get_block(language): r_base = b'.*?' if language in ['C', 'C++', 'C#', 'JavaScript', 'ActionScript', 'PLSQL', 'PHP', 'Rust']: r = b'(/\\*)' + r_base + b'(\\*/)' elif language in ['OCaml', 'Pascal', 'Modula-2', 'Smarty']: r = b'(\\(\\*)' + r_base + b'(\\*\\))' elif language == 'Python': r = b'(\'\'\')' + r_base + b'(\'\'\')' else: return None return b'(' + r + b')' def purify(text, lang): # TODO: for some language like HTML, remove code other than principal language pass