"""Training and test tool for a character-level ConvNet text classifier.

The module exposes a small command line (``train`` / ``test``) around the
:class:`CNN` class.  Training can run locally through a Keras generator or be
handed to Spark through Elephas.
"""

# NOTE: this module was recovered from a unified diff of
# ``swh/langdetect/cnn.py`` (index b250a02..a9ab20a) whose line structure had
# been lost.  The same changeset also:
#   * appends two entries to ``requirements.txt`` (which already lists
#     vcversioner, pygithub, pyyaml, tensorflow-gpu, keras and h5py):
#         flask
#         git+https://github.com/maxpumperla/elephas
#   * deletes the binary file ``scripts/comparison_2.pdf``;
#   * adds the binary files ``scripts/results_cnn_word.pdf`` and
#     ``scripts/results_ngrams_naive_bayes.pdf``.

import argparse
import csv
import json
import math
import os
import random
import subprocess
import sys
import time
import warnings
from ast import literal_eval
from pickle import dump
from pickle import load

import numpy as np
from numpy import array

with warnings.catch_warnings():
    # Importing TensorFlow is noisy; silence warnings for the import only.
    warnings.simplefilter("ignore")
    import tensorflow as tf

from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping
from keras.models import Model
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import ThresholdedReLU
from keras.layers import Activation
from keras.layers import Lambda
from keras.layers import Embedding
from keras.layers.convolutional import Convolution1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.optimizers import SGD
from pyspark import SparkContext, SparkConf
from elephas.spark_model import SparkModel  # needs flask: pip install flask
from elephas import optimizers as elephas_optimizers
from elephas.utils.rdd_utils import to_labeled_point

from .utils.common import tokenizer
from .utils.common import file_to_string

# Dataset rows hold whole files in one CSV field, so lift the default limit.
csv.field_size_limit(sys.maxsize)

# The Spark context is created at import time and shared by the whole module.
conf = SparkConf().setAppName('Elephas_App').setMaster('local[8]')
sc = SparkContext(conf=conf)


def _parse_bool(value):
    """Convert a command-line token into a real boolean.

    ``type=bool`` must not be used with argparse: ``bool('False')`` is
    ``True`` because any non-empty string is truthy.

    Raises:
        argparse.ArgumentTypeError: if *value* is not a recognised boolean.
    """
    text = value.strip().lower()
    if text in ('1', 'true', 't', 'yes', 'y', 'on'):
        return True
    if text in ('', '0', 'false', 'f', 'no', 'n', 'off'):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value, got {!r}'.format(value))


def main():
    """Parse the command line and run the requested sub-command."""
    parser = argparse.ArgumentParser(
        description='Training and test tool of charactor-level ConvNet text categorisation.')

    subparsers = parser.add_subparsers(dest='sub_command')

    parser_train = subparsers.add_parser(
        'train',
        help='Training on the dataset, dataset must be a *.csv file. '
             'A model will be created in the same directory.')
    # Still takes an explicit value (e.g. ``-s True``) as before, but the
    # value is now actually interpreted instead of always being truthy.
    parser_train.add_argument('-s', '--spark', type=_parse_bool,
                              default=False,
                              help='Training on cluster.', dest='train_spark')
    parser_train.add_argument('train_path', metavar='PATH', type=str,
                              help='Path of the training dataset.')
    parser_train.add_argument('-ms', '--maxsize', metavar='SIZE',
                              dest='train_maxsize', type=int,
                              help='Set maximum input size of ConvNet, default 1024.')
    parser_train.add_argument('-e', '--epochs', metavar='N',
                              dest='train_epochs', type=int,
                              help='Number of training epochs (iterations), default 50.')

    parser_test = subparsers.add_parser(
        'test',
        help='Test on the dataset, dataset must be a directory with *.csv '
             'dataset named by corresponding language.')
    parser_test.add_argument('test_root', metavar='ROOT', type=str,
                             help='Root of the test dataset.')

    if not sys.argv[1:]:
        parser.print_help()
        parser.exit()

    args = parser.parse_args()

    if args.sub_command == 'train':
        # Only forward the options that were given (and non-zero) so that
        # the defaults declared by CNN.__init__ apply otherwise.
        options = {}
        if args.train_maxsize:
            options['maxsize'] = args.train_maxsize
        if args.train_epochs:
            options['epochs'] = args.train_epochs
        n = CNN(args.train_path, **options)
        if args.train_spark:
            n.train_on_cluster()
        else:
            n.train()
    elif args.sub_command == 'test':
        n = CNN(args.test_root)
        n.test()
    else:
        # parse_args expects a list of tokens; a bare string would be
        # iterated character by character.
        parser.parse_args(['-h'])


class CNN:
    """Character-level convolutional network for language classification.

    Args:
        path: training CSV file (``train``) or root directory holding one
            ``<language>.csv`` file per language (``test``).  The model and
            result files are placed next to it.
        maxsize: number of tokens fed to the network per sample.
        epochs: maximum number of training epochs.
    """

    def __init__(self, path, maxsize=1024, epochs=50):
        self._path = path

        # Root of model folder
        self._root_model = os.path.join(os.path.dirname(path), 'model_cnn')
        try:
            os.mkdir(self._root_model)
        except FileExistsError:
            pass
        except OSError as err:
            # Stay best-effort (the directory is only needed when a model is
            # saved or loaded) but do not hide the reason any more.
            print('Warning: cannot create {}: {}'.format(self._root_model, err),
                  file=sys.stderr)

        # Path of result
        self._path_result = os.path.join(os.path.dirname(path), 'result_cnn')

        dir_path = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(dir_path, 'static_data', 'languages.json'), 'r') as f:
            self._languages = json.load(f)

        self._path_test_csv = path

        self._input_size = maxsize
        self._vocab_size = 256
        self._num_of_classes = len(self._languages)
        self._batch_size = 128
        self._epochs = epochs

    def file_len(self, fname):
        """Return the number of lines in the text file *fname*."""
        with open(fname) as f:
            return sum(1 for _ in f)

    def _read_samples(self, path):
        """Yield ``(label, tokens)`` for every row of the dataset at *path*.

        Rows are space-delimited, ``|``-quoted pairs: an integer label and a
        Python literal that is decoded with ``literal_eval`` and then
        tokenised per letter.  Every token id is shifted by one, as all
        callers did before this helper existed (presumably to keep 0 for
        padding; confirm against ``tokenizer``).
        """
        with open(path, newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=' ', quotechar='|')
            for label, string in reader:
                label = int(label)
                string = literal_eval(string)
                yield label, [x + 1 for x in tokenizer(string, 'letter')]

    def train(self):
        """Train locally from the CSV generator and save ``model.h5``.

        Raises:
            ValueError: if the training file contains no row.
        """
        total_rows = self.file_len(self._path)
        if total_rows == 0:
            raise ValueError('Training dataset {} is empty.'.format(self._path))

        model = self._get_model()
        earlystop = EarlyStopping(monitor='loss', min_delta=0, patience=2,
                                  verbose=0, mode='auto')
        callbacks = [earlystop]
        model.fit_generator(
            self._generator(self._input_size, self._num_of_classes,
                            self._batch_size),
            # A step count must be a whole number; round up so a trailing
            # partial batch's worth of rows still counts as a step.
            steps_per_epoch=math.ceil(total_rows / self._batch_size),
            epochs=self._epochs,
            callbacks=callbacks)
        model.save(os.path.join(self._root_model, 'model.h5'))

    def train_on_cluster(self):
        """Train through Elephas on the module-level Spark context and save
        ``model.h5``."""
        rdd = self._get_train_rdd()
        model = self._get_model()
        adagrad = elephas_optimizers.Adagrad()

        spark_model = SparkModel(sc, model, optimizer=adagrad,
                                 frequency='epoch', mode='asynchronous',
                                 num_workers=2)
        spark_model.train(rdd, nb_epoch=self._epochs,
                          batch_size=self._batch_size, verbose=0,
                          categorical=True, nb_classes=self._num_of_classes)

        model.save(os.path.join(self._root_model, 'model.h5'))

    def _get_train_rdd(self):
        """Load the whole training CSV in memory and return it as an RDD of
        labeled points."""
        print('Prepairing RDD for training...')
        token_rows = []
        labels = []
        for label, tokens in self._read_samples(self._path):
            print(label, end='\r')
            token_rows.append(tokens)
            labels.append(label)

        if token_rows:
            # Convert once instead of np.append per row (which re-copies the
            # whole array each time); float64 matches what np.append onto an
            # np.empty array used to produce.
            X_train = np.asarray(
                pad_sequences(token_rows, maxlen=self._input_size),
                dtype=np.float64)
            Y_train = np.asarray(
                np_utils.to_categorical(labels, self._num_of_classes),
                dtype=np.float64)
        else:
            X_train = np.empty((0, self._input_size))
            Y_train = np.empty((0, self._num_of_classes))

        rdd = to_labeled_point(sc, X_train, Y_train, categorical=True)
        # The RDD used to be built and then dropped, so the caller got None.
        return rdd

    def _generator(self, length, total_class, batch_size=128):
        """Endlessly yield ``(X, Y)`` batches read from the training CSV.

        Only full batches are yielded; rows left over at the end of a pass
        are completed with the first rows of the next pass.

        Args:
            length: number of tokens per sample after padding/truncation.
            total_class: width of the one-hot label vectors.
            batch_size: number of samples per yielded batch.

        Raises:
            ValueError: if a pass over the file yields no row at all (the
                loop would otherwise spin forever without producing data).
        """
        batch_tokens = []
        batch_labels = []
        while True:
            rows_seen = 0
            for label, tokens in self._read_samples(self._path):
                rows_seen += 1
                batch_tokens.append(tokens)
                batch_labels.append(label)
                if len(batch_tokens) == batch_size:
                    X = np.asarray(pad_sequences(batch_tokens, maxlen=length),
                                   dtype=np.float64)
                    Y = np.asarray(
                        np_utils.to_categorical(batch_labels, total_class),
                        dtype=np.float64)
                    batch_tokens = []
                    batch_labels = []
                    yield (X, Y)
            if rows_seen == 0:
                raise ValueError(
                    'Training dataset {} is empty.'.format(self._path))

    def _get_model(self):
        """Build and compile the ConvNet; return the Keras model."""
        input_size = self._input_size
        alphabet_size = self._vocab_size
        embedding_size = 256
        # One tuple per convolution layer: the first two values are passed
        # positionally to Convolution1D, the third is the max-pooling size
        # (-1 means no pooling after that layer).
        conv_layers = [(256, 7, 3), (256, 7, 3), (256, 3, -1),
                       (256, 3, -1), (256, 3, -1), (256, 3, 3)]
        threshold = 1e-6
        fully_connected_layers = [1024, 1024]
        dropout_p = 0.2
        optimizer = 'adam'
        loss = 'categorical_crossentropy'
        num_of_classes = self._num_of_classes

        # Input layer
        inputs = Input(shape=(input_size,), name='sent_input', dtype='int64')

        # Embedding layers (one extra row because token ids are shifted by 1)
        x = Embedding(alphabet_size + 1, embedding_size,
                      input_length=input_size)(inputs)

        # Convolution layers
        for filters, kernel_size, pool_size in conv_layers:
            x = Convolution1D(filters, kernel_size)(x)
            x = ThresholdedReLU(threshold)(x)
            if pool_size != -1:
                x = MaxPooling1D(pool_size)(x)
        x = Flatten()(x)

        # Fully connected layers
        for units in fully_connected_layers:
            x = Dense(units)(x)
            x = ThresholdedReLU(threshold)(x)
            x = Dropout(dropout_p)(x)

        # Output layer
        predictions = Dense(num_of_classes, activation='softmax')(x)

        # Build and compile model
        model = Model(inputs=inputs, outputs=predictions)
        model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
        # summary() prints by itself and returns None; wrapping it in print()
        # only added a stray "None" line.
        model.summary()
        return model

    def _max_len(self, texts):
        """Return the length of the longest element of *texts*."""
        return max(len(text) for text in texts)

    def test(self):
        """Evaluate the saved model on every language not yet in the result
        file, storing the results after each language."""
        try:
            # NOTE(review): pickle must only be used on trusted files; this
            # one is expected to have been written by a previous run.
            with open(self._path_result, 'rb') as f:
                test_result = load(f)
        except FileNotFoundError:
            test_result = {}

        model = self._load_model()
        pending = [x for x in self._languages if x not in test_result]
        for language in pending:
            test_result[language] = self.test_class(model, language)
            # Checkpoint after each language so an interrupted run resumes.
            with open(self._path_result, 'wb') as f:
                dump(test_result, f)

    def _load_model(self):
        """Load ``model.h5`` from the model directory."""
        return load_model(os.path.join(self._root_model, 'model.h5'))

    def _count_size(self, files):
        """Return the cumulated size in bytes of the paths in *files*."""
        return sum(os.path.getsize(f) for f in files)

    def test_class(self, model, language):
        """Classify every sample of ``<language>.csv`` with *model*.

        Returns:
            ``(ok, total_test, accuracy, results)`` where *results* holds the
            best ``(score, language)`` guess of each sample and *accuracy* is
            ``0.0`` when the file is empty.
        """
        ok = 0
        results = []
        count = 0
        csv_path = os.path.join(self._path_test_csv, language + '.csv')
        total_test = self.file_len(csv_path)

        for label, tokens in self._read_samples(csv_path):
            result = self._guess_file_language(model, tokens)
            count += 1
            print('[{0:4d}/{1:4d}] {2}:{3} '.format(
                count, total_test, result[0][1], result[0][0]), end='\r')
            results.append(result[0])
            if result[0][1] == language:
                ok += 1

        # An empty test file used to raise ZeroDivisionError here.
        accuracy = ok / total_test if total_test else 0.0
        print('Tests for {} '.format(language))
        print('Total test files : {}'.format(total_test))
        print('Correctly classified files : {}'.format(ok))
        print('Accuracy : {}%'.format(accuracy * 100))
        return (ok, total_test, accuracy, results)

    def speed_benchmark(self, language=None):
        """Time :meth:`test_class` on one language and print the throughput.

        Args:
            language: language to benchmark; defaults to the eleventh entry
                of the language list, as before.
        """
        if language is None:
            language = self._languages[10]

        model = self._load_model()

        # The previous code called a ``_get_test_set`` method that this class
        # does not define; measure the CSV file that test_class reads instead.
        test_set = [os.path.join(self._path_test_csv, language + '.csv')]
        total_size = self._count_size(test_set)
        print('{} kB in total'.format(total_size / 1024))

        t_start = time.perf_counter()
        self.test_class(model, language)
        t_end = time.perf_counter()

        print('{} seconds.'.format(t_end - t_start))
        if total_size:
            print('{} seconds per KiB'.format(
                ((t_end - t_start) / total_size) * 1024))

    def _guess_file_language(self, model, tokens):
        """Return ``(score, language)`` pairs for *tokens*, best guess first."""
        X = pad_sequences([tokens], maxlen=self._input_size)
        result = list(model.predict(X))[0]
        result = [(s, self._languages[i]) for i, s in enumerate(result)]
        return sorted(result, reverse=True)


if __name__ == '__main__':
    main()