diff --git a/scripts/comparison.pdf b/scripts/comparison.pdf
index 1817b22..0107d69 100644
Binary files a/scripts/comparison.pdf and b/scripts/comparison.pdf differ
diff --git a/scripts/draw_accuracy.py b/scripts/draw_accuracy.py
index ad3a51a..680b387 100644
--- a/scripts/draw_accuracy.py
+++ b/scripts/draw_accuracy.py
@@ -1,144 +1,145 @@
 #!/bin/bash/python3
-import sys
+import sys, os
 from pickle import load
 from collections import namedtuple, Counter
 
 try:
     import numpy as np
     import matplotlib.pyplot as plt
     from matplotlib.ticker import MaxNLocator
 except ImportError:
     raise ImportError('Please install matplotlib')
 
 def heatmap(path):
     with open(path, 'rb') as f:
         data = load(f)
     mat = process(data)
     labels = sorted(data)
 
     fig, ax = plt.subplots()
     fig.set_size_inches(100,100)
     heatmap = ax.matshow(mat, cmap='Blues')
     fig = plt.gcf()
     ax.set_frame_on(False)
     ax.set_yticks(np.arange(len(labels)), minor=False)
     ax.set_xticks(np.arange(len(labels)), minor=False)
     ax.set_xlabel('Classification of test files')
     ax.set_ylabel('Ground truth class of test files')
     ax.set_xticklabels(labels, minor=False)
     ax.set_yticklabels(labels, minor=False)
     ax.xaxis.tick_top()
     ax.xaxis.set_label_position('top')
     plt.xticks(rotation=90)
     ax.grid(False)
 
     '''
     for i in np.arange(len(mat)):
         for j in np.arange(len(mat[i])):
             ax.text(i, j, "%.1f" % (mat[i][j] * 100), color='white')
     '''
 
     ax = plt.gca()
     for t in ax.xaxis.get_major_ticks():
         t.tick1On = False
         t.tick2On = False
     for t in ax.yaxis.get_major_ticks():
         t.tick1On = False
         t.tick2On = False
 
     fig.savefig("results.pdf", bbox_inches='tight')
 
 def process(data):
     '''
     '''
     ldata = sorted(data)
     length = len(ldata)
     out = [[0 for x in range(length)] for y in range(length)]
     for lang in ldata:
         index_lan = ldata.index(lang)
         ok = data[lang][0]
         if data[lang][1] > 1000 :
             test_size = 1000
         else:
             test_size = data[lang][1]
         result = [x[1] for x in data[lang][3]]
         counter = dict(Counter(result))
         for res_lan in counter.keys():
             index_res = ldata.index(res_lan)
             out[index_lan][index_res] = counter.get(res_lan, 0) / test_size
     return out
 
 def get_accuracy(data):
     ldata = sorted(data)
     out = {}
     for lang in ldata:
         ok = data[lang][0]
         if data[lang][1] > 1000:
             test_size = 1000
         else:
             test_size = data[lang][1]
         result = [x[1] for x in data[lang][3]]
         counter = dict(Counter(result))
         out[lang] = counter.get(lang, 0) / test_size
     return out
 
-def compare(baseline, target1, target2):
-    with open(baseline, 'rb') as f:
-        data = load(f)
-    dict_base = get_accuracy(data)
-
-    with open(target1, 'rb') as f:
-        data = load(f)
-    dict_targ1 = get_accuracy(data)
-
-    with open(target2, 'rb') as f:
-        data = load(f)
-    dict_targ2 = get_accuracy(data)
+def compare(results):
+
+    datas = []
+    for result in results:
+        with open(result, 'rb') as f:
+            datas.append(load(f))
+
+    dicts = []
+    for data in datas:
+        dicts.append(get_accuracy(data))
 
-    all_lang = sorted(list(set().union(dict_base.keys(),dict_targ1.keys())))[::-1]
+    all_lang = sorted(list(set().union(dicts[0].keys(),dicts[1].keys())))[::-1]
     n = len(all_lang)
-    acc_base = [dict_base.get(lang, 0) for lang in all_lang]
-    acc_targ1 = [dict_targ1.get(lang, 0) for lang in all_lang]
-    acc_targ2 = [dict_targ2.get(lang, 0) for lang in all_lang]
+    accs = []
+    for d in dicts:
+        accs.append([d.get(lang, 0) for lang in all_lang])
+
     fig, ax = plt.subplots()
-    fig.set_size_inches(10,250)
+    fig.set_size_inches(10, 75 * len(results))
     ind = np.arange(n)
-    width = 0.25
+    width = 0.75 / len(results)
     opacity = 0.4
-    rects1 = ax.barh(ind + 1.5 * width, acc_base, width, alpha=opacity, color='b', label='N-grams with frequency distance')
-    rects2 = ax.barh(ind + 0.5 * width, acc_targ1, width, alpha=opacity, color='r', label='N-grams with probability')
-    rects3 = ax.barh(ind - 0.5 * width, acc_targ2, width, alpha=opacity, color='y', label='Multinominal Naive Bayes')
+
+    rectss = []
+    colors = ['b', 'r', 'c', 'm', 'y', 'g']
+
+    for idx, result in enumerate(results):
+        rectss.append(ax.barh(ind - (idx - len(results) / 2) * width, accs[idx], width, alpha=opacity, color=colors[idx % len(colors)], label=os.path.basename(result)))
 
     ax.set_xlabel('Accuracy / %')
     ax.set_yticks(ind + width / 2)
     ax.set_yticklabels(all_lang)
     vals = ax.get_xticks()
     ax.set_xticklabels(['{:3.0f}%'.format(x * 100) for x in vals])
     ax.xaxis.tick_top()
     ax.legend()
 
     def autolabel(rects):
         for rect in rects:
             width = rect.get_width()
             ax.text(width + 0.01, rect.get_y() + rect.get_height() / 2., '{0:.1f}%'.format(width * 100), ha='left', va='center')
 
-    autolabel(rects1)
-    autolabel(rects2)
-    autolabel(rects3)
+    for rects in rectss:
+        autolabel(rects)
 
     plt.ylim([-1,n+1])
     fig.tight_layout()
     fig.savefig("comparison.pdf", bbox_inches='tight')
 
 if __name__ == '__main__':
     if len(sys.argv) == 2:
         heatmap(sys.argv[1])
-    elif len(sys.argv) == 4:
-        compare(sys.argv[1],sys.argv[2],sys.argv[3])
+    elif len(sys.argv) > 2:
+        compare(sys.argv[1:])
     else:
         print('Please check arguments.')
diff --git a/swh/langdetect/cnn_w.py b/swh/langdetect/cnn_w.py
index 4dad105..6500e74 100644
--- a/swh/langdetect/cnn_w.py
+++ b/swh/langdetect/cnn_w.py
@@ -1,300 +1,302 @@
 import os
 import sys
 import subprocess
 import time
 import random
 import csv
 import numpy as np
 import warnings
 
 with warnings.catch_warnings():
     warnings.simplefilter("ignore")
     import tensorflow as tf
 
 import json
 import argparse
 
 from ast import literal_eval
 from pickle import dump
 from pickle import load
 from numpy import array
 from .utils.common import tokenizer
 from .utils.common import file_to_string
 from keras.preprocessing.sequence import pad_sequences
 from keras.callbacks import EarlyStopping
 from keras.models import Model
 from keras.models import Sequential
 from keras.models import load_model
 from keras.layers import Input
 from keras.layers import Dense
 from keras.layers import Flatten
 from keras.layers import Merge
 from keras.layers import Dropout
 from keras.layers import ThresholdedReLU
 from keras.layers import Activation
 from keras.layers import Lambda
 from keras.layers import Embedding
 from keras.layers.convolutional import Convolution1D
 from keras.layers.convolutional import MaxPooling1D
 from keras.layers.normalization import BatchNormalization
 from keras.layers import Concatenate
 from keras.utils import np_utils
 from keras.optimizers import SGD
 from collections import Counter
 
+csv.field_size_limit(sys.maxsize)
+
 from keras import backend as K
 K.set_session(K.tf.Session(config=K.tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)))
 
 def main():
     parser = argparse.ArgumentParser(description='Training and test tool of charactor-level ConvNet text categorisation.')
     subparsers = parser.add_subparsers(dest='sub_command')
 
     parser_train = subparsers.add_parser('train', help='Training on the dataset, dataset must be a *.csv file. A model will be created in the same directory.')
     parser_train.add_argument('train_path', metavar='PATH', type=str, help='Path of the training dataset.')
     parser_train.add_argument('-ms', '--maxsize', metavar='SIZE', dest='train_maxsize', type=int, help='Set maximum input size of ConvNet, default 1024.')
     parser_train.add_argument('-e', '--epochs', metavar='N', dest='train_epochs', type=int, help='Number of training epochs (iterations), default 50.')
     parser_test = subparsers.add_parser('test', help='Test on the dataset, dataset must be a directory with *.csv dataset named by corresponding language.')
     parser_test.add_argument('test_root', metavar='ROOT', type=str, help='Root of the test dataset.')
 
     if len(sys.argv[1:]) == 0:
         parser.print_help()
         parser.exit()
     args = parser.parse_args()
 
     if args.sub_command == "train":
         if args.train_maxsize:
             if args.train_epochs:
                 n = CNNword(args.train_path, maxsize=args.train_maxsize, epochs=args.train_epochs)
                 n.train()
             else:
                 n = CNNword(args.train_path, maxsize=args.train_maxsize)
                 n.train()
         else:
             if args.train_epochs:
                 n = CNNword(args.train_path, epochs=args.train_epochs)
                 n.train()
             else:
                 n = CNNword(args.train_path)
                 n.train()
     elif args.sub_command == "test":
         n = CNNword(args.test_root)
         print(args.test_root)
         n.test()
     else:
         parser.parse_args('-h')
 
 
 class CNNword:
 
     def __init__(self, path, maxsize=768, epochs=30):
         self._path = path
 
         # Root of model folder
         self._root_model = os.path.join(os.path.dirname(path), 'model_cnn_word')
         try:
             os.mkdir(self._root_model)
         except:
             pass
 
         # Path of result
         self._path_result = os.path.join(os.path.dirname(path), 'result_cnn_word')
 
         dir_path = os.path.dirname(os.path.abspath(__file__))
         with open(os.path.join(dir_path, 'static_data', 'languages.json'), 'r') as f:
             self._languages = json.load(f)
 
         self._path_test_csv = path
         self._path_vocab = os.path.join(self._root_model, 'vocab')
 
         self._input_size = maxsize
         self._vocab_size = 20001
         self._num_of_classes = len(self._languages)
         self._batch_size = 64
         self._epochs = epochs
 
         if not os.path.isfile(self._path_vocab):
             self._learn_vocab(self._input_size, self._num_of_classes)
         with open(self._path_vocab, 'rb') as f:
             c = load(f)
         l = c.most_common(20000)
         print(l)
         self._indexer = dict((v[0], i + 1) for i, v in enumerate(l))
         self._oov_index = len(self._indexer) + 1
 
     def file_len(self, fname):
         with open(fname) as f:
             count = 0
             for l in f:
                 count += 1
         return count
 
     def train(self):
         model = self._get_model()
         earlystop = EarlyStopping(monitor='loss', min_delta=0, patience=2, verbose=0, mode='auto')
         callbacks = [earlystop]
         model.fit_generator(
             self._generator(self._input_size, self._num_of_classes, self._batch_size),
             steps_per_epoch=self.file_len(self._path) / self._batch_size,
             epochs=self._epochs,
             callbacks=callbacks)
         model.save(os.path.join(self._root_model, 'model.h5'))
 
     def _learn_vocab(self, length, total_class):
         c = Counter()
         with open(self._path, newline='') as csvfile:
             r = csv.reader(csvfile, delimiter=' ', quotechar='|')
             for pair in r:
                 label, string = pair
                 label = int(label)
                 print(label, end='\r')
                 string = literal_eval(string)
                 tokens = tokenizer(string, 'word')
                 c.update(tokens)
         with open(self._path_vocab, 'wb') as f:
             dump(c, f)
 
     def _generator(self, length, total_class, batch_size=64):
         counter = 0
         while True:
             with open(self._path, newline='') as csvfile:
                 r = csv.reader(csvfile, delimiter=' ', quotechar='|')
                 for pair in r:
                     if counter == 0:
                         X = np.empty((0, length))
                         Y = np.empty((0, total_class))
                     label, string = pair
                     label = int(label)
                     string = literal_eval(string)
                     tokens = [self._indexer.get(x, self._oov_index) for x in tokenizer(string, 'word')]
                     X = np.append(X, pad_sequences([tokens], maxlen=length), axis=0)
                     label = array(np_utils.to_categorical([label], total_class))
                     Y = np.append(Y, label, axis=0)
                     counter += 1
                     if counter == batch_size:
                         counter = 0
                         yield(X,Y)
 
     def _get_model(self):
         input_size = self._input_size
         vocab_size = self._vocab_size
         embedding_size = 128
         optimizer = 'adam'
         loss = 'categorical_crossentropy'
         num_of_classes = self._num_of_classes
 
         embedding_layer = Embedding(vocab_size + 1,
                                     embedding_size,
                                     input_length=input_size,
                                     # trainable=False,
                                     )
 
         # applying a more complex convolutional approach
         convs = []
         filter_sizes = [3,4,5]
 
         sequence_input = Input(shape=(input_size,), dtype='int64')
         embedded_sequences = embedding_layer(sequence_input)
 
         for fsz in filter_sizes:
-            l_conv = Convolution1D(filters=32, kernel_size=fsz, activation='relu')(embedded_sequences)
+            l_conv = Convolution1D(filters=10, kernel_size=fsz, activation='relu')(embedded_sequences)
             l_pool = MaxPooling1D(5)(l_conv)
             convs.append(l_pool)
 
         l_merge = Concatenate(axis=1)(convs)
 
-        l_cov1= Convolution1D(128, 5, activation='relu')(l_merge)
-        l_pool1 = MaxPooling1D(5)(l_cov1)
-        l_cov2 = Convolution1D(128, 5, activation='relu')(l_pool1)
-        l_pool2 = MaxPooling1D(5)(l_cov2)
+        l_conv1= Convolution1D(128, 5, activation='relu')(l_merge)
+        l_pool1 = MaxPooling1D(5)(l_conv1)
+        l_conv2 = Convolution1D(128, 5, activation='relu')(l_pool1)
+        l_pool2 = MaxPooling1D(5)(l_conv2)
         l_flat = Flatten()(l_pool2)
         l_dense = Dense(512, activation='relu')(l_flat)
         preds = Dense(num_of_classes, activation='softmax')(l_dense)
 
         model = Model(sequence_input, preds)
         model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
         print(model.summary())
         return model
 
     def _max_len(self, texts):
         return max([len(text) for text in texts])
 
     def test(self):
         csv.field_size_limit(sys.maxsize)
         try:
             r = open(self._path_result, 'rb')
             test_result = load(r)
             r.close()
         except FileNotFoundError:
             test_result = {}
 
         model = self._load_model()
 
         for language in [x for x in self._languages if x not in test_result.keys()]:
             test_result[language] = self.test_class(model, language)
             with open(self._path_result, 'wb') as f:
                 dump(test_result, f)
 
     def _load_model(self):
         model = load_model(os.path.join(self._root_model, 'model.h5'))
         return model
 
     def _count_size(self, files):
         size = 0
         for f in files:
             size += os.path.getsize(f)
         return size
 
     def test_class(self, model, language):
         ok = 0
         results = []
         count = 0
         total_test = self.file_len(os.path.join(self._path_test_csv, language + '.csv'))
 
         with open(os.path.join(self._path_test_csv, language + '.csv'), newline='') as csvfile:
             r = csv.reader(csvfile, delimiter=' ', quotechar='|')
             for pair in r:
                 label, string = pair
                 label = int(label)
                 string = literal_eval(string)
                 tokens = [self._indexer.get(x, self._oov_index) for x in tokenizer(string, 'word')]
                 result = self._guess_file_language(model, tokens)
                 count += 1
                 print('[{0:4d}/{1:4d}] {2}:{3} '.format(count, total_test, result[0][1], result[0][0]),end='\r')
                 results.append(result[0])
                 if result[0][1] == language:
                     ok += 1
 
         accuracy = ok / total_test
         print('Tests for {} '.format(language))
         print('Total test files : {}'.format(total_test))
         print('Correctly classified files : {}'.format(ok))
         print('Accuracy : {}%'.format(accuracy * 100))
         return (ok, total_test, accuracy, results)
 
     def speed_benchmark(self):
         language = self._languages[10]
         model = self._load_model()
 
         test_set = self._get_test_set(language)
         total_size = self._count_size(test_set)
         print('{} kB in total'.format(total_size / 1024))
 
         t_start = time.perf_counter()
         self.test_class(model, language)
         t_end = time.perf_counter()
 
         print('{} seconds.'.format(t_end - t_start))
         print('{} seconds per KiB'.format(((t_end - t_start) / total_size) * 1024))
 
     def _guess_file_language(self, model, tokens):
         X = pad_sequences([tokens], maxlen=self._input_size)
         result = list(model.predict(X))[0]
         result = [(s, self._languages[i]) for i, s in enumerate(result)]
         return sorted(result, reverse=True)
 
 
 if __name__ == '__main__':
     main()
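
Illustration (not part of the patch): the rewritten compare() accepts any number of result pickles and draws one horizontal bar series per input instead of three hard-coded ones. A minimal standalone sketch of that plotting loop, with invented result names and accuracy values standing in for the unpickled get_accuracy() output, could look roughly like this:

    import os
    import numpy as np
    import matplotlib.pyplot as plt

    # Hypothetical accuracy dicts; real ones come from get_accuracy()
    # applied to each unpickled result file passed on the command line.
    results = ['result_a', 'result_b', 'result_c']
    dicts = [
        {'C': 0.91, 'Python': 0.88, 'Java': 0.90},
        {'C': 0.85, 'Python': 0.92, 'Java': 0.87},
        {'C': 0.95, 'Python': 0.90, 'Java': 0.93},
    ]

    all_lang = sorted(set().union(*[d.keys() for d in dicts]))[::-1]
    n = len(all_lang)
    accs = [[d.get(lang, 0) for lang in all_lang] for d in dicts]

    fig, ax = plt.subplots()
    ind = np.arange(n)
    width = 0.75 / len(results)          # bars get thinner as series are added
    colors = ['b', 'r', 'c', 'm', 'y', 'g']

    for idx, result in enumerate(results):
        ax.barh(ind - (idx - len(results) / 2) * width, accs[idx], width,
                alpha=0.4, color=colors[idx % len(colors)],
                label=os.path.basename(result))

    ax.set_yticks(ind + width / 2)
    ax.set_yticklabels(all_lang)
    ax.legend()
    fig.savefig('comparison_demo.pdf', bbox_inches='tight')

Under the new __main__ branch the same behaviour is reached by passing two or more result paths to the script (one path still produces the heatmap); the file names above are placeholders, not names used by the repository.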
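
Illustration of the word-level encoding used by CNNword (a sketch with a toy token list and a hand-rolled pre-padding helper standing in for keras pad_sequences): the class keeps the 20,000 most common word tokens, indexes them from 1, and maps everything else to a single out-of-vocabulary index before padding to the fixed input size.

    from collections import Counter

    # Hypothetical tokenized files; the real class builds its Counter from
    # tokenizer(string, 'word') over every row of the training CSV.
    tokens_per_file = [
        ['def', 'main', '(', ')', ':'],
        ['int', 'main', '(', 'void', ')'],
    ]

    vocab = Counter()
    for toks in tokens_per_file:
        vocab.update(toks)

    # Same construction as CNNword.__init__: most common words indexed from 1,
    # with one extra index reserved for out-of-vocabulary tokens.
    most_common = vocab.most_common(20000)
    indexer = {word: i + 1 for i, (word, _) in enumerate(most_common)}
    oov_index = len(indexer) + 1

    def encode(tokens, maxlen):
        # Map tokens to indices, then left-truncate/left-pad with zeros,
        # mimicking the default 'pre' behaviour of keras pad_sequences.
        ids = [indexer.get(t, oov_index) for t in tokens]
        ids = ids[-maxlen:]
        return [0] * (maxlen - len(ids)) + ids

    print(encode(['void', 'main', '(', ')', '{', '}'], maxlen=10))

The resulting integer sequence is what _generator feeds to the embedding layer during training and what _guess_file_language passes to model.predict at test time.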