
diff --git a/swh/langdetect/cnn.py b/swh/langdetect/cnn.py
index 30aa209..06d00eb 100644
--- a/swh/langdetect/cnn.py
+++ b/swh/langdetect/cnn.py
@@ -1,247 +1,258 @@
import os
import sys
import subprocess
import time
import random
import csv
import numpy as np
import warnings
-import tensorflow as tf
+with warnings.catch_warnings():
+    # Silence TensorFlow's import-time warnings; catch_warnings() alone only
+    # restores the filter state afterwards, so an explicit filter is needed.
+    warnings.simplefilter('ignore')
+    import tensorflow as tf
import json
import argparse
from ast import literal_eval
from pickle import dump
from pickle import load
from numpy import array
from utils.common import tokenizer
from utils.common import file_to_string
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import ThresholdedReLU
from keras.layers import Activation
from keras.layers import Lambda
from keras.layers import Embedding
from keras.layers.convolutional import Convolution1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.normalization import BatchNormalization
from keras.layers.merge import concatenate
from keras.utils import np_utils
from keras.optimizers import SGD

def main():
    parser = argparse.ArgumentParser(description='Training and test tool for character-level ConvNet text categorisation.')
    subparsers = parser.add_subparsers()
    parser_train = subparsers.add_parser('train', help='Training on the dataset, dataset must be a *.csv file. A model will be created in the same directory.')
    parser_train.add_argument('train_path', metavar='PATH', type=str, help='Path of the training dataset.')
-    parser_train.add_argument('-ms', '--maxsize', metavar='SIZE', dest='train_maxsize', type=int, help='Set maximum input size of ConvNet.')
+    parser_train.add_argument('-ms', '--maxsize', metavar='SIZE', dest='train_maxsize', type=int, help='Set maximum input size of ConvNet, default 1024.')
+    parser_train.add_argument('-e', '--epochs', metavar='N', dest='train_epochs', type=int, help='Number of training epochs (iterations), default 50.')
    parser_test = subparsers.add_parser('test', help='Test on the dataset, dataset must be a directory with *.csv datasets named after the corresponding language.')
    parser_test.add_argument('test_root', metavar='ROOT', type=str, help='Root of the test dataset.')
    args = parser.parse_args()
    print(args)
    if args.train_path:
        if args.train_maxsize:
-            n = CNN(args.train_path, args.train_maxsize)
-            n.train()
+            if args.train_epochs:
+                n = CNN(args.train_path, maxsize=args.train_maxsize, epochs=args.train_epochs)
+                n.train()
+            else:
+                n = CNN(args.train_path, maxsize=args.train_maxsize)
+                n.train()
        else:
-            n = CNN(args.train_path)
-            n.train()
+            if args.train_epochs:
+                n = CNN(args.train_path, epochs=args.train_epochs)
+                n.train()
+            else:
+                n = CNN(args.train_path)
+                n.train()
    elif args.test_root:
        n = CNN(args.test_root)
        n.test()
    else:
        parser.parse_args(['-h'])

class CNN:
-    def __init__(self, path, maxsize=1024):
+    def __init__(self, path, maxsize=1024, epochs=50):
        self._path = path
        # Root of model folder
        self._root_model = os.path.join(os.path.dirname(path), 'model_cnn')
        try:
            os.mkdir(self._root_model)
        except:
            pass
        # Path of result
        self._path_result = os.path.join(os.path.dirname(path), 'result_cnn')
        with open('static_data/languages.json', 'r') as f:
            self._languages = json.load(f)
        self._path_test_csv = path
        self._input_size = maxsize
        self._vocab_size = 256
        self._num_of_classes = len(self._languages)
        self._batch_size = 128
+        self._epochs = epochs

    def file_len(self, fname):
        with open(fname) as f:
            count = 0
            for l in f:
                count += 1
            return count

    def train(self):
        model = self._get_model()
        model.fit_generator(
            self._generator(self._input_size, self._num_of_classes, self._batch_size),
-            steps_per_epoch=self.file_len(self._path) / self._batch_size, epochs=50)
+            steps_per_epoch=self.file_len(self._path) / self._batch_size, epochs=self._epochs)
        model.save(os.path.join(self._root_model, 'model.h5'))

    def _generator(self, length, total_class, batch_size=128):
        counter = 0
        while True:
            with open(self._path, newline='') as csvfile:
                r = csv.reader(csvfile, delimiter=' ', quotechar='|')
                for pair in r:
                    if counter == 0:
                        X = np.empty((0, length))
                        Y = np.empty((0, total_class))
                    label, string = pair
                    label = int(label)
                    string = literal_eval(string)
                    tokens = [x + 1 for x in tokenizer(string, 'letter')]
                    X = np.append(X, pad_sequences([tokens], maxlen=length), axis=0)
                    label = array(np_utils.to_categorical([label], total_class))
                    Y = np.append(Y, label, axis=0)
                    counter += 1
                    if counter == batch_size:
                        counter = 0
                        yield (X, Y)
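    # Note on batch shapes (inferred from the code above, not printed anywhere):
    # each yielded pair is X of shape (batch_size, length) holding token codes
    # shifted by +1 so that 0 stays reserved for the padding added by
    # pad_sequences, and Y of shape (batch_size, total_class) holding one-hot
    # labels. Rows left over when the CSV length is not a multiple of
    # batch_size are simply carried into the next pass over the file.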

    def _get_model(self):
        input_size = self._input_size
        alphabet_size = self._vocab_size
        embedding_size = 256
        conv_layers = [(256,7,3), (256,7,3), (256,3,-1), (256,3,-1), (256,3,-1), (256,3,3)]
        threshold = 1e-6
        fully_connected_layers = [1024, 1024]
        dropout_p = 0.2
        optimizer = 'adam'
        loss = 'categorical_crossentropy'
        num_of_classes = self._num_of_classes
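        # The (filters, kernel_size, pool_size) triples in conv_layers, with -1
        # meaning "no pooling", follow the usual character-level ConvNet recipe
        # (cf. Zhang et al., 2015): six Convolution1D blocks, then two
        # Dense(1024) layers with dropout. With the default input_size of 1024
        # the Flatten() output below works out to 34 * 256 = 8704 features
        # (rough figure derived by hand, not checked against model.summary()).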
        # Input layer
        inputs = Input(shape=(input_size,), name='sent_input', dtype='int64')
        # Embedding layers
        x = Embedding(alphabet_size + 1, embedding_size, input_length=input_size)(inputs)
        # Convolution layers
        for cl in conv_layers:
            x = Convolution1D(cl[0], cl[1])(x)
            x = ThresholdedReLU(threshold)(x)
            if cl[2] != -1:
                x = MaxPooling1D(cl[2])(x)
        x = Flatten()(x)
        # Fully connected layers
        for fl in fully_connected_layers:
            x = Dense(fl)(x)
            x = ThresholdedReLU(threshold)(x)
            x = Dropout(dropout_p)(x)
        # Output layer
        predictions = Dense(num_of_classes, activation='softmax')(x)
        # Build and compile model
        model = Model(inputs=inputs, outputs=predictions)
        model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
        print(model.summary())
        return model

    def _max_len(self, texts):
        return max([len(text) for text in texts])

    def test(self):
        csv.field_size_limit(sys.maxsize)
        try:
            r = open(self._path_result, 'rb')
            test_result = load(r)
            r.close()
        except FileNotFoundError:
            test_result = {}
        model = self._load_model()
        for language in [x for x in self._languages if x not in test_result.keys()]:
            test_result[language] = self.test_class(model, language)
            with open(self._path_result, 'wb') as f:
                dump(test_result, f)

    def _load_model(self):
        model = load_model(os.path.join(self._root_model, 'model.h5'))
        return model

    def _count_size(self, files):
        size = 0
        for f in files:
            size += os.path.getsize(f)
        return size

    def test_class(self, model, language):
        ok = 0
        results = []
        count = 0
        total_test = self.file_len(os.path.join(self._path_test_csv, language + '.csv'))
        with open(os.path.join(self._path_test_csv, language + '.csv'), newline='') as csvfile:
            r = csv.reader(csvfile, delimiter=' ', quotechar='|')
            for pair in r:
                label, string = pair
                label = int(label)
                string = literal_eval(string)
                tokens = [x + 1 for x in tokenizer(string, 'letter')]
                result = self._guess_file_language(model, tokens)
                count += 1
                print('[{0:4d}/{1:4d}] {2}:{3} '.format(count, total_test, result[0][1], result[0][0]), end='\r')
                results.append(result[0])
                if result[0][1] == language:
                    ok += 1
        accuracy = ok / total_test
        print('Tests for {} '.format(language))
        print('Total test files : {}'.format(total_test))
        print('Correctly classified files : {}'.format(ok))
        print('Accuracy : {}%'.format(accuracy * 100))
        return (ok, total_test, accuracy, results)
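    # The pickled result_cnn file produced by test() therefore maps each
    # language name to a tuple (ok, total_test, accuracy, results), where
    # results keeps the top-1 (probability, language) pair for every test file.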

    def speed_benchmark(self):
        language = self._languages[10]
        model = self._load_model()
        test_set = self._get_test_set(language)
        total_size = self._count_size(test_set)
        print('{} KiB in total'.format(total_size / 1024))
        t_start = time.perf_counter()
        self.test_class(model, language)
        t_end = time.perf_counter()
        print('{} seconds.'.format(t_end - t_start))
        print('{} seconds per KiB'.format(((t_end - t_start) / total_size) * 1024))

    def _guess_file_language(self, model, tokens):
        X = pad_sequences([tokens], maxlen=self._input_size)
        result = list(model.predict(X))[0]
        result = [(s, self._languages[i]) for i, s in enumerate(result)]
        return sorted(result, reverse=True)
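    # model.predict() returns one softmax row per input, so the list returned
    # here contains (probability, language) pairs sorted by descending
    # probability, e.g. [(0.91, 'Python'), (0.05, 'C'), ...] with purely
    # illustrative figures and language names.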

if __name__ == '__main__':
    main()
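
For reference, the new --epochs option would be exercised along these lines; the exact invocation and paths are only a sketch (the module layout is assumed), while the 'train'/'test' subcommands and the -ms/-e flags come from the argument parser above:

  python -m swh.langdetect.cnn train dataset.csv -ms 1024 -e 30
  python -m swh.langdetect.cnn test /path/to/test_csv_root

The test root is expected to contain one <language>.csv file per class, matching the names listed in static_data/languages.json.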
