#!/usr/bin/env python3
"""Draw a confusion-matrix heatmap from pickled language-detection results.

Usage: draw_accuracy.py <path-to-pickled-results>

The pickle maps language name -> (ok, total, accuracy, results), where
``results`` is a list of (score, predicted_language) tuples, one per test
file (the format produced by NGramProb.test_class).
"""

import sys
from collections import Counter
from pickle import load


def main(path):
    """Load pickled results from *path* and save a heatmap to results.pdf.

    :param path: path to a pickle file in the format described above.
    :raises ImportError: if matplotlib is not installed.
    """
    # matplotlib/numpy are only needed for plotting; importing them lazily
    # keeps process() usable (and testable) without the plotting stack.
    try:
        import numpy as np
        import matplotlib.pyplot as plt
    except ImportError:
        raise ImportError('Please install matplotlib')

    with open(path, 'rb') as f:
        data = load(f)
    mat = process(data)
    labels = sorted(data)

    fig, ax = plt.subplots()
    fig.set_size_inches(100, 100)
    ax.matshow(mat, cmap='Blues')
    ax.set_frame_on(False)
    ax.set_yticks(np.arange(len(labels)), minor=False)
    ax.set_xticks(np.arange(len(labels)), minor=False)
    ax.set_xticklabels(labels, minor=False)
    ax.set_yticklabels(labels, minor=False)
    ax.xaxis.tick_top()
    plt.xticks(rotation=90)
    ax.grid(False)
    # Hide the tick marks themselves; only the labels remain visible.
    for t in ax.xaxis.get_major_ticks():
        t.tick1On = False
        t.tick2On = False
    for t in ax.yaxis.get_major_ticks():
        t.tick1On = False
        t.tick2On = False
    fig.savefig("results.pdf", bbox_inches='tight')


def process(data):
    """Build a row-normalised confusion matrix from *data*.

    :param data: dict mapping language -> (ok, total, accuracy, results),
        with ``results`` a list of (score, predicted_language) tuples.
    :return: a square list-of-lists where cell [i][j] is the fraction of
        test files of language i that were classified as language j
        (languages taken in sorted order).
    """
    labels = sorted(data)
    # Precompute label -> index once instead of calling list.index() in
    # the loop (was O(n) per lookup).
    index = {lang: i for i, lang in enumerate(labels)}
    size = len(labels)
    out = [[0 for _ in range(size)] for _ in range(size)]
    for lang, row in zip(labels, out):
        # The test set is capped at 1000 files per language
        # (mirrors NGramProb._get_test_set).
        test_size = min(data[lang][1], 1000)
        if not test_size:
            continue  # no test files for this language; avoid division by zero
        predictions = Counter(x[1] for x in data[lang][3])
        for predicted, count in predictions.items():
            row[index[predicted]] = count / test_size
    return out


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('Only argument acceptable is a path.')
    else:
        main(sys.argv[1])
import os
import subprocess
import sys
from pickle import dump, load

import kenlm

# NOTE(review): build_training_set is called by train() but was never
# imported in the original module (NameError at runtime). Assuming it lives
# in utils.common alongside the other helpers — confirm.
from utils.common import (build_training_set, count_files, file_to_string,
                          find_file, tokenizer)


class NGramProb:
    """Language detector backed by one KenLM character n-gram model per language."""

    def __init__(self, root):
        """:param root: dataset root; all working directories are siblings of it."""
        # Root of dataset
        self._root = root
        # Root of training set
        self._root_training_set = os.path.join(self._root, '..', 'training_set')
        # Root of model folder
        self._root_model = os.path.join(self._root, '..', 'model_ngram_prob')
        # Root of arranged dataset
        self._root_language_dataset = os.path.join(self._root, '..', 'code_by_language')
        # Path of result
        self._path_result = os.path.join(self._root, '..', 'result_prob')

    def train(self):
        """Train a KenLM model for every language in the training set.

        Builds the training set first if it is missing or empty.  Languages
        whose model file already exists are skipped, so an interrupted
        training run can be resumed.
        """
        try:
            if len(os.listdir(self._root_training_set)) == 0:
                build_training_set(self._root)
            try:
                os.mkdir(self._root_model)
            except FileExistsError:
                pass
        except FileNotFoundError:
            os.mkdir(self._root_training_set)
            build_training_set(self._root)

        for language in [x for x in os.listdir(self._root_training_set)
                         if not x.startswith('.')]:
            root_stat_language = os.path.join(self._root_model, language)
            if os.path.isfile(root_stat_language):
                continue  # model already trained; resume-friendly
            root_training_set_language = os.path.join(self._root_training_set,
                                                      language)
            texts = []
            for f in [x for x in os.listdir(root_training_set_language)
                      if not x.startswith('.')]:
                filename = os.path.join(root_training_set_language, f)
                tokens = tokenizer(file_to_string(filename), 'letter')
                texts.append(' '.join(tokens))
            train_text = ' '.join(texts)
            # Order-5 character model; --discount_fallback copes with the
            # degenerate count-of-count statistics of small corpora.
            command = ['../../bin/lmplz', '-o', '5', '--discount_fallback']
            with open(root_stat_language, 'wb') as f:
                proc = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=f)
                proc.communicate(train_text.encode())
            # lmplz failed: drop the empty file so a retry is not skipped.
            if os.path.getsize(root_stat_language) == 0:
                os.remove(root_stat_language)

    def test(self):
        """Evaluate every language and pickle the results to self._path_result.

        Previously saved results are loaded first so an interrupted run
        resumes where it left off; results are checkpointed after each
        language.
        """
        try:
            with open(self._path_result, 'rb') as r:
                test_result = load(r)
        except FileNotFoundError:
            # First run: nothing to resume from (the original crashed here).
            test_result = {}
        models = self._load_models()
        for language in [x for x in os.listdir(self._root_language_dataset)
                         if not x.startswith('.') and x not in test_result]:
            test_result[language] = self.test_class(models, language)
            # Checkpoint after each language so progress survives a crash.
            with open(self._path_result, 'wb') as f:
                dump(test_result, f)

    def _load_models(self):
        """Load every trained model; return a dict {language: kenlm model}."""
        models = {}
        for name in [m for m in os.listdir(self._root_model)
                     if not m.startswith('.')]:
            models[name] = kenlm.LanguageModel(os.path.join(self._root_model, name))
        return models

    def _get_test_set(self, language):
        """Return up to 1000 paths of *language* files not used for training."""
        root_training_language = os.path.join(self._root_training_set, language)
        root_language = os.path.join(self._root_language_dataset, language)
        total = count_files(root_language)
        # Set for O(1) membership tests (was a list: O(n) per lookup).
        training_set = {int(os.path.splitext(x)[0])
                        for x in os.listdir(root_training_language)
                        if not x.startswith('.')}
        return [find_file(root_language, x)
                for x in range(1, total + 1) if x not in training_set][:1000]

    def test_class(self, models, language):
        """Classify the whole test set of *language*.

        :param models: dict {language: kenlm model} from _load_models().
        :param language: ground-truth language being evaluated.
        :return: (ok, total, accuracy, results) where ``results`` holds the
            top (score, predicted_language) tuple for each test file.
        """
        test_set = self._get_test_set(language)
        ok = 0
        results = []
        for test in test_set:
            result = self._guess_file_language(models, test)
            print('{} '.format(result[0]), end='\r')
            results.append(result[0])
            if result[0][1] == language:
                ok += 1
        total_test = len(test_set)
        # Guard against an empty test set (was an unconditional division).
        accuracy = ok / total_test if total_test else 0
        print('Tests for {} '.format(language))
        print('Total test files : {}'.format(total_test))
        print('Correctly classified files : {}'.format(ok))
        print('Accuracy : {}%'.format(accuracy * 100))
        return (ok, total_test, accuracy, results)

    def _guess_file_language(self, models, filename):
        """Score *filename* with every model.

        :return: list of (score, language) tuples, best score first.
        """
        tokens = tokenizer(file_to_string(filename), 'letter')
        text = ' '.join(tokens)
        result = [(model.score(text), name) for name, model in models.items()]
        return sorted(result, reverse=True)


if __name__ == '__main__':
    if len(sys.argv) == 3 and sys.argv[1] == '--train':
        n = NGramProb(sys.argv[2])
        n.train()
    elif len(sys.argv) == 3 and sys.argv[1] == '--test':
        n = NGramProb(sys.argv[2])
        n.test()
    elif len(sys.argv) == 3 and sys.argv[1] == '--benchmark':
        # NOTE(review): speed_benchmark is not defined in this module; this
        # branch raises AttributeError unless it is provided elsewhere.
        n = NGramProb(sys.argv[2])
        n.speed_benchmark()
    elif len(sys.argv) == 4 and sys.argv[1] == '--test':
        n = NGramProb(sys.argv[2])
        # Was n.load_models() — no such method exists; it is _load_models.
        n.test_class(n._load_models(), sys.argv[3])
    else:
        print('Wrong arguments, please check your input.')