#!/usr/bin/env python3
"""Draw a confusion-matrix heatmap of per-language classification accuracy.

Reads a pickled result dictionary mapping each language to a tuple
``(ok, total, accuracy, results)`` — as produced by the language-detection
test runs — and renders the row-normalised confusion matrix to
``results.pdf``.
"""

import sys

from collections import Counter
from pickle import load

try:
    import numpy as np
    import matplotlib.pyplot as plt
except ImportError:
    # Either package may be the missing one; say so (the original message
    # only mentioned matplotlib, and the original shebang was the broken
    # '#!/bin/bash/python3').
    raise ImportError('Please install numpy and matplotlib')


def main(path):
    """Load pickled results from *path* and save the heatmap to results.pdf.

    :param path: filesystem path of the pickled result dictionary
    """
    with open(path, 'rb') as f:
        data = load(f)
    mat = process(data)
    labels = sorted(data)

    fig, ax = plt.subplots()
    fig.set_size_inches(100, 100)
    ax.matshow(mat, cmap='Blues')
    ax.set_frame_on(False)

    # One tick per language, labelled on both axes; x labels on top,
    # rotated 45 degrees so long language names stay readable.
    ax.set_yticks(np.arange(len(labels)), minor=False)
    ax.set_xticks(np.arange(len(labels)), minor=False)
    ax.set_xticklabels(labels, minor=False)
    ax.set_yticklabels(labels, minor=False)
    ax.xaxis.tick_top()
    plt.xticks(rotation=45)
    ax.grid(False)

    # Hide the tick marks themselves; the labels remain visible.
    for t in ax.xaxis.get_major_ticks():
        t.tick1On = False
        t.tick2On = False
    for t in ax.yaxis.get_major_ticks():
        t.tick1On = False
        t.tick2On = False

    fig.savefig("results.pdf", bbox_inches='tight')


def process(data):
    """Build the row-normalised confusion matrix.

    :param data: mapping language -> (ok, total, accuracy, results) where
        ``results`` is a list whose items carry the guessed language at
        index 1 (i.e. ``(distance, language)`` pairs)
    :return: square list-of-lists; cell [i][j] is the fraction of language
        i's test files classified as language j (languages sorted
        alphabetically)
    :rtype: list
    """
    ldata = sorted(data)
    length = len(ldata)
    # Index lookup table instead of repeated O(n) list.index() calls.
    index_of = {lang: i for i, lang in enumerate(ldata)}
    out = [[0 for _ in range(length)] for _ in range(length)]
    for lang in ldata:
        row = index_of[lang]
        # Test runs are capped at 1000 files per language.
        test_size = min(data[lang][1], 1000)
        if test_size == 0:
            # No test files for this language: leave the row at zero
            # instead of dividing by zero.
            continue
        counter = Counter(x[1] for x in data[lang][3])
        for res_lang, count in counter.items():
            out[row][index_of[res_lang]] = count / test_size
    return out


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('Only argument acceptable is a path.')
    else:
        main(sys.argv[1])
"""
Baseline approach
"""

import operator
import os
import random
import sys
import time

from pickle import dump, load

import nltk
from nltk.util import ngrams

from utils.common import tokenizer, file_to_string, find_file, count_files
from utils.training import build_training_set


class NGramDist:
    """Language detector based on ranked n-gram frequency profiles.

    For each language a profile of the 500 most frequent 1..3-grams is
    stored on disk; a file is classified by comparing its own profile
    against every model with an out-of-place rank distance.
    """

    def __init__(self, root):
        # Root of dataset
        self._root = root
        # Root of training set
        self._root_training_set = os.path.join(self._root, '..', 'training_set')
        # Root of model folder
        self._root_model = os.path.join(self._root, '..', 'model_ngram_dist')
        # Root of arranged dataset
        self._root_language_dataset = os.path.join(self._root, '..', 'code_by_language')
        # Path of result
        self._path_result = os.path.join(self._root, '..', 'result')

    def train(self):
        '''
        train () generates and stores counted n-grams in
        '_root_model' folder
        '''
        # Make sure the training set exists and the model folder is present.
        try:
            if not os.listdir(self._root_training_set):
                build_training_set(self._root)
            try:
                os.mkdir(self._root_model)
            except FileExistsError:
                pass
        except FileNotFoundError:
            os.mkdir(self._root_training_set)
            build_training_set(self._root)

        '''
        Calculate frequencies of generated n-grams then
        store them into a sorted list of (ngram, count)
        '''
        for language in os.listdir(self._root_training_set):
            if language.startswith('.'):
                continue
            root_training_set_language = os.path.join(self._root_training_set, language)
            root_stat_language = os.path.join(self._root_model, language)
            if os.path.isfile(root_stat_language):
                continue  # model for this language already built
            statistics = {}
            for f in os.listdir(root_training_set_language):
                print(f)
                if not f.startswith('.'):
                    filename = os.path.join(root_training_set_language, f)
                    tokens = tokenizer(file_to_string(filename))
                    generated_ngrams = self._generate_ngrams(tokens, 3)
                    self._count_ngrams(statistics, generated_ngrams)
            # Distinct handle name: the original shadowed the loop
            # variable ``f`` here.
            with open(root_stat_language, 'wb') as out:
                dump(self._sort_by_value(statistics), out)

    def _generate_ngrams(self, tokens, n):
        '''
        :param tokens: generated tokens from a string.
        :param n: maximum n of n-grams
        :type tokens: list
        :type n: int
        :return: generated 1-grams, ... , n-grams, each joined into a string
        :rtype: list
        '''
        generated_ngrams = []
        for i in range(1, n + 1):
            igrams = ngrams(tokens, i,
                            pad_left=True,
                            pad_right=True,
                            left_pad_symbol='$BOF$',
                            right_pad_symbol='$EOF$')
            for igram in igrams:
                generated_ngrams.append(''.join(igram))
        return generated_ngrams

    def _count_ngrams(self, statistics, ngrams):
        '''
        :param statistics: shared dictionary for statistics
        :param ngrams: n-grams to be accumulated into statistics
        '''
        for ngram in ngrams:
            statistics[ngram] = statistics.get(ngram, 0) + 1

    def test(self):
        """Run test_class for every language and pickle the aggregated results."""
        test_result = {}
        models = self._load_models()
        for language in [x for x in os.listdir(self._root_language_dataset)
                         if not x.startswith('.')]:
            test_result[language] = self.test_class(models, language)
        with open(self._path_result, 'wb') as f:
            dump(test_result, f)

    def speed_benchmark(self):
        """Measure classification throughput on one randomly chosen language."""
        language = random.choice([x for x in os.listdir(self._root_language_dataset)
                                  if not x.startswith('.')])
        models = self._load_models()

        test_set = self._get_test_set(language)
        total_size = self._count_size(test_set)
        print('{} kB in total'.format(total_size / 1024))

        t_start = time.perf_counter()
        self.test_class(models, language)
        t_end = time.perf_counter()

        print('{} seconds.'.format(t_end - t_start))
        print('{} seconds per kB'.format(((t_end - t_start) / total_size) * 1024))

    def _load_models(self):
        """Load every pickled model into a dict: language -> rank profile."""
        models = {}
        for model in [m for m in os.listdir(self._root_model)
                      if not m.startswith('.')]:
            root_model = os.path.join(self._root_model, model)
            # NOTE(review): pickle.load on files this class wrote itself;
            # never point _root_model at untrusted data.
            with open(root_model, 'rb') as sorted_file:
                models[model] = self._list_to_dict(load(sorted_file))
        return models

    def _list_to_dict(self, model):
        """Turn a sorted (ngram, count) list into a dict ngram -> rank (1-based)."""
        model_dict = {}
        for index, entry in enumerate(model, start=1):
            model_dict[entry[0]] = index
        return model_dict

    def _get_test_set(self, language):
        """Return up to 1000 file paths of *language* not used for training."""
        root_training_language = os.path.join(self._root_training_set, language)
        root_language = os.path.join(self._root_language_dataset, language)
        total = count_files(root_language)
        # A set makes the ``not in`` membership tests below O(1) instead of
        # O(len(training_set)) each.
        training_set = {int(os.path.splitext(x)[0])
                        for x in os.listdir(root_training_language)
                        if not x.startswith('.')}
        test_set = [find_file(root_language, x)
                    for x in range(1, total + 1)
                    if x not in training_set][:1000]
        return test_set

    def _count_size(self, files):
        """Total size in bytes of *files*."""
        return sum(os.path.getsize(f) for f in files)

    def test_class(self, models, language):
        """Classify the test set of *language*; print and return statistics.

        :return: (ok, total, accuracy, results) where ``results`` holds the
            best (distance, language) guess for each test file
        """
        test_set = self._get_test_set(language)
        ok = 0
        results = []
        for test in test_set:
            result = self._guess_file_language(models, test)
            print('{} '.format(result[0]), end='\r')
            results.append(result[0])
            if result[0][1] == language:
                ok += 1
        total_test = len(test_set)
        # Guard against an empty test set instead of dividing by zero.
        accuracy = ok / total_test if total_test else 0
        print('Tests for {} '.format(language))
        print('Total test files : {}'.format(total_test))
        print('Correctly classified files : {}'.format(ok))
        print('Accuracy : {}%'.format(accuracy * 100))
        return (ok, total_test, accuracy, results)

    def test_single(self, models, filename):
        """Classify one file; return the sorted (distance, language) ranking.

        (The original discarded the result, making the call useless to
        callers; returning it is backward compatible.)
        """
        return self._guess_file_language(models, filename)

    def _guess_file_language(self, models, filename):
        """Rank all language models by distance to *filename*'s n-gram profile.

        :return: list of (distance, language) tuples, best match first
        """
        tokens = tokenizer(file_to_string(filename))
        generated_ngrams = self._generate_ngrams(tokens, 3)
        statistics = {}
        self._count_ngrams(statistics, generated_ngrams)
        test_profile = self._list_to_dict(self._sort_by_value(statistics))

        result = []
        # (The original also built an unused ``root_model`` path here.)
        for model, model_profile in models.items():
            distance = self._distance(model_profile, test_profile)
            result.append((distance, model))
        return sorted(result)

    def _sort_by_value(self, statistics):
        """Return the 500 most frequent (ngram, count) pairs, most common first."""
        return sorted(statistics.items(),
                      key=operator.itemgetter(1),
                      reverse=True)[:500]

    def _distance(self, model_profile, test_profile):
        """Out-of-place rank distance between the two profiles.

        N-grams absent from the model are penalised with the maximum rank
        (the test profile's length).
        """
        distance = 0
        maximum = len(test_profile)
        for test_ngram, test_rank in test_profile.items():
            model_rank = model_profile.get(test_ngram, maximum)
            distance += abs(test_rank - model_rank)
        return distance


if __name__ == '__main__':
    if len(sys.argv) == 3 and sys.argv[1] == '--train':
        n = NGramDist(sys.argv[2])
        n.train()
    elif len(sys.argv) == 3 and sys.argv[1] == '--test':
        n = NGramDist(sys.argv[2])
        n.test()
    elif len(sys.argv) == 3 and sys.argv[1] == '--benchmark':
        n = NGramDist(sys.argv[2])
        n.speed_benchmark()
    elif len(sys.argv) == 4 and sys.argv[1] == '--test':
        n = NGramDist(sys.argv[2])
        # BUG FIX: the original called the non-existent ``n.load_models()``;
        # the method is named ``_load_models``.
        n.test_class(n._load_models(), sys.argv[3])
    else:
        print('Wrong arguments, please check your input.')