diff --git a/scripts/draw_accuracy.py b/scripts/draw_accuracy.py
new file mode 100644
index 0000000..614c34f
--- /dev/null
+++ b/scripts/draw_accuracy.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+
+import sys
+from pickle import load
+from collections import Counter
+try:
+    import numpy as np
+    import matplotlib.pyplot as plt
+except ImportError:
+    raise ImportError('Please install numpy and matplotlib')
+
+def main(path):
+    with open(path, 'rb') as f:
+        data = load(f)
+    mat = process(data)
+    labels = sorted(data)
+
+    fig, ax = plt.subplots()
+    fig.set_size_inches(100, 100)
+    ax.matshow(mat, cmap='Blues')
+
+    ax.set_frame_on(False)
+
+    ax.set_yticks(np.arange(len(labels)), minor=False)
+    ax.set_xticks(np.arange(len(labels)), minor=False)
+
+    ax.set_xticklabels(labels, minor=False)
+    ax.set_yticklabels(labels, minor=False)
+    ax.xaxis.tick_top()
+    plt.xticks(rotation=90)
+    ax.grid(False)
+
+    '''
+    # Annotate each cell with its percentage; text(x, y) takes the
+    # column index first, hence (j, i).
+    for i in np.arange(len(mat)):
+        for j in np.arange(len(mat[i])):
+            ax.text(j, i, "%.1f" % (mat[i][j] * 100), color='white')
+    '''
+
+    # Hide the tick marks while keeping the tick labels.
+    ax.tick_params(axis='both', which='major', length=0)
+
+    fig.savefig("results.pdf", bbox_inches='tight')
+
+def process(data):
+    '''
+    Build a confusion matrix: rows are true languages, columns are
+    guessed languages, and each value is the proportion of test files
+    of the row language classified as the column language.
+    '''
+    ldata = sorted(data)
+    length = len(ldata)
+    out = [[0 for x in range(length)] for y in range(length)]
+    for lang in ldata:
+        index_lan = ldata.index(lang)
+        # At most 1000 files per language are tested.
+        test_size = min(data[lang][1], 1000)
+        result = [x[1] for x in data[lang][3]]
+        counter = dict(Counter(result))
+        for res_lan in counter.keys():
+            index_res = ldata.index(res_lan)
+            out[index_lan][index_res] = counter[res_lan] / test_size
+
+    return out
+
+if __name__ == '__main__':
+    if len(sys.argv) != 2:
+        print('Usage: draw_accuracy.py <path-to-result-file>')
+    else:
+        main(sys.argv[1])
diff --git a/scripts/result_ngrams_frequency_distance.pdf b/scripts/result_ngrams_frequency_distance.pdf
new file mode 100644
index 0000000..be05137
Binary files /dev/null and b/scripts/result_ngrams_frequency_distance.pdf differ
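Note for reviewers (not part of the patch): draw_accuracy.py reads the pickled dictionary written by NGramDist.test(), which maps each language to the (ok, total, accuracy, results) tuple returned by test_class(), where results holds the winning (distance, language) pair for each test file. A minimal made-up example of that layout, with hypothetical languages and distances:

    data = {
        'C':   (2, 3, 2 / 3, [(10, 'C'), (12, 'C'), (15, 'C++')]),
        'C++': (3, 3, 1.0,   [(9, 'C++'), (11, 'C++'), (13, 'C++')]),
    }
    # process(data) returns rows in sorted language order:
    #   C   -> [2/3, 1/3]   (one C file was mistaken for C++)
    #   C++ -> [0, 1.0]
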
diff --git a/swh/langdetect/ngramdist.py b/swh/langdetect/ngramdist.py
index 7db4fa6..43547c2 100644
--- a/swh/langdetect/ngramdist.py
+++ b/swh/langdetect/ngramdist.py
@@ -1,186 +1,215 @@
 """
 Baseline approach
 """
 
 import os, sys, operator, nltk
 
 from pickle import dump, load
 from nltk.util import ngrams
 from utils.common import tokenizer, file_to_string, find_file, count_files
 from utils.training import build_training_set
 
 class NGramDist:
 
     def __init__(self, root):
+        # Root of the dataset
         self._root = root
+
+        # Root of the training set
         self._root_training_set = os.path.join(self._root, '..', 'training_set')
+
+        # Root of the model folder
         self._root_model = os.path.join(self._root, '..', 'model_ngram_dist')
+
+        # Root of the dataset arranged by language
         self._root_language_dataset = os.path.join(self._root, '..', 'code_by_language')
+
+        # Path of the result file
         self._path_result = os.path.join(self._root, '..', 'result')
 
     def train(self):
+        '''
+        Generate counted n-grams and store them in the '_root_model' folder.
+        '''
+
         try:
             if len(os.listdir(self._root_training_set)) == 0:
                 build_training_set(self._root)
             try:
                 os.mkdir(self._root_model)
             except FileExistsError:
                 pass
         except FileNotFoundError:
             os.mkdir(self._root_training_set)
             build_training_set(self._root)
-        
+
+        '''
+        Calculate the frequencies of the generated n-grams, then store
+        them as a list of (ngram, count) pairs sorted by count.
+        '''
         for language in os.listdir(self._root_training_set):
             if not language.startswith('.'):
                 root_training_set_language = os.path.join(self._root_training_set, language)
                 root_stat_language = os.path.join(self._root_model, language)
                 if os.path.isfile(root_stat_language):
                     continue
                 else:
                     statistics = {}
                     for f in os.listdir(root_training_set_language):
                         print(f)
                         if not f.startswith('.'):
                             filename = os.path.join(root_training_set_language, f)
                             tokens = tokenizer(file_to_string(filename))
                             generated_ngrams = self._generate_ngrams(tokens, 3)
                             self._count_ngrams(statistics, generated_ngrams)
                     with open(root_stat_language, 'wb') as f:
                         dump(self._sort_by_value(statistics), f)
 
     def _generate_ngrams(self, tokens, n):
+        '''
+        :param tokens: tokens generated from a string
+        :param n: maximum length of the generated n-grams
+        :type tokens: list
+        :type n: int
+        :return: the generated 1-grams, ..., n-grams
+        :rtype: list
+        '''
         generated_ngrams = []
         for i in range(1, n+1):
             igrams = ngrams(tokens, i,
                             pad_left=True,
                             pad_right=True,
                             left_pad_symbol = '$BOF$',
                             right_pad_symbol = '$EOF$')
             for igram in igrams:
                 generated_ngrams.append(''.join(igram))
         return generated_ngrams
 
     def _count_ngrams(self, statistics, ngrams):
+        '''
+        :param statistics: shared dictionary of n-gram counts
+        :param ngrams: n-grams to be accumulated into statistics
+        '''
         for ngram in ngrams:
             statistics[ngram] = statistics.get(ngram, 0) + 1
 
     def test(self):
         test_result = {}
         models = self._load_models()
         for language in [x for x in os.listdir(self._root_language_dataset) if not x.startswith('.')]:
             test_result[language] = self.test_class(models, language)
         with open(self._path_result, 'wb') as f:
             dump(test_result, f)
 
     def _load_models(self):
         models = {}
         for model in [model for model in os.listdir(self._root_model) if not model.startswith('.')]:
             root_model = os.path.join(self._root_model, model)
             with open(root_model, 'rb') as sorted_file:
                 models[model] = self._list_to_dict(load(sorted_file))
         return models
 
     def _list_to_dict(self, model):
         model_ngrams = [x[0] for x in model]
         model_dict = {}
         index = 0
         for ngram in model_ngrams:
             index += 1
             model_dict[ngram] = index
         return model_dict
 
     def test_class(self, models, language):
         root_training_language = os.path.join(self._root_training_set, language)
         root_language = os.path.join(self._root_language_dataset, language)
         total = count_files(root_language)
         training_set = [int(os.path.splitext(x)[0]) for x in os.listdir(root_training_language) if not x.startswith('.')]
         test_set = [x for x in range(1, total + 1) if x not in training_set][:1000]
         ok = 0
         results = []
         for test in test_set:
             result = self._guess_file_language(models, find_file(root_language, test))
             print('{} '.format(result[0]),end='\r')
             results.append(result[0])
             if result[0][1] == language:
                 ok += 1
         total_test = len(test_set)
         accuracy = ok / len(test_set)
         print('Tests for {} '.format(language))
         print('Total test files : {}'.format(total_test))
         print('Correctly classified files : {}'.format(ok))
         print('Accuracy : {}%'.format(accuracy * 100))
         return (ok, total, accuracy, results)
 
     def test_single(self, filename):
-        self._guess_file_language(filename)
+        return self._guess_file_language(self._load_models(), filename)
 
     def _guess_file_language(self, models, filename):
         tokens = tokenizer(file_to_string(filename))
         generated_ngrams = self._generate_ngrams(tokens, 3)
         statistics = {}
         self._count_ngrams(statistics, generated_ngrams)
         test_profile = self._list_to_dict(self._sort_by_value(statistics))
         result = []
         for model in models.keys():
             root_model = os.path.join(self._root_model, model)
             model_profile = models[model]
             distance = self._distance(model_profile, test_profile)
             result.append((distance, model))
         return sorted(result)
 
     def _sort_by_value(self, statistics):
         statistics_sorted = sorted(statistics.items(),
                                    key = operator.itemgetter(1),
                                    reverse = True)[:500]
         return statistics_sorted
 
     def _distance(self, model_profile, test_profile):
         distance = 0
         maximum = len(test_profile)
         for test_ngram in test_profile.keys():
             test_rank = test_profile.get(test_ngram)
             model_rank = model_profile.get(test_ngram, maximum)
             d = abs(test_rank - model_rank)
             distance += d
         return distance
 
     '''
     def _prob(model, trigrams):
         print('Checking {} model ...'.format(model))
         with open(model, 'rb') as f:
             kneser_ney = load(f)
         result = 1
         for trigram in trigrams:
             prob = kneser_ney.prob(trigram)
             result = result * prob
         return result
     '''
 
 if __name__ == '__main__':
     if len(sys.argv) == 3 and sys.argv[1] == '--train':
         n = NGramDist(sys.argv[2])
         n.train()
     elif len(sys.argv) == 3 and sys.argv[1] == '--test':
         n = NGramDist(sys.argv[2])
         n.test()
     elif len(sys.argv) == 4 and sys.argv[1] == '--test':
         n = NGramDist(sys.argv[2])
-        n.test_class(n.load_models(), sys.argv[3])
+        n.test_class(n._load_models(), sys.argv[3])
     else:
         print('Wrong arguments, please check your input.')
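Note for reviewers (not part of the patch): _distance is essentially the out-of-place measure from Cavnar and Trenkle's n-gram text categorization scheme. Both profiles map an n-gram to its frequency rank, and a file is attributed to the model with the smallest summed rank displacement. A standalone sketch with hypothetical profiles, mirroring _list_to_dict (rank 1 = most frequent) and _distance (an n-gram missing from the model costs len(test_profile)):

    def rank_profile(ngrams_by_frequency):
        # 1-based rank, most frequent first (as in _list_to_dict).
        return {ng: i for i, ng in enumerate(ngrams_by_frequency, start=1)}

    def out_of_place(model_profile, test_profile):
        # Sum of rank differences; n-grams absent from the model get
        # the maximum penalty, len(test_profile) (as in _distance).
        maximum = len(test_profile)
        return sum(abs(rank - model_profile.get(ng, maximum))
                   for ng, rank in test_profile.items())

    model = rank_profile(['e', 'th', 'a', 'he'])  # hypothetical model profile
    test = rank_profile(['xx', 'th', 'e'])        # hypothetical test profile
    print(out_of_place(model, test))  # |1-3| + |2-2| + |3-1| = 4

Since _guess_file_language() sorts the (distance, model) pairs in ascending order, the first entry is the predicted language. With the patch applied, the intended flow appears to be: train and test via ngramdist.py's --train and --test options, then feed the pickled result file to scripts/draw_accuracy.py to render the heatmap as results.pdf.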