diff --git a/swh/langdetect/ngramdist.py b/swh/langdetect/ngramdist.py index 01b3dd4..7db4fa6 100644 --- a/swh/langdetect/ngramdist.py +++ b/swh/langdetect/ngramdist.py @@ -1,150 +1,186 @@ """ Baseline approach """ import os, sys, operator, nltk from pickle import dump, load from nltk.util import ngrams from utils.common import tokenizer, file_to_string, find_file, count_files from utils.training import build_training_set class NGramDist: def __init__(self, root): self._root = root self._root_training_set = os.path.join(self._root, '..', 'training_set') self._root_model = os.path.join(self._root, '..', 'model_ngram_dist') + self._root_language_dataset = os.path.join(self._root, '..', 'code_by_language') + self._path_result = os.path.join(self._root, '..', 'result') def train(self): try: if len(os.listdir(self._root_training_set)) == 0: build_training_set(self._root) try: os.mkdir(self._root_model) except FileExistsError: pass except FileNotFoundError: os.mkdir(self._root_training_set) build_training_set(self._root) for language in os.listdir(self._root_training_set): if not language.startswith('.'): root_training_set_language = os.path.join(self._root_training_set, language) root_stat_language = os.path.join(self._root_model, language) if os.path.isfile(root_stat_language): continue else: statistics = {} for f in os.listdir(root_training_set_language): print(f) if not f.startswith('.'): filename = os.path.join(root_training_set_language, f) tokens = tokenizer(file_to_string(filename)) generated_ngrams = self._generate_ngrams(tokens, 3) self._count_ngrams(statistics, generated_ngrams) with open(root_stat_language, 'wb') as f: dump(self._sort_by_value(statistics), f) def _generate_ngrams(self, tokens, n): generated_ngrams = [] for i in range(1, n+1): igrams = ngrams(tokens, i, - pad_left = True, - pad_right = True, + pad_left=True, + pad_right=True, left_pad_symbol = '$BOF$', right_pad_symbol = '$EOF$') for igram in igrams: generated_ngrams.append(''.join(igram)) 
return generated_ngrams def _count_ngrams(self, statistics, ngrams): for ngram in ngrams: statistics[ngram] = statistics.get(ngram, 0) + 1 - def test_class(self, root_language): - language = os.path.basename(root_language) - root_training_language = os.path.join(self._root_training_set, language) + def test(self): + test_result = {} + models = self._load_models() + + for language in [x for x in os.listdir(self._root_language_dataset) if not x.startswith('.')]: + test_result[language] = self.test_class(models, language) + with open(self._path_result, 'wb') as f: + dump(test_result, f) + def _load_models(self): + models = {} + + for model in [model + for model in os.listdir(self._root_model) + if not model.startswith('.')]: + root_model = os.path.join(self._root_model, model) + with open(root_model, 'rb') as sorted_file: + models[model] = self._list_to_dict(load(sorted_file)) + + return models + + def _list_to_dict(self, model): + model_ngrams = [x[0] for x in model] + model_dict = {} + index = 0 + for ngram in model_ngrams: + index += 1 + model_dict[ngram] = index + return model_dict + + def test_class(self, models, language): + root_training_language = os.path.join(self._root_training_set, language) + root_language = os.path.join(self._root_language_dataset, language) total = count_files(root_language) training_set = [int(os.path.splitext(x)[0]) for x in os.listdir(root_training_language) if not x.startswith('.')] - test_set = [x for x in range(1, total + 1) if x not in training_set] + test_set = [x for x in range(1, total + 1) if x not in training_set][:1000] ok = 0 - for test in test_set[:1000]: - result = self._guess_file_language(find_file(root_language, test)) - print(result[0]) + results = [] + for test in test_set: + result = self._guess_file_language(models, find_file(root_language, test)) + print('{} '.format(result[0]),end='\r') + results.append(result[0]) if result[0][1] == language: ok += 1 - print('Total test files : {}'.format(len(test_set))) + 
total_test = len(test_set) + accuracy = ok / len(test_set) + print('Tests for {} '.format(language)) + print('Total test files : {}'.format(total_test)) print('Correctly classified files : {}'.format(ok)) - print('Accuracy : {}%'.format(ok / len(test_set))) + print('Accuracy : {}%'.format(accuracy * 100)) + return (ok, total, accuracy, results) def test_single(self, filename): - self._guess_file_language(filename) + self._guess_file_language(self._load_models(), filename) - def _guess_file_language(self, filename): + def _guess_file_language(self, models, filename): + tokens = tokenizer(file_to_string(filename)) generated_ngrams = self._generate_ngrams(tokens, 3) + statistics = {} self._count_ngrams(statistics, generated_ngrams) - test_profile = self._sort_by_value(statistics) + + test_profile = self._list_to_dict(self._sort_by_value(statistics)) result = [] - for model in os.listdir(self._root_model): - if not model.startswith('.'): - root_model = os.path.join(self._root_model, model) - with open(root_model, 'rb') as sorted_file: - model_profile = load(sorted_file) - distance = self._distance(model_profile, test_profile) - result.append((distance, os.path.splitext(model)[0])) + for model in models.keys(): + root_model = os.path.join(self._root_model, model) + model_profile = models[model] + distance = self._distance(model_profile, test_profile) + result.append((distance, model)) return sorted(result) def _sort_by_value(self, statistics): statistics_sorted = sorted(statistics.items(), key = operator.itemgetter(1), reverse = True)[:500] return statistics_sorted def _distance(self, model_profile, test_profile): distance = 0 - model_ngrams = [x[0] for x in model_profile ] - test_ngrams = [x[0] for x in test_profile ] - maximum = len(test_ngrams) + maximum = len(test_profile) - for test_ngram in test_ngrams: - test_rank = test_ngrams.index(test_ngram) - try: - model_rank = model_ngrams.index(test_ngram) - except ValueError: - model_rank = maximum + for test_ngram in test_profile.keys(): + test_rank = test_profile.get(test_ngram) 
+ model_rank = model_profile.get(test_ngram, maximum) d = abs(test_rank - model_rank) distance += d return distance ''' def _prob(model, trigrams): print('Checking {} model ...'.format(model)) with open(model, 'rb') as f: kneser_ney = load(f) result = 1 for trigram in trigrams: prob = kneser_ney.prob(trigram) result = result * prob return result ''' if __name__ == '__main__': if len(sys.argv) == 3 and sys.argv[1] == '--train': n = NGramDist(sys.argv[2]) n.train() + elif len(sys.argv) == 3 and sys.argv[1] == '--test': + n = NGramDist(sys.argv[2]) + n.test() elif len(sys.argv) == 4 and sys.argv[1] == '--test': n = NGramDist(sys.argv[2]) - n.test_class(sys.argv[3]) + n.test_class(n._load_models(), sys.argv[3]) else: print('Wrong arguments, please check your input.')