diff --git a/scripts/dataset/ground_truth_arrange.py b/scripts/dataset/ground_truth_arrange.py
index 65b09f0..7eb9dac 100644
--- a/scripts/dataset/ground_truth_arrange.py
+++ b/scripts/dataset/ground_truth_arrange.py
@@ -1,113 +1,123 @@
 # Copyright (C) 2015-2016 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 #!/usr/bin/env python3
 # coding: utf-8
 
 import os
 import sys
 import io
 import json
 
 def main(root):
     root_ground_truth = root + '/../ground_truth'
     root_ground_truth_text = root_ground_truth + '_text'
     root_code_by_language = root + '/../code_by_language'
     counts = dict()
     try:
         os.mkdir(root_code_by_language)
     except FileExistsError:
         pass
     for r in os.listdir(root):
         if not r.startswith('.'):
             for d in os.listdir(root + '/' + r):
                 if not d.startswith('.'):
                     try:
                         ground_truth = io.open(root_ground_truth + '/' + r + '/' + d + '.json')
                         try:
                             j = json.load(ground_truth)
                             for language in j.keys():
                                 root_language = root_code_by_language + '/' + language
                                 try:
                                     os.mkdir(root_language)
                                 except FileExistsError:
                                     pass
                                 for f in j.get(language):
+                                    copy_src = root + '/' + r + '/' + d + '/' + f
+                                    try:
+                                        if os.path.getsize(copy_src) > 10485760 :
+                                            continue
+                                    except FileNotFoundError:
+                                        continue
                                     counts[language] = counts.get(language, 0) + 1
                                     start = (counts[language] - 1) // 1000 * 1000 + 1
                                     end = start + 999
                                     root_count = root_language + '/' + str(start) + '-' + str(end)
                                     if counts[language] % 1000 == 1:
                                         try:
                                             os.mkdir(root_count)
                                         except FileExistsError:
                                             pass
                                     (_,ext) = os.path.splitext(f)
                                     new_name = str(counts[language]) + ext
-                                    copy_src = root + '/' + r + '/' + d + '/' + f
+
                                     copy_des = root_count + '/' + new_name
                                     try:
                                         os.symlink(copy_src, copy_des)
                                         print('{} successfully copied.'.format(copy_src))
                                     except FileExistsError:
                                         pass
                         except json.decoder.JSONDecodeError:
                             ground_truth.close()
                             ground_truth = io.open(root_ground_truth_text + '/' + r + '/' + d, 'r')
                             while(True):
                                 line = ground_truth.readline()
                                 if line == '\n' or line == '':
                                     break
                                 else:
                                     pass
                             while(True):
                                 line = ground_truth.readline()
                                 stripped = line.strip()
                                 if line == '':
                                     break
                                 else:
                                     stripped = line.strip()
                                     language = stripped.replace(':','')
                                     root_language = root_code_by_language + '/' + language
                                     try:
                                         os.mkdir(root_language)
                                     except FileExistsError:
                                         pass
                                     while(True):
                                         line = ground_truth.readline()
                                         if line == '\n':
                                             break
                                         else:
-                                            f = line.strip()
+                                            copy_src = root + '/' + r + '/' + d + '/' + f
+                                            try:
+                                                if os.path.getsize(copy_src) > 10485760 :
+                                                    continue
+                                            except FileNotFoundError:
+                                                continue
                                             counts[language] = counts.get(language, 0) + 1
                                             start = (counts[language] - 1) // 1000 * 1000 + 1
                                             end = start + 999
                                             root_count = root_language + '/' + str(start) + '-' + str(end)
                                             if counts[language] % 1000 == 1:
                                                 try:
                                                     os.mkdir(root_count)
                                                 except FileExistsError:
                                                     pass
                                             (_,ext) = os.path.splitext(f)
                                             new_name = str(counts[language]) + ext
-                                            copy_src = root + '/' + r + '/' + d + '/' + f
                                             copy_des = root_count + '/' + new_name
                                             try:
                                                 os.symlink(copy_src, copy_des)
                                                 print('{} successfully copied.'.format(copy_src))
                                             except FileExistsError:
                                                 pass
                     finally:
                         ground_truth.close()
 
 if __name__ == '__main__':
     if len(sys.argv) != 2:
         print('Only argument acceptable is a path.')
     else:
         main(sys.argv[1])
diff --git a/swh/langdetect/ngram.py b/swh/langdetect/ngram.py
deleted file mode 100644
index d9b4a9a..0000000
--- a/swh/langdetect/ngram.py
+++ /dev/null
@@ -1,107 +0,0 @@
-"""
-Calculate frequencies for classes
-"""
-
-import os, sys, nltk
-
-from pickle import dump, load
-from collections import Counter
-from nltk.util import ngrams
-from utils.common import tokenizer, file_to_string
-from utils.training import build_training_set
-from nltk.probability import *
-
-class NGramTrain:
-
-    def __init__(self, root):
-        self._root = root
-
-    def train(self):
-        root_training_set = self._root + '/../training_set'
-        root_model = '../../dataset/model'
-
-        try:
-            if len(os.listdir(root_training_set)) == 0:
-                build_training_set(self._root)
-            try:
-                os.mkdir(root_model)
-            except FileExistsError:
-                pass
-        except FileNotFoundError:
-            os.mkdir(root_training_set)
-            build_training_set(self._root)
-
-        for language in os.listdir(root_training_set):
-            if not language.startswith('.'):
-                root_training_set_language = root_training_set + '/' + language
-                for f in os.listdir(root_training_set_language):
-                    print(f)
-                    if not f.startswith('.'):
-                        ngram_lan = ngrams_max(root_training_set_language + '/' + f)
-                        prob_lan = self._kneser_key_prob(ngram_lan)
-                        #with open(root_model + '/' + language + '.model', 'wb') as f:
-                        #    dump(ngram_lan, f)
-                        with open(root_model + '/' + language + '.model', 'wb') as f:
-                            dump(prob_lan, f)
-
-    def ngrams_max(filename, n=3):
-        ngram_lan = {}
-        tokens = tokenizer(file_to_string(filename))
-        for i in range(n - 1, n):
-            ngram_lan[i + 1] = ngram_lan.get(i + 1, []) + \
-                list(ngrams(tokens, i + 1,
-                            pad_left = True,
-                            pad_right = True,
-                            left_pad_symbol = '$BOS$',
-                            right_pad_symbol = '$EOS$'))
-        return ngram_lan
-
-    def _kneser_key_prob(self, ngram_lan):
-        c = Counter()
-        for key in ngram_lan.keys():
-            c = c + Counter(ngram_lan[key])
-        freq_dist = nltk.FreqDist(c)
-        return nltk.KneserNeyProbDist(freq_dist)
-
-
-class NGramTest:
-
-    def test(filename):
-        NGramTest._guess_file_language(filename)
-
-    def _guess_file_language(filename):
-        root_model_folder = '../../dataset/model'
-        trigrams = NGramTrain.ngrams_max(filename)
-        result = []
-
-        for model in os.listdir(root_model_folder):
-            if not model.startswith('.'):
-                root_model = root_model_folder + '/' + model
-                (language,_) = os.path.splitext(model)
-                result.append((NGramTest._prob(root_model, trigrams[3]), language))
-        print(sorted(result))
-
-    def _prob(model, trigrams):
-        with open(model, 'rb') as f:
-            kneser_ney = load(f)
-        result = 0
-        for trigram in trigrams:
-            prob = kneser_ney.prob(trigram)
-            if prob != 0:
-                if result == 0:
-                    result = prob
-                else:
-                    result = result * prob
-        return result
-
-
-if __name__ == '__main__':
-    if len(sys.argv) != 3:
-        print('Only acceptable arguments are an option and a path.')
-    elif sys.argv[1] == '--train':
-        model = NGramTrain(sys.argv[2])
-        model.train()
-    elif sys.argv[1] == '--test':
-        NGramTest.test(sys.argv[2])
-    else:
-        print('Wrong arguments, please check your input.')
diff --git a/swh/langdetect/ngramdist.py b/swh/langdetect/ngramdist.py
new file mode 100644
index 0000000..01b3dd4
--- /dev/null
+++ b/swh/langdetect/ngramdist.py
@@ -0,0 +1,150 @@
+"""
+Baseline approach
+"""
+
+import os, sys, operator, nltk
+
+from pickle import dump, load
+from nltk.util import ngrams
+from utils.common import tokenizer, file_to_string, find_file, count_files
+from utils.training import build_training_set
+
+class NGramDist:
+
+    def __init__(self, root):
+        self._root = root
+        self._root_training_set = os.path.join(self._root, '..', 'training_set')
+        self._root_model = os.path.join(self._root, '..', 'model_ngram_dist')
+
+    def train(self):
+        try:
+            if len(os.listdir(self._root_training_set)) == 0:
+                build_training_set(self._root)
+            try:
+                os.mkdir(self._root_model)
+            except FileExistsError:
+                pass
+        except FileNotFoundError:
+            os.mkdir(self._root_training_set)
+            build_training_set(self._root)
+
+        for language in os.listdir(self._root_training_set):
+            if not language.startswith('.'):
+                root_training_set_language = os.path.join(self._root_training_set, language)
+                root_stat_language = os.path.join(self._root_model, language)
+                if os.path.isfile(root_stat_language):
+                    continue
+                else:
+                    statistics = {}
+                    for f in os.listdir(root_training_set_language):
+                        print(f)
+                        if not f.startswith('.'):
+                            filename = os.path.join(root_training_set_language, f)
+                            tokens = tokenizer(file_to_string(filename))
+                            generated_ngrams = self._generate_ngrams(tokens, 3)
+                            self._count_ngrams(statistics, generated_ngrams)
+                    with open(root_stat_language, 'wb') as f:
+                        dump(self._sort_by_value(statistics), f)
+
+    def _generate_ngrams(self, tokens, n):
+        generated_ngrams = []
+
+        for i in range(1, n+1):
+            igrams = ngrams(tokens, i,
+                            pad_left = True,
+                            pad_right = True,
+                            left_pad_symbol = '$BOF$',
+                            right_pad_symbol = '$EOF$')
+            for igram in igrams:
+                generated_ngrams.append(''.join(igram))
+
+        return generated_ngrams
+
+    def _count_ngrams(self, statistics, ngrams):
+        for ngram in ngrams:
+            statistics[ngram] = statistics.get(ngram, 0) + 1
+
+    def test_class(self, root_language):
+        language = os.path.basename(root_language)
+        root_training_language = os.path.join(self._root_training_set, language)
+
+        total = count_files(root_language)
+        training_set = [int(os.path.splitext(x)[0]) for x in os.listdir(root_training_language) if not x.startswith('.')]
+        test_set = [x for x in range(1, total + 1) if x not in training_set]
+
+        ok = 0
+        for test in test_set[:1000]:
+            result = self._guess_file_language(find_file(root_language, test))
+            print(result[0])
+            if result[0][1] == language:
+                ok += 1
+
+        print('Total test files : {}'.format(len(test_set)))
+        print('Correctly classified files : {}'.format(ok))
+        print('Accuracy : {}%'.format(ok / len(test_set)))
+
+    def test_single(self, filename):
+        self._guess_file_language(filename)
+
+    def _guess_file_language(self, filename):
+        tokens = tokenizer(file_to_string(filename))
+        generated_ngrams = self._generate_ngrams(tokens, 3)
+        statistics = {}
+        self._count_ngrams(statistics, generated_ngrams)
+        test_profile = self._sort_by_value(statistics)
+
+        result = []
+
+        for model in os.listdir(self._root_model):
+            if not model.startswith('.'):
+                root_model = os.path.join(self._root_model, model)
+                with open(root_model, 'rb') as sorted_file:
+                    model_profile = load(sorted_file)
+                    distance = self._distance(model_profile, test_profile)
+                    result.append((distance, os.path.splitext(model)[0]))
+
+        return sorted(result)
+
+    def _sort_by_value(self, statistics):
+        statistics_sorted = sorted(statistics.items(),
+                                   key = operator.itemgetter(1),
+                                   reverse = True)[:500]
+        return statistics_sorted
+
+    def _distance(self, model_profile, test_profile):
+        distance = 0
+        model_ngrams = [x[0] for x in model_profile ]
+        test_ngrams = [x[0] for x in test_profile ]
+        maximum = len(test_ngrams)
+
+        for test_ngram in test_ngrams:
+            test_rank = test_ngrams.index(test_ngram)
+            try:
+                model_rank = model_ngrams.index(test_ngram)
+            except ValueError:
+                model_rank = maximum
+            d = abs(test_rank - model_rank)
+            distance += d
+
+        return distance
+    '''
+    def _prob(model, trigrams):
+        print('Checking {} model ...'.format(model))
+        with open(model, 'rb') as f:
+            kneser_ney = load(f)
+        result = 1
+        for trigram in trigrams:
+            prob = kneser_ney.prob(trigram)
+            result = result * prob
+        return result
+    '''
+
+if __name__ == '__main__':
+    if len(sys.argv) == 3 and sys.argv[1] == '--train':
+        n = NGramDist(sys.argv[2])
+        n.train()
+    elif len(sys.argv) == 4 and sys.argv[1] == '--test':
+        n = NGramDist(sys.argv[2])
+        n.test_class(sys.argv[3])
+    else:
+        print('Wrong arguments, please check your input.')
diff --git a/swh/langdetect/utils/common.py b/swh/langdetect/utils/common.py
index 52d387f..4f89523 100644
--- a/swh/langdetect/utils/common.py
+++ b/swh/langdetect/utils/common.py
@@ -1,75 +1,77 @@
 """
 Here regroup basic preprocessing methods
 used in learning stage for different approaches.
 """
 
 import re, os
 
 _re_string = re.compile(r"""("(\\.|[^"\\])*"|'(\\.|[^'\\])*')""")
 _re_number = re.compile(r'([\d]+)|([\d]+.[\d]+)[^A-Za-z]')
 _re_separator = re.compile(r'(\W)')
 
 _not_start_with_point = lambda x: not x.startswith('.')
 
 def tokenizer(text):
     ''' Splits text into tokens '''
-    return [word for word in _re_separator.split(text) if word.strip(' \t')]
+    #return [word for word in _re_separator.split(text) if word.strip(' \t')]
+    return list(text)
 
 def file_to_string(filename):
     """ Read a file to a string. """
     with open(filename, 'r', errors='ignore') as f:
-        data = f.read().replace('\n',' ').lower()
+        data = f.read().lower()
     return replace_string_and_number(data)
 
 def count_files(root_language):
     all_folders = natural_sort(filter (_not_start_with_point, os.listdir(root_language)))
     files = natural_sort(filter (_not_start_with_point, os.listdir(root_language + '/' + all_folders[-1])))
     (max,_) = os.path.splitext(files[-1])
     return int(max)
 
 def find_file(root_language, n):
     '''Find the n-th file in language folder'''
     if n > count_files(root_language):
         return ''
     else:
         start = (n - 1) // 1000 * 1000 + 1
         end = start + 999
         root_count = root_language + '/' + str(start) + '-' + str(end)
         files = natural_sort(filter (_not_start_with_point, os.listdir(root_count)))
         return root_count + '/' + files[n - start]
 
 def replace_string_and_number(text):
     """ Replace strings and numbers in a file by special tokens """
-    str_replaced = re.sub(_re_string, '__str__', text)
-    str_num_replaced = re.sub(_re_number, '__num__', str_replaced)
+    # str_replaced = re.sub(_re_string, '__str__', text)
+    # str_num_replaced = re.sub(_re_number, '__num__', str_replaced)
+    str_num_replaced = text
     return str_num_replaced
 
 def natural_sort(l):
     convert = lambda text: int(text) if text.isdigit() else text.lower()
     alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ]
     return sorted(l, key = alphanum_key)
 
 def remove_comment(text):
     # TODO: remove only inline comments and block comments
     # TODO: maybe build a list of comment markers
     pass
 
 def purify(text, lang):
     # TODO: for some language like HTML, remove code other than principal language
     pass
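
Note (not part of the patch): the comparison in NGramDist._distance is the rank-based "out of place" measure used with sorted n-gram frequency profiles. The standalone sketch below restates the same idea; the function name and the toy profiles are invented for illustration, and a dict lookup stands in for the linear list.index search, which gives the same result as long as each profile lists an n-gram at most once.

    # Standalone illustration of the rank ("out of place") distance computed
    # by NGramDist._distance. Toy data only; real profiles are the 500 most
    # frequent n-grams of a trained model or of the file under test.

    def out_of_place_distance(model_profile, test_profile):
        """Sum of rank differences between two n-gram profiles.

        Both arguments are lists of (ngram, count) pairs sorted by decreasing
        count. An n-gram absent from the model profile is penalised with the
        maximum possible rank.
        """
        model_rank = {ngram: rank for rank, (ngram, _) in enumerate(model_profile)}
        maximum = len(test_profile)
        distance = 0
        for test_rank, (ngram, _) in enumerate(test_profile):
            distance += abs(test_rank - model_rank.get(ngram, maximum))
        return distance

    if __name__ == '__main__':
        python_profile = [('def', 42), ('imp', 30), ('sel', 25)]   # pretend model profile
        unknown_profile = [('def', 7), ('sel', 5), ('imp', 4)]     # pretend test profile
        print(out_of_place_distance(python_profile, unknown_profile))  # prints 2

The smaller the sum of rank differences, the closer the test file's profile is to the language model, which is why _guess_file_language returns the candidates sorted by ascending distance.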
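
For reference, the __main__ block of the new swh/langdetect/ngramdist.py accepts two forms of invocation; the dataset paths below are placeholders, not taken from the patch:

    python3 swh/langdetect/ngramdist.py --train /path/to/dataset/root
    python3 swh/langdetect/ngramdist.py --test /path/to/dataset/root /path/to/dataset/code_by_language/Python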