""" Here regroup basic preprocessing methods used in learning stage for different approaches. """ import re, os _re_string = re.compile(r"""("(\\.|[^"\\])*"|'(\\.|[^'\\])*')""") _re_number = re.compile(r'([\d]+)|([\d]+.[\d]+)[^A-Za-z]') _re_separator = re.compile(r'(\W)') _not_start_with_point = lambda x: not x.startswith('.') def tokenizer(text, re_name): ''' Splits text into tokens ''' if re_name == 'letter': return list(text) elif re_name == 'word': return [word for word in _re_separator.split(text) if word.strip(' \t')] def file_to_string(filename): """ Read a file to a string. """ with open(filename, 'rb') as f: data = f.read() return replace_string_and_number(data) def count_files(root_language): all_folders = natural_sort(filter (_not_start_with_point, os.listdir(root_language))) files = natural_sort(filter (_not_start_with_point, os.listdir(root_language + '/' + all_folders[-1]))) (max,_) = os.path.splitext(files[-1]) return int(max) def find_file(root_language, n): '''Find the n-th file in language folder''' if n > count_files(root_language): return '' else: start = (n - 1) // 1000 * 1000 + 1 end = start + 999 root_count = root_language + '/' + str(start) + '-' + str(end) files = natural_sort(filter (_not_start_with_point, os.listdir(root_count))) return root_count + '/' + files[n - start] def replace_string_and_number(text): """ Replace strings and numbers in a file by special tokens """ # str_replaced = re.sub(_re_string, '__str__', text) # str_num_replaced = re.sub(_re_number, '__num__', str_replaced) str_num_replaced = text return str_num_replaced def natural_sort(l): convert = lambda text: int(text) if text.isdigit() else text.lower() alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] return sorted(l, key = alphanum_key) def remove_comment(text): # TODO: remove only inline comments and block comments # TODO: maybe build a list of comment markers pass def purify(text, lang): # TODO: for some language like HTML, remove code other than principal language pass