"""
Basic preprocessing helpers shared by the
different approaches during the learning
stage.

"""

import re, os

# Quoted string literal: double- or single-quoted, backslash escapes allowed.
_re_string = re.compile(r"""("(\\.|[^"\\])*"|'(\\.|[^'\\])*')""")
# Digit runs.
# NOTE(review): the first alternative already matches any digit run, so the
# second alternative is never tried; its '.' is also unescaped and matches
# any character. Left untouched because the pattern is only referenced from
# commented-out code in this file -- confirm before relying on it.
_re_number = re.compile(r'([\d]+)|([\d]+.[\d]+)[^A-Za-z]')
# A single non-word character; the capturing group makes re.split keep it.
_re_separator = re.compile(r'(\W)')


def _not_start_with_point(x):
    """Return True when ``x`` does not begin with a dot."""
    return not x.startswith('.')

def tokenizer(text, re_name):
    """Split ``text`` into a list of tokens.

    ``re_name`` selects the granularity:

    * ``'letter'`` -- one token per element of ``text`` (``list(text)``).
    * ``'word'``   -- ``text`` is split on every non-word character; the
      separators themselves are kept as tokens, except that empty pieces
      and pieces made only of spaces/tabs are dropped (newlines are kept).

    Any other ``re_name`` yields ``None``.
    """
    if re_name == 'letter':
        return list(text)
    if re_name != 'word':
        # Unknown granularity: nothing is produced.
        return None
    pieces = _re_separator.split(text)
    return [piece for piece in pieces if piece.strip(' \t')]

def file_to_string(filename):
    """Read the whole content of ``filename`` and return it.

    The file is opened in binary mode and its content is passed through
    ``replace_string_and_number`` before being returned.

    NOTE(review): because of the ``'rb'`` mode, under Python 3 the content
    is ``bytes`` rather than ``str`` -- confirm this is what callers expect.
    """
    with open(filename, 'rb') as handle:
        raw_content = handle.read()
    return replace_string_and_number(raw_content)

def count_files(root_language):
    """Return the number carried by the last file of a language folder.

    Entries whose name starts with a dot are ignored. The sub-folders of
    ``root_language`` are ordered with ``natural_sort``; the last one is
    listed and ordered the same way, and the name of its last file, minus
    its extension, is converted to ``int``.

    Raises ``IndexError`` when either listing has no visible entry and
    ``ValueError`` when the last file's stem is not an integer.
    """
    visible_folders = natural_sort(
        name for name in os.listdir(root_language)
        if _not_start_with_point(name))
    last_folder = root_language + '/' + visible_folders[-1]
    visible_files = natural_sort(
        name for name in os.listdir(last_folder)
        if _not_start_with_point(name))
    stem, _extension = os.path.splitext(visible_files[-1])
    return int(stem)

def find_file(root_language, n):
    '''Find the n-th file in language folder.

    Files are looked up in sub-folders of ``root_language`` named
    ``<start>-<end>`` that each cover 1000 consecutive positions
    (``1-1000``, ``1001-2000``, ...). ``n`` is 1-based.

    Returns the path ``root_language/<start>-<end>/<file>``, or ``''``
    when ``n`` is out of range (below 1 or above ``count_files``).

    NOTE(review): the file is picked by its index in the naturally sorted
    listing, which presumably relies on every bucket holding a gap-free
    run of files -- confirm against the data layout.
    '''
    # Positions below 1 would map to a non-existent bucket such as
    # "-999-0"; treat them like any other out-of-range request. The check
    # comes first so no directory is listed for an invalid position.
    if n < 1 or n > count_files(root_language):
        return ''
    bucket_size = 1000
    start = (n - 1) // bucket_size * bucket_size + 1
    end = start + bucket_size - 1
    root_count = root_language + '/' + str(start) + '-' + str(end)
    files = natural_sort(filter
                         (_not_start_with_point,
                          os.listdir(root_count)))
    return root_count + '/' + files[n - start]

def replace_string_and_number(text):
    """Return ``text`` as is.

    Placeholder for replacing string and number literals by special
    tokens; that substitution is currently switched off, so the input
    comes back unchanged.
    """
    # Disabled substitution, kept for reference:
    #   str_replaced = re.sub(_re_string, '__str__', text)
    #   str_num_replaced = re.sub(_re_number, '__num__', str_replaced)
    return text

def natural_sort(l):
    """Return the strings of ``l`` sorted in natural (human) order.

    Runs of ASCII digits are compared by numeric value and everything
    else case-insensitively, so ``'file2'`` sorts before ``'file10'``.
    ``l`` may be any iterable of strings; a new list is returned.
    """
    def alphanum_key(key):
        # Splitting on a capturing group alternates text and digit runs:
        # even positions are text (possibly empty), odd positions are the
        # captured [0-9]+ runs. Deciding by position rather than
        # str.isdigit() keeps non-ASCII digits (e.g. a superscript two) as
        # text, so int() cannot fail and every key has the same type at
        # the same position.
        chunks = re.split('([0-9]+)', key)
        return [int(chunk) if index % 2 else chunk.lower()
                for index, chunk in enumerate(chunks)]

    return sorted(l, key=alphanum_key)

def remove_comment(text):
    """Not implemented yet: does nothing and returns ``None``."""
    # TODO: remove only inline comments and block comments
    # TODO: maybe build a list of comment markers
    return None

def purify(text, lang):
    """Not implemented yet: does nothing and returns ``None``."""
    # TODO: for some language like HTML, remove code other than principal language
    return None