Page MenuHomeSoftware Heritage

common.py
No OneTemporary

common.py

"""
Basic preprocessing helpers shared by the
different approaches during the learning
stage.
"""
import re, os
# Quoted string literal (double or single quotes) with backslash escapes.
_re_string = re.compile(r"""("(\\.|[^"\\])*"|'(\\.|[^'\\])*')""")
# Numeric literal pattern.
# NOTE(review): the '.' is unescaped, and the second alternative can only
# match where the first one already does, so it looks unreachable -- confirm
# the intent before re-enabling the code that uses this pattern.
_re_number = re.compile(r'([\d]+)|([\d]+.[\d]+)[^A-Za-z]')
# One non-word character; the capturing group makes split() keep separators.
_re_separator = re.compile(r'(\W)')


def _not_start_with_point(x):
    """Return True when the name *x* does not begin with a dot."""
    return not x.startswith('.')
def tokenizer(text, re_name):
    """Split *text* into a list of tokens.

    With ``re_name == 'letter'`` every character is its own token.
    With ``re_name == 'word'`` the text is split on single non-word
    characters (``_re_separator``); the separators themselves are kept as
    tokens, while pieces that are empty or consist only of spaces and tabs
    are dropped. Any other ``re_name`` yields ``None``.
    """
    if re_name == 'letter':
        return list(text)
    if re_name == 'word':
        tokens = []
        for piece in _re_separator.split(text):
            # Newlines survive this check: only ' ' and '\t' are stripped.
            if piece.strip(' \t'):
                tokens.append(piece)
        return tokens
    return None
def file_to_string(filename):
    """Return the preprocessed content of *filename*.

    The file is opened in binary mode ('rb'), read in full, and the result
    is passed through ``replace_string_and_number`` before being returned.
    """
    with open(filename, 'rb') as handle:
        return replace_string_and_number(handle.read())
def count_files(root_language):
    """Return the highest file number found under *root_language*.

    The naturally-last sub-folder of *root_language* is listed, and the
    stem of its naturally-last file name is converted to an int. Entries
    whose name starts with '.' are ignored at both levels.
    NOTE(review): presumably files are numbered consecutively from 1, so
    this number doubles as the file count -- confirm against the dataset.
    """
    folders = natural_sort(
        name for name in os.listdir(root_language)
        if _not_start_with_point(name))
    last_folder = '/'.join((root_language, folders[-1]))
    names = natural_sort(
        name for name in os.listdir(last_folder)
        if _not_start_with_point(name))
    stem, _ext = os.path.splitext(names[-1])
    return int(stem)
def find_file(root_language, n):
    """Find the n-th file in a language folder.

    Files are looked up in sub-folders named '<start>-<end>' that each cover
    1000 consecutive numbers ('1-1000', '1001-2000', ...). *n* is 1-based.

    Returns the path of the n-th file, built with '/' separators, or ''
    when *n* is out of range (below 1 or above ``count_files``).
    """
    # Check n < 1 first: it needs no filesystem access, and without it a
    # non-positive n would be turned into a bogus folder name like '-999-0'.
    if n < 1 or n > count_files(root_language):
        return ''
    start = (n - 1) // 1000 * 1000 + 1
    end = start + 999
    root_count = root_language + '/' + str(start) + '-' + str(end)
    files = natural_sort(
        filter(_not_start_with_point, os.listdir(root_count)))
    # Position inside the folder is the offset from the folder's first number.
    return root_count + '/' + files[n - start]
def replace_string_and_number(text):
    """Return *text* unchanged.

    Placeholder for replacing string and number literals by the special
    tokens '__str__' and '__num__'; that substitution is currently disabled.
    """
    # Disabled implementation, kept for reference:
    # str_replaced = re.sub(_re_string, '__str__', text)
    # str_num_replaced = re.sub(_re_number, '__num__', str_replaced)
    return text
def natural_sort(l):
    """Return the strings of *l* sorted in natural (human) order.

    Runs of ASCII digits are compared by numeric value and the remaining
    text case-insensitively, so '2.py' sorts before '10.py'. *l* may be any
    iterable of strings; a new list is returned.
    """
    def alphanum_key(key):
        # Splitting on a capturing group alternates text and digit runs:
        # even positions hold text, odd positions hold the captured digits.
        # Converting by position (not by str.isdigit) keeps every slot's
        # type fixed and never feeds int() a Unicode digit such as '\u00b2'.
        chunks = re.split(r'([0-9]+)', key)
        return [int(chunk) if index % 2 else chunk.lower()
                for index, chunk in enumerate(chunks)]

    return sorted(l, key=alphanum_key)
def remove_comment(text):
    """Placeholder: comment removal is not implemented; returns None."""
    # TODO: remove only inline comments and block comments
    # TODO: maybe build a list of comment markers
    return None
def purify(text, lang):
    """Placeholder: stripping embedded foreign code is not implemented; returns None."""
    # TODO: for some language like HTML, remove code other than principal language
    return None

File Metadata

Mime Type
text/plain
Expires
Wed, Jun 4, 7:19 PM (1 d, 11 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3399063

Event Timeline