common.py
View Options

	"""
	Basic preprocessing helpers shared by the
	different approaches during the learning
	stage.

	"""

	import re, os

	# Quoted string literal, double- or single-quoted. The alternation uses a
	# plain `|`; an escaped `\|` would only match a literal pipe character.
	# NOTE(review): the inner group has no `*`, so only literals holding exactly
	# one character (or one backslash escape) match -- confirm this is intended.
	_re_string = re.compile(r"""("(\\.|[^"\\])"|'(\\.|[^'\\])')""")
	# Run of digits, or digits + any character + digits followed by a non-letter.
	# NOTE(review): the `.` is unescaped and the first alternative always wins on
	# the integer part; revisit before the number substitution is re-enabled.
	_re_number = re.compile(r'([\d]+)|([\d]+.[\d]+)[^A-Za-z]')
	# Any single non-word character; the capturing group makes re.split() keep
	# the separators in its result.
	_re_separator = re.compile(r'(\W)')


	def _not_start_with_point(x):
		"""Return True when *x* does not start with a dot.

		Used as a filter predicate on directory listings to skip dot-prefixed
		entries.
		"""
		return not x.startswith('.')

	def tokenizer(text, re_name):
		"""Split *text* into tokens according to the *re_name* mode.

		'letter' gives one token per character. 'word' splits on the module's
		separator pattern and drops pieces that are empty or made only of
		spaces and tabs. Any other mode gives None.
		"""
		if re_name == 'letter':
			return list(text)
		if re_name != 'word':
			return None
		pieces = _re_separator.split(text)
		return [piece for piece in pieces if piece.strip(' \t')]

	def file_to_string(filename):
		"""Read *filename* and return its content after token substitution.

		The file is opened in binary mode ('rb'); whatever read() yields is
		passed through replace_string_and_number() and returned.
		"""
		with open(filename, 'rb') as source:
			return replace_string_and_number(source.read())

	def count_files(root_language):
		"""Return the number carried by the last file under *root_language*.

		Sub-folders and their files are ordered with natural_sort(), ignoring
		dot-prefixed entries. The result is the integer value of the last
		file's name without its extension. Returns 0 when no folder contains
		a file.

		Raises:
			OSError: if a directory cannot be listed.
			ValueError: if the last file's stem is not an integer.
		"""
		folders = natural_sort(
			filter(_not_start_with_point, os.listdir(root_language)))
		# Start from the last folder; fall back to earlier ones if it is empty
		# instead of failing with an IndexError.
		for folder in reversed(folders):
			names = natural_sort(
				filter(_not_start_with_point,
					   os.listdir(root_language + '/' + folder)))
			if names:
				# Local name avoids shadowing the builtin max().
				stem, _ = os.path.splitext(names[-1])
				return int(stem)
		return 0

	def find_file(root_language, n, folder_size=1000):
		"""Return the path of the n-th file in the language folder.

		Files are looked up in sub-folders named '<start>-<end>', each covering
		*folder_size* consecutive numbers (1-1000, 1001-2000, ... by default).

		Args:
			root_language: directory holding the numbered sub-folders.
			n: 1-based file number.
			folder_size: how many file numbers one sub-folder spans.

		Returns:
			The file path built with '/' separators, or '' when *n* is below 1
			or greater than count_files(root_language).
		"""
		# Check the lower bound first: n <= 0 would otherwise produce a folder
		# name that does not exist and fail inside os.listdir().
		if n < 1 or n > count_files(root_language):
			return ''
		start = (n - 1) // folder_size * folder_size + 1
		end = start + folder_size - 1
		root_count = root_language + '/' + str(start) + '-' + str(end)
		files = natural_sort(
			filter(_not_start_with_point, os.listdir(root_count)))
		# Positional lookup. NOTE(review): this assumes the folder holds one
		# file per number starting at `start`, with no gaps -- confirm.
		return root_count + '/' + files[n - start]

	def replace_string_and_number(text):
		"""Return *text* unchanged.

		Meant to replace strings and numbers by special tokens; that
		substitution is currently disabled, so the input is passed through.
		"""
		# TODO: re-enable the substitution:
		#   text = re.sub(_re_string, '__str__', text)
		#   text = re.sub(_re_number, '__num__', text)
		return text

	def natural_sort(l):
		"""Return the strings of *l* sorted in natural (human) order.

		Runs of ASCII digits compare by numeric value and the rest compares
		case-insensitively, so 'file2' sorts before 'file10'. *l* may be any
		iterable of strings; a new list is returned.
		"""
		def alphanum_key(key):
			# re.split() with a capturing group alternates text, digits,
			# text, ...: odd positions are always the [0-9]+ runs. Deciding by
			# position instead of str.isdigit() avoids int() failing on
			# characters such as superscripts, and keeps every position
			# type-consistent across keys.
			parts = re.split('([0-9]+)', key)
			return [int(part) if index % 2 else part.lower()
					for index, part in enumerate(parts)]
		return sorted(l, key=alphanum_key)

	def remove_comment(text):
		"""Placeholder for comment removal; not implemented, returns None.

		TODO: remove only inline comments and block comments
		TODO: maybe build a list of comment markers
		"""
		return None

	def purify(text, lang):
		"""Placeholder for language purification; not implemented, returns None.

		TODO: for some language like HTML, remove code other than principal
		language
		"""
		return None

File Metadata

Mime Type: text/plain
Expires: Wed, Jun 4, 7:19 PM (1 d, 11 h ago)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 3399063

common.py
No OneTemporary
Actions

common.py
View Options

File Metadata

Event Timeline

common.pyNo OneTemporaryActions

common.pyView Options

File Metadata

Event Timeline

common.py
No OneTemporary
Actions

common.py
View Options