Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F8393919
common.py
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
2 KB
Subscribers
None
common.py
View Options
"""
Basic preprocessing helpers shared by the different
approaches during the learning stage.
"""
import
re
,
os
# A complete quoted literal, either "..." or '...', in which a backslash
# escapes the character that follows it.
_re_string = re.compile(
    r"""("(\\.|[^"\\])*"|'(\\.|[^'\\])*')"""
)

# NOTE(review): the '.' in the second alternative is unescaped (matches any
# character), and because the first alternative already matches every digit
# run, the second alternative is never selected. Pattern kept as-is; confirm
# the intended behaviour before re-enabling its use.
_re_number = re.compile(
    r'([\d]+)|([\d]+.[\d]+)[^A-Za-z]'
)

# One non-word character. The capturing group makes ``split`` keep the
# separators in its result.
_re_separator = re.compile(
    r'(\W)'
)


def _not_start_with_point(x):
    """Return True when *x* does not begin with a '.' character."""
    return not x.startswith('.')
def tokenizer(text, re_name):
    """Split *text* into a list of tokens.

    *re_name* selects the granularity:

    * ``'letter'`` -- every element of *text* becomes its own token.
    * ``'word'``   -- *text* is split on ``_re_separator``; the separators
      themselves are kept as tokens, while pieces that are empty once tabs
      are stripped are discarded.

    Any other value of *re_name* yields ``None``.
    """
    if re_name == 'letter':
        return list(text)
    if re_name != 'word':
        return None

    tokens = []
    for piece in _re_separator.split(text):
        # NOTE(review): only tab characters are stripped here, so a lone
        # space is kept as a token -- confirm that is intended.
        if piece.strip('\t'):
            tokens.append(piece)
    return tokens
def file_to_string(filename):
    """Load *filename* and return its preprocessed content.

    The file is opened in binary mode (``'rb'``); whatever ``read()``
    returns is passed to ``replace_string_and_number`` and that result
    is returned to the caller.
    """
    with open(filename, 'rb') as handle:
        raw = handle.read()
    return replace_string_and_number(raw)
def count_files(root_language):
    """Return the number carried by the last file of a language folder.

    Entries whose name starts with '.' are ignored. The sub-folders of
    *root_language* are put in natural order, the last one is listed, and
    the name of its naturally-last file (extension removed) is converted
    to ``int``.

    Raises ``IndexError`` when either listing is empty and ``ValueError``
    when the file stem is not an integer.
    """
    folders = natural_sort(
        filter(_not_start_with_point, os.listdir(root_language))
    )
    last_folder = '/'.join((root_language, folders[-1]))
    names = natural_sort(
        filter(_not_start_with_point, os.listdir(last_folder))
    )
    stem, _ext = os.path.splitext(names[-1])
    return int(stem)
def find_file(root_language, n):
    """Return the path of the n-th file in a language folder.

    Files are looked up in sub-folders named ``<start>-<end>`` that each
    span 1000 consecutive numbers (``1-1000``, ``1001-2000``, ...).

    Parameters:
        root_language: path of the language folder.
        n: 1-based number of the wanted file.

    Returns:
        ``root_language/<start>-<end>/<file name>``, or ``''`` when *n* is
        smaller than 1 or larger than ``count_files(root_language)``.
    """
    # Numbering starts at 1. Without this guard a non-positive n would build
    # a folder name such as '-999-0' and fail inside os.listdir instead of
    # reporting "not found" like any other out-of-range n.
    if n < 1 or n > count_files(root_language):
        return ''

    # First and last file numbers covered by the sub-folder that holds n.
    start = (n - 1) // 1000 * 1000 + 1
    end = start + 999
    root_count = root_language + '/' + str(start) + '-' + str(end)

    files = natural_sort(
        filter(_not_start_with_point, os.listdir(root_count))
    )
    # NOTE(review): indexing by position assumes the sub-folder contains
    # every number from `start` upwards with no gaps -- confirm.
    return root_count + '/' + files[n - start]
def replace_string_and_number(text):
    """Return *text* unchanged.

    This is the hook where string and number literals would be replaced
    by special tokens; that substitution is currently disabled, so the
    input is handed back as-is.
    """
    # Disabled substitution, kept for reference:
    #   str_replaced = re.sub(_re_string, '__str__', text)
    #   str_num_replaced = re.sub(_re_number, '__num__', str_replaced)
    return text
def natural_sort(l):
    """Return the strings of *l* as a new list in natural order.

    Runs of ASCII digits are compared by numeric value and everything else
    is compared case-insensitively, so ``'file2'`` sorts before
    ``'file10'``. *l* may be any iterable of strings (a ``filter`` object
    included).
    """
    def alphanum_key(key):
        # Splitting on a capturing group alternates text and digit runs:
        # even positions hold text (possibly empty), odd positions hold the
        # matched [0-9]+ runs.
        parts = re.split('([0-9]+)', key)
        # Choosing int() by position rather than str.isdigit() keeps
        # non-ASCII digit characters (e.g. superscripts) as text; isdigit()
        # accepts them, which makes int() fail or mixes int and str at the
        # same key position.
        return [
            int(part) if index % 2 else part.lower()
            for index, part in enumerate(parts)
        ]

    return sorted(l, key=alphanum_key)
def remove_comment(text):
    """Placeholder: comment removal is not implemented yet.

    Does nothing with *text* and returns ``None``.
    """
    # TODO: remove only inline comments and block comments
    # TODO: maybe build a list of comment markers
    return None
def purify(text, lang):
    """Placeholder: language purification is not implemented yet.

    Does nothing with *text* or *lang* and returns ``None``.
    """
    # TODO: for some language like HTML, remove code other than principal language
    return None
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Wed, Jun 4, 7:19 PM (1 d, 11 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3399063
Attached To
R131 Internship - Large-scale progamming language detection
Event Timeline
Log In to Comment