diff --git a/scripts/dataset/ground_truth_arrange.py b/scripts/dataset/ground_truth_arrange.py
index 47e2dd3..65b09f0 100644
--- a/scripts/dataset/ground_truth_arrange.py
+++ b/scripts/dataset/ground_truth_arrange.py
@@ -1,107 +1,113 @@
 # Copyright (C) 2015-2016 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 #!/usr/bin/env python3
 # coding: utf-8
 
 import os
 import sys
 import io
 import json
 
 def main(root):
     root_ground_truth = root + '/../ground_truth'
     root_ground_truth_text = root_ground_truth + '_text'
     root_code_by_language = root + '/../code_by_language'
     counts = dict()
     try:
         os.mkdir(root_code_by_language)
     except FileExistsError:
         pass
     for r in os.listdir(root):
         if not r.startswith('.'):
             for d in os.listdir(root + '/' + r):
                 if not d.startswith('.'):
                     try:
                         ground_truth = io.open(root_ground_truth + '/' + r + '/' + d + '.json')
                         try:
                             j = json.load(ground_truth)
                             for language in j.keys():
                                 root_language = root_code_by_language + '/' + language
                                 try:
                                     os.mkdir(root_language)
                                 except FileExistsError:
                                     pass
                                 for f in j.get(language):
                                     counts[language] = counts.get(language, 0) + 1
                                     start = (counts[language] - 1) // 1000 * 1000 + 1
                                     end = start + 999
                                     root_count = root_language + '/' + str(start) + '-' + str(end)
                                     if counts[language] % 1000 == 1:
                                         try:
                                             os.mkdir(root_count)
                                         except FileExistsError:
                                             pass
                                     (_,ext) = os.path.splitext(f)
                                     new_name = str(counts[language]) + ext
                                     copy_src = root + '/' + r + '/' + d + '/' + f
                                     copy_des = root_count + '/' + new_name
-                                    os.symlink(copy_src, copy_des)
-                                    print(copy_src + ' successfully copied.')
+                                    try:
+                                        os.symlink(copy_src, copy_des)
+                                        print('{} successfully copied.'.format(copy_src))
+                                    except FileExistsError:
+                                        pass
                         except json.decoder.JSONDecodeError:
                             ground_truth.close()
                             ground_truth = io.open(root_ground_truth_text + '/' + r + '/' + d, 'r')
                             while(True):
                                 line = ground_truth.readline()
                                 if line == '\n' or line == '':
                                     break
                                 else:
                                     pass
                             while(True):
                                 line = ground_truth.readline()
                                 stripped = line.strip()
                                 if line == '':
                                     break
                                 else:
                                     stripped = line.strip()
                                     language = stripped.replace(':','')
                                     root_language = root_code_by_language + '/' + language
                                     try:
                                         os.mkdir(root_language)
                                     except FileExistsError:
                                         pass
                                     while(True):
                                         line = ground_truth.readline()
                                         if line == '\n':
                                             break
                                         else:
                                             f = line.strip()
                                             counts[language] = counts.get(language, 0) + 1
                                             start = (counts[language] - 1) // 1000 * 1000 + 1
                                             end = start + 999
                                             root_count = root_language + '/' + str(start) + '-' + str(end)
                                             if counts[language] % 1000 == 1:
                                                 try:
                                                     os.mkdir(root_count)
                                                 except FileExistsError:
                                                     pass
                                             (_,ext) = os.path.splitext(f)
                                             new_name = str(counts[language]) + ext
                                             copy_src = root + '/' + r + '/' + d + '/' + f
                                             copy_des = root_count + '/' + new_name
-                                            os.symlink(copy_src, copy_des)
-                                            print('{} successfully copied.'.format(copy_src))
+                                            try:
+                                                os.symlink(copy_src, copy_des)
+                                                print('{} successfully copied.'.format(copy_src))
+                                            except FileExistsError:
+                                                pass
                         finally:
                             ground_truth.close()
 
 if __name__ == '__main__':
     if len(sys.argv) != 2:
         print('Only argument acceptable is a path.')
     else:
         main(sys.argv[1])
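
Note on the layout produced above: files are numbered consecutively per
language, and file number n goes into a bucket directory named
'<start>-<end>' that holds 1000 files, with start = (n - 1) // 1000 * 1000 + 1.
A doctest-style sketch of that arithmetic (bucket_name is a hypothetical
helper, not part of the patch):

    >>> def bucket_name(n):
    ...     # same arithmetic as ground_truth_arrange.py above
    ...     start = (n - 1) // 1000 * 1000 + 1
    ...     return '{}-{}'.format(start, start + 999)
    >>> bucket_name(1), bucket_name(1000), bucket_name(1001)
    ('1-1000', '1-1000', '1001-2000')
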
diff --git a/swh/langdetect/__init__.py b/swh/langdetect/__init__.py
index db2d42b..5f8841b 100644
--- a/swh/langdetect/__init__.py
+++ b/swh/langdetect/__init__.py
@@ -1,4 +1,5 @@
"""
Detectlang detects the programming language of source code file.
"""
+
diff --git a/swh/langdetect/ngram.py b/swh/langdetect/ngram.py
new file mode 100644
index 0000000..9db7303
--- /dev/null
+++ b/swh/langdetect/ngram.py
@@ -0,0 +1,63 @@
+"""
+Calculate frequencies for classes
+"""
+
+import os,sys,nltk
+
+from collections import Counter
+from nltk.util import ngrams
+from utils.common import tokenizer, file_to_string
+from utils.training import build_training_set
+
+class NGramTrain:
+
+ def __init__(self, root):
+ self._ngram_list = {}
+ self._prob = {}
+ self._root = root
+
+ def train(self):
+ root_training_set = self._root + '/../training_set'
+ root_model = '../../dataset/model'
+
+ try:
+ if len(os.listdir(root_training_set)) == 0:
+ build_training_set(self._root)
+ except FileNotFoundError:
+ os.mkdir(root_training_set)
+ build_training_set(self._root)
+
+ for language in os.listdir(root_training_set):
+ if not language.startswith('.'):
+ _ngram_list[language] = {}
+ _prob[language] = None
+ root_training_set_language = root_training_set + '/' + language
+ for f in os.listdir(root_training_set_language):
+ if not f.startswith('.'):
+ _ngrams_max(language, f)
+
+ def _ngrams_max(self, language, filename, n=3):
+ ngram_lan = self._ngram_list[language]
+ tokens = tokenizer(file_to_string(filename))
+ for i in range(1, n):
+ ngram_lan[i + 1] = ngram_lan.get(i + 1, []) + \
+ list(ngram(tokens, i + 1,
+ pad_left = True,
+ pad_right = True,
+ left_pad_symbol = '$BOS$',
+ right_pad_symbol = '$EOS$'))
+
+ def _kneser_key_prob(self, language):
+ c = Counter()
+ ngram_lan = self._ngram_list[language]
+ for key in self._ngram_lan.keys():
+ c = c + Counter(self.ngram_list[key])
+ freq_dist = nltk.FreqDist(c)
+ self._prob[language] = nltk.KneserNeyProbDist(freq_dist)
+
+if __name__ == '__main__':
+ if len(sys.argv) != 2:
+ print('Only argument acceptable is a path.')
+ else:
+ model = NGramTrain(sys.argv[1])
+ model.train()
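
For reference, the padded n-grams that _ngrams_max accumulates look like
this; a minimal sketch calling nltk.util.ngrams directly on a toy token
list (the '$BOS$'/'$EOS$' symbols are the ones used above):

    >>> from nltk.util import ngrams
    >>> list(ngrams(['a', 'b'], 2, pad_left=True, pad_right=True,
    ...             left_pad_symbol='$BOS$', right_pad_symbol='$EOS$'))
    [('$BOS$', 'a'), ('a', 'b'), ('b', '$EOS$')]
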
diff --git a/swh/langdetect/utils.py b/swh/langdetect/utils.py
deleted file mode 100644
index 70d84c1..0000000
--- a/swh/langdetect/utils.py
+++ /dev/null
@@ -1,36 +0,0 @@
-"""
-Here regroup basic preprocessing methods
-used in learning stage for different
-approaches.
-
-"""
-
-import re
-
-_re_string = re.compile(r"""("(\\.|[^"\\])*"|'(\\.|[^'\\])*')""")
-_re_number = re.compile(r'([\d]+)|([\d]+.[\d]+)[^A-Za-z]')
-
-def replace_string_and_number(text):
- """ Replace strings and numbers in a file by special tokens
- """
- str_replaced = re.sub(_re_string, '__str__', text)
- str_num_replaced = re.sub(_re_number, '__num__', str_replaced)
- return str_num_replaced
-
-def remove_comment(text):
- # TODO: remove only inline comments and block comments
- # TODO: maybe build a list of comment markers
- pass
-
-def purify(text, lang):
- # TODO: for some language like HTML, remove code other than principal language
- pass
-
-
-
-
-
-
-
-
-
diff --git a/swh/langdetect/utils/common.py b/swh/langdetect/utils/common.py
new file mode 100644
index 0000000..37648f9
--- /dev/null
+++ b/swh/langdetect/utils/common.py
@@ -0,0 +1,75 @@
+"""
+Here regroup basic preprocessing methods
+used in learning stage for different
+approaches.
+
+"""
+
+import re, os
+
+_re_string = re.compile(r"""("(\\.|[^"\\])*"|'(\\.|[^'\\])*')""")
+_re_number = re.compile(r'([\d]+)|([\d]+.[\d]+)[^A-Za-z]')
+_re_separator = re.compile(r'(\W)')
+_not_start_with_point = lambda x: not x.startswith('.')
+
+def tokenizer(text):
+ ''' Splits text into tokens '''
+ return [word for word in _re_separator.split(text) if word.strip(' \t')]
+
+def file_to_string(filename):
+ """ Read a file to a string. """
+ with open(filename, 'r') as f:
+ data = f.read().replace('\n',' ').lower
+ return data
+
+def count_files(root_language):
+ all_folders = natural_sort(filter
+ (_not_start_with_point,
+ os.listdir(root_language)))
+ files = natural_sort(filter
+ (_not_start_with_point,
+ os.listdir(root_language + '/' + all_folders[-1])))
+ (max,_) = os.path.splitext(files[-1])
+ return int(max)
+
+def find_file(root_language, n):
+ '''Find the n-th file in language folder'''
+ if n > count_files(root_language):
+ return ''
+ else:
+ start = (n - 1) // 1000 * 1000 + 1
+ end = start + 999
+ root_count = root_language + '/' + str(start) + '-' + str(end)
+ files = natural_sort(filter
+ (_not_start_with_point,
+ os.listdir(root_count)))
+ return root_count + '/' + files[n - start]
+
+def replace_string_and_number(text):
+ """ Replace strings and numbers in a file by special tokens
+ """
+ str_replaced = re.sub(_re_string, '__str__', text)
+ str_num_replaced = re.sub(_re_number, '__num__', str_replaced)
+ return str_num_replaced
+
+def natural_sort(l):
+ convert = lambda text: int(text) if text.isdigit() else text.lower()
+ alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ]
+ return sorted(l, key = alphanum_key)
+
+def remove_comment(text):
+ # TODO: remove only inline comments and block comments
+ # TODO: maybe build a list of comment markers
+ pass
+
+def purify(text, lang):
+ # TODO: for some language like HTML, remove code other than principal language
+ pass
+
+
+
+
+
+
+
+
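
natural_sort is what lets count_files pick the highest bucket: it orders
the '<start>-<end>' directory names numerically rather than
lexicographically. A toy example of the intended behaviour:

    >>> natural_sort(['1001-2000', '10001-11000', '1-1000', '2001-3000'])
    ['1-1000', '1001-2000', '2001-3000', '10001-11000']
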
diff --git a/swh/langdetect/utils/training.py b/swh/langdetect/utils/training.py
new file mode 100644
index 0000000..07872ab
--- /dev/null
+++ b/swh/langdetect/utils/training.py
@@ -0,0 +1,35 @@
+import os,random
+
+from utils.common import count_files, find_file
+
+def build_training_set(root):
+    root_code = root + '/../code_by_language'
+    root_training = root + '/../training_set'
+    for language in os.listdir(root_code):
+        if not language.startswith('.'):
+            root_language = root_code + '/' + language
+            root_training_language = root_training + '/' + language
+            build_language_training_set(count_files(root_language),
+                                        root_language,
+                                        root_training_language)
+
+def build_language_training_set(total, root_language, root_training_language):
+    # limit defines the size of training set
+    # upper defines the maximum size
+
+    try:
+        os.mkdir(root_training_language)
+    except FileExistsError:
+        pass
+
+    upper = 4000
+    if total >= upper:
+        limit = upper // 2
+    else:
+        limit = total // 2
+    indices = random.sample(range(1, total + 1), limit)
+    files = map(lambda x : find_file(root_language, x), indices)
+    for src in files:
+        basename = os.path.basename(src)
+        des = root_training_language + '/' + basename
+        os.symlink(src, des)
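
build_language_training_set samples half of each language's corpus,
capped at upper // 2 = 2000 files. An illustrative check of the size
logic with hypothetical counts (not part of the patch):

    >>> import random
    >>> total, upper = 5000, 4000
    >>> limit = upper // 2 if total >= upper else total // 2
    >>> limit
    2000
    >>> len(random.sample(range(1, total + 1), limit))
    2000
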