
diff --git a/scripts/dataset/ground_truth_arrange.py b/scripts/dataset/ground_truth_arrange.py
index 47e2dd3..65b09f0 100644
--- a/scripts/dataset/ground_truth_arrange.py
+++ b/scripts/dataset/ground_truth_arrange.py
@@ -1,107 +1,113 @@
 # Copyright (C) 2015-2016 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 #!/usr/bin/env python3
 # coding: utf-8
 
 import os
 import sys
 import io
 import json
 
 
 def main(root):
     root_ground_truth = root + '/../ground_truth'
     root_ground_truth_text = root_ground_truth + '_text'
     root_code_by_language = root + '/../code_by_language'
     counts = dict()
     try:
         os.mkdir(root_code_by_language)
     except FileExistsError:
         pass
     for r in os.listdir(root):
         if not r.startswith('.'):
             for d in os.listdir(root + '/' + r):
                 if not d.startswith('.'):
                     try:
                         ground_truth = io.open(root_ground_truth + '/' + r + '/' + d + '.json')
                         try:
                             j = json.load(ground_truth)
                             for language in j.keys():
                                 root_language = root_code_by_language + '/' + language
                                 try:
                                     os.mkdir(root_language)
                                 except FileExistsError:
                                     pass
                                 for f in j.get(language):
                                     counts[language] = counts.get(language, 0) + 1
                                     start = (counts[language] - 1) // 1000 * 1000 + 1
                                     end = start + 999
                                     root_count = root_language + '/' + str(start) + '-' + str(end)
                                     if counts[language] % 1000 == 1:
                                         try:
                                             os.mkdir(root_count)
                                         except FileExistsError:
                                             pass
                                     (_,ext) = os.path.splitext(f)
                                     new_name = str(counts[language]) + ext
                                     copy_src = root + '/' + r + '/' + d + '/' + f
                                     copy_des = root_count + '/' + new_name
-                                    os.symlink(copy_src, copy_des)
-                                    print(copy_src + ' successfully copied.')
+                                    try:
+                                        os.symlink(copy_src, copy_des)
+                                        print('{} successfully copied.'.format(copy_src))
+                                    except FileExistsError:
+                                        pass
                         except json.decoder.JSONDecodeError:
                             ground_truth.close()
                             ground_truth = io.open(root_ground_truth_text + '/' + r + '/' + d, 'r')
                             while(True):
                                 line = ground_truth.readline()
                                 if line == '\n' or line == '':
                                     break
                                 else:
                                     pass
                             while(True):
                                 line = ground_truth.readline()
                                 stripped = line.strip()
                                 if line == '':
                                     break
                                 else:
                                     stripped = line.strip()
                                     language = stripped.replace(':','')
                                     root_language = root_code_by_language + '/' + language
                                     try:
                                         os.mkdir(root_language)
                                     except FileExistsError:
                                         pass
                                     while(True):
                                         line = ground_truth.readline()
                                         if line == '\n':
                                             break
                                         else:
                                             f = line.strip()
                                             counts[language] = counts.get(language, 0) + 1
                                             start = (counts[language] - 1) // 1000 * 1000 + 1
                                             end = start + 999
                                             root_count = root_language + '/' + str(start) + '-' + str(end)
                                             if counts[language] % 1000 == 1:
                                                 try:
                                                     os.mkdir(root_count)
                                                 except FileExistsError:
                                                     pass
                                             (_,ext) = os.path.splitext(f)
                                             new_name = str(counts[language]) + ext
                                             copy_src = root + '/' + r + '/' + d + '/' + f
                                             copy_des = root_count + '/' + new_name
-                                            os.symlink(copy_src, copy_des)
-                                            print('{} successfully copied.'.format(copy_src))
+                                            try:
+                                                os.symlink(copy_src, copy_des)
+                                                print('{} successfully copied.'.format(copy_src))
+                                            except FileExistsError:
+                                                pass
                         finally:
                             ground_truth.close()
 
 
 if __name__ == '__main__':
     if len(sys.argv) != 2:
         print('Only argument acceptable is a path.')
     else:
         main(sys.argv[1])
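
Note: the bucketing above stores the n-th file of a language under a folder named '<start>-<end>', 1000 files per folder. A minimal sketch of that arithmetic; the helper name bucket_range is illustrative, not part of the patch:

def bucket_range(count, size=1000):
    # 1-based file count -> (start, end) of the folder that holds it
    start = (count - 1) // size * size + 1
    return start, start + size - 1

assert bucket_range(1) == (1, 1000)
assert bucket_range(1000) == (1, 1000)
assert bucket_range(1001) == (1001, 2000)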
diff --git a/swh/langdetect/__init__.py b/swh/langdetect/__init__.py
index db2d42b..5f8841b 100644
--- a/swh/langdetect/__init__.py
+++ b/swh/langdetect/__init__.py
@@ -1,4 +1,5 @@
"""
Detectlang detects the programming language of source code file.
"""
+
diff --git a/swh/langdetect/ngram.py b/swh/langdetect/ngram.py
new file mode 100644
index 0000000..9db7303
--- /dev/null
+++ b/swh/langdetect/ngram.py
@@ -0,0 +1,63 @@
+"""
+Calculate frequencies for classes
+"""
+
+import os,sys,nltk
+
+from collections import Counter
+from nltk.util import ngrams
+from utils.common import tokenizer, file_to_string
+from utils.training import build_training_set
+
+class NGramTrain:
+
+ def __init__(self, root):
+ self._ngram_list = {}
+ self._prob = {}
+ self._root = root
+
+ def train(self):
+ root_training_set = self._root + '/../training_set'
+ root_model = '../../dataset/model'
+
+ try:
+ if len(os.listdir(root_training_set)) == 0:
+ build_training_set(self._root)
+ except FileNotFoundError:
+ os.mkdir(root_training_set)
+ build_training_set(self._root)
+
+ for language in os.listdir(root_training_set):
+ if not language.startswith('.'):
+ _ngram_list[language] = {}
+ _prob[language] = None
+ root_training_set_language = root_training_set + '/' + language
+ for f in os.listdir(root_training_set_language):
+ if not f.startswith('.'):
+ _ngrams_max(language, f)
+
+ def _ngrams_max(self, language, filename, n=3):
+ ngram_lan = self._ngram_list[language]
+ tokens = tokenizer(file_to_string(filename))
+ for i in range(1, n):
+ ngram_lan[i + 1] = ngram_lan.get(i + 1, []) + \
+ list(ngram(tokens, i + 1,
+ pad_left = True,
+ pad_right = True,
+ left_pad_symbol = '$BOS$',
+ right_pad_symbol = '$EOS$'))
+
+ def _kneser_key_prob(self, language):
+ c = Counter()
+ ngram_lan = self._ngram_list[language]
+ for key in self._ngram_lan.keys():
+ c = c + Counter(self.ngram_list[key])
+ freq_dist = nltk.FreqDist(c)
+ self._prob[language] = nltk.KneserNeyProbDist(freq_dist)
+
+if __name__ == '__main__':
+ if len(sys.argv) != 2:
+ print('Only argument acceptable is a path.')
+ else:
+ model = NGramTrain(sys.argv[1])
+ model.train()
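
Note: nltk's KneserNeyProbDist is documented for FreqDists of trigrams, so the mixed 2-gram/3-gram lists that _ngrams_max accumulates would likely need to be fed to it one order at a time. A minimal sketch of building and querying such a distribution on trigrams alone, with an illustrative token list:

import nltk
from nltk.util import ngrams

tokens = ['def', 'main', '(', 'root', ')', ':']
freq_dist = nltk.FreqDist(ngrams(tokens, 3))
kn = nltk.KneserNeyProbDist(freq_dist)
for sample in kn.samples():
    print(sample, kn.prob(sample))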
diff --git a/swh/langdetect/utils.py b/swh/langdetect/utils.py
deleted file mode 100644
index 70d84c1..0000000
--- a/swh/langdetect/utils.py
+++ /dev/null
@@ -1,36 +0,0 @@
-"""
-Here regroup basic preprocessing methods
-used in learning stage for different
-approaches.
-
-"""
-
-import re
-
-_re_string = re.compile(r"""("(\\.|[^"\\])*"|'(\\.|[^'\\])*')""")
-_re_number = re.compile(r'([\d]+)|([\d]+.[\d]+)[^A-Za-z]')
-
-def replace_string_and_number(text):
- """ Replace strings and numbers in a file by special tokens
- """
- str_replaced = re.sub(_re_string, '__str__', text)
- str_num_replaced = re.sub(_re_number, '__num__', str_replaced)
- return str_num_replaced
-
-def remove_comment(text):
- # TODO: remove only inline comments and block comments
- # TODO: maybe build a list of comment markers
- pass
-
-def purify(text, lang):
- # TODO: for some language like HTML, remove code other than principal language
- pass
-
-
-
-
-
-
-
-
-
diff --git a/swh/langdetect/utils/common.py b/swh/langdetect/utils/common.py
new file mode 100644
index 0000000..37648f9
--- /dev/null
+++ b/swh/langdetect/utils/common.py
@@ -0,0 +1,75 @@
+"""
+Here regroup basic preprocessing methods
+used in learning stage for different
+approaches.
+
+"""
+
+import re, os
+
+_re_string = re.compile(r"""("(\\.|[^"\\])*"|'(\\.|[^'\\])*')""")
+_re_number = re.compile(r'([\d]+)|([\d]+.[\d]+)[^A-Za-z]')
+_re_separator = re.compile(r'(\W)')
+_not_start_with_point = lambda x: not x.startswith('.')
+
+def tokenizer(text):
+ ''' Splits text into tokens '''
+ return [word for word in _re_separator.split(text) if word.strip(' \t')]
+
+def file_to_string(filename):
+ """ Read a file to a string. """
+ with open(filename, 'r') as f:
+ data = f.read().replace('\n',' ').lower
+ return data
+
+def count_files(root_language):
+ all_folders = natural_sort(filter
+ (_not_start_with_point,
+ os.listdir(root_language)))
+ files = natural_sort(filter
+ (_not_start_with_point,
+ os.listdir(root_language + '/' + all_folders[-1])))
+ (max,_) = os.path.splitext(files[-1])
+ return int(max)
+
+def find_file(root_language, n):
+ '''Find the n-th file in language folder'''
+ if n > count_files(root_language):
+ return ''
+ else:
+ start = (n - 1) // 1000 * 1000 + 1
+ end = start + 999
+ root_count = root_language + '/' + str(start) + '-' + str(end)
+ files = natural_sort(filter
+ (_not_start_with_point,
+ os.listdir(root_count)))
+ return root_count + '/' + files[n - start]
+
+def replace_string_and_number(text):
+ """ Replace strings and numbers in a file by special tokens
+ """
+ str_replaced = re.sub(_re_string, '__str__', text)
+ str_num_replaced = re.sub(_re_number, '__num__', str_replaced)
+ return str_num_replaced
+
+def natural_sort(l):
+ convert = lambda text: int(text) if text.isdigit() else text.lower()
+ alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ]
+ return sorted(l, key = alphanum_key)
+
+def remove_comment(text):
+ # TODO: remove only inline comments and block comments
+ # TODO: maybe build a list of comment markers
+ pass
+
+def purify(text, lang):
+ # TODO: for some language like HTML, remove code other than principal language
+ pass
+
+
+
+
+
+
+
+
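
Note: a short usage sketch of how the helpers above compose; the expected values assume the corrected _re_number pattern:

from swh.langdetect.utils.common import replace_string_and_number, tokenizer

snippet = 'print("hello") # 42'
masked = replace_string_and_number(snippet)
# masked == 'print(__str__) # __num__'
print(tokenizer(masked))
# ['print', '(', '__str__', ')', '#', '__num__']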
diff --git a/swh/langdetect/utils/training.py b/swh/langdetect/utils/training.py
new file mode 100644
index 0000000..07872ab
--- /dev/null
+++ b/swh/langdetect/utils/training.py
@@ -0,0 +1,35 @@
+import os, random
+
+from utils.common import count_files, find_file
+
+def build_training_set(root):
+    root_code = root + '/../code_by_language'
+    root_training = root + '/../training_set'
+    for language in os.listdir(root_code):
+        if not language.startswith('.'):
+            root_language = root_code + '/' + language
+            root_training_language = root_training + '/' + language
+            build_language_training_set(count_files(root_language),
+                                        root_language,
+                                        root_training_language)
+
+def build_language_training_set(total, root_language, root_training_language):
+    # limit defines the size of the training set
+    # upper defines the maximum size
+
+    try:
+        os.mkdir(root_training_language)
+    except FileExistsError:
+        pass
+
+    upper = 4000
+    if total >= upper:
+        limit = upper // 2
+    else:
+        limit = total // 2
+    indices = random.sample(range(1, total + 1), limit)
+    files = map(lambda x: find_file(root_language, x), indices)
+    for src in files:
+        basename = os.path.basename(src)
+        des = root_training_language + '/' + basename
+        os.symlink(src, des)
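
Note: the sampling rule above trains on half of a language's files, capped at upper // 2 = 2000 once a language has at least 4000 files. A quick check of that rule:

for total in (10, 3999, 4000, 10000):
    limit = 4000 // 2 if total >= 4000 else total // 2
    print(total, '->', limit)
# prints: 10 -> 5, 3999 -> 1999, 4000 -> 2000, 10000 -> 2000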
