
diff --git a/scripts/dataset/ground_truth_arrange.py b/scripts/dataset/ground_truth_arrange.py
index 47e2dd3..65b09f0 100644
--- a/scripts/dataset/ground_truth_arrange.py
+++ b/scripts/dataset/ground_truth_arrange.py
@@ -1,107 +1,113 @@
 # Copyright (C) 2015-2016 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 #!/usr/bin/env python3
 # coding: utf-8
 
 import os
 import sys
 import io
 import json
 
 
 def main(root):
     root_ground_truth = root + '/../ground_truth'
     root_ground_truth_text = root_ground_truth + '_text'
     root_code_by_language = root + '/../code_by_language'
     counts = dict()
     try:
         os.mkdir(root_code_by_language)
     except FileExistsError:
         pass
     for r in os.listdir(root):
         if not r.startswith('.'):
             for d in os.listdir(root + '/' + r):
                 if not d.startswith('.'):
                     try:
                         ground_truth = io.open(root_ground_truth + '/' + r + '/' + d + '.json')
                         try:
                             j = json.load(ground_truth)
                             for language in j.keys():
                                 root_language = root_code_by_language + '/' + language
                                 try:
                                     os.mkdir(root_language)
                                 except FileExistsError:
                                     pass
                                 for f in j.get(language):
                                     counts[language] = counts.get(language, 0) + 1
                                     start = (counts[language] - 1) // 1000 * 1000 + 1
                                     end = start + 999
                                     root_count = root_language + '/' + str(start) + '-' + str(end)
                                     if counts[language] % 1000 == 1:
                                         try:
                                             os.mkdir(root_count)
                                         except FileExistsError:
                                             pass
                                     (_,ext) = os.path.splitext(f)
                                     new_name = str(counts[language]) + ext
                                     copy_src = root + '/' + r + '/' + d + '/' + f
                                     copy_des = root_count + '/' + new_name
-                                    os.symlink(copy_src, copy_des)
-                                    print(copy_src + ' successfully copied.')
+                                    try:
+                                        os.symlink(copy_src, copy_des)
+                                        print('{} successfully copied.'.format(copy_src))
+                                    except FileExistsError:
+                                        pass
                         except json.decoder.JSONDecodeError:
                             ground_truth.close()
                             ground_truth = io.open(root_ground_truth_text + '/' + r + '/' + d, 'r')
                             while(True):
                                 line = ground_truth.readline()
                                 if line == '\n' or line == '':
                                     break
                                 else:
                                     pass
                             while(True):
                                 line = ground_truth.readline()
                                 stripped = line.strip()
                                 if line == '':
                                     break
                                 else:
                                     stripped = line.strip()
                                     language = stripped.replace(':','')
                                     root_language = root_code_by_language + '/' + language
                                     try:
                                         os.mkdir(root_language)
                                     except FileExistsError:
                                         pass
                                     while(True):
                                         line = ground_truth.readline()
                                         if line == '\n':
                                             break
                                         else:
                                             f = line.strip()
                                             counts[language] = counts.get(language, 0) + 1
                                             start = (counts[language] - 1) // 1000 * 1000 + 1
                                             end = start + 999
                                             root_count = root_language + '/' + str(start) + '-' + str(end)
                                             if counts[language] % 1000 == 1:
                                                 try:
                                                     os.mkdir(root_count)
                                                 except FileExistsError:
                                                     pass
                                             (_,ext) = os.path.splitext(f)
                                             new_name = str(counts[language]) + ext
                                             copy_src = root + '/' + r + '/' + d + '/' + f
                                             copy_des = root_count + '/' + new_name
-                                            os.symlink(copy_src, copy_des)
-                                            print('{} successfully copied.'.format(copy_src))
+                                            try:
+                                                os.symlink(copy_src, copy_des)
+                                                print('{} successfully copied.'.format(copy_src))
+                                            except FileExistsError:
+                                                pass
                         finally:
                             ground_truth.close()
 
 
 if __name__ == '__main__':
     if len(sys.argv) != 2:
         print('Only argument acceptable is a path.')
     else:
         main(sys.argv[1])
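
Note: the bucketing above stores the n-th file of a language under a folder named '<start>-<end>', 1000 files per folder. A minimal sketch of that arithmetic; the helper name bucket_range is illustrative, not part of the patch:

def bucket_range(count, size=1000):
    # 1-based file count -> (start, end) of the folder that holds it
    start = (count - 1) // size * size + 1
    return start, start + size - 1

assert bucket_range(1) == (1, 1000)
assert bucket_range(1000) == (1, 1000)
assert bucket_range(1001) == (1001, 2000)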
diff --git a/swh/langdetect/__init__.py b/swh/langdetect/__init__.py
index db2d42b..5f8841b 100644
--- a/swh/langdetect/__init__.py
+++ b/swh/langdetect/__init__.py
@@ -1,4 +1,5 @@
"""
Detectlang detects the programming language of source code file.
"""
+
diff --git a/swh/langdetect/ngram.py b/swh/langdetect/ngram.py
new file mode 100644
index 0000000..9db7303
--- /dev/null
+++ b/swh/langdetect/ngram.py
@@ -0,0 +1,63 @@
+"""
+Calculate frequencies for classes
+"""
+
+import os,sys,nltk
+
+from collections import Counter
+from nltk.util import ngrams
+from utils.common import tokenizer, file_to_string
+from utils.training import build_training_set
+
+class NGramTrain:
+
+ def __init__(self, root):
+ self._ngram_list = {}
+ self._prob = {}
+ self._root = root
+
+ def train(self):
+ root_training_set = self._root + '/../training_set'
+ root_model = '../../dataset/model'
+
+ try:
+ if len(os.listdir(root_training_set)) == 0:
+ build_training_set(self._root)
+ except FileNotFoundError:
+ os.mkdir(root_training_set)
+ build_training_set(self._root)
+
+ for language in os.listdir(root_training_set):
+ if not language.startswith('.'):
+ _ngram_list[language] = {}
+ _prob[language] = None
+ root_training_set_language = root_training_set + '/' + language
+ for f in os.listdir(root_training_set_language):
+ if not f.startswith('.'):
+ _ngrams_max(language, f)
+
+ def _ngrams_max(self, language, filename, n=3):
+ ngram_lan = self._ngram_list[language]
+ tokens = tokenizer(file_to_string(filename))
+ for i in range(1, n):
+ ngram_lan[i + 1] = ngram_lan.get(i + 1, []) + \
+ list(ngram(tokens, i + 1,
+ pad_left = True,
+ pad_right = True,
+ left_pad_symbol = '$BOS$',
+ right_pad_symbol = '$EOS$'))
+
+ def _kneser_key_prob(self, language):
+ c = Counter()
+ ngram_lan = self._ngram_list[language]
+ for key in self._ngram_lan.keys():
+ c = c + Counter(self.ngram_list[key])
+ freq_dist = nltk.FreqDist(c)
+ self._prob[language] = nltk.KneserNeyProbDist(freq_dist)
+
+if __name__ == '__main__':
+ if len(sys.argv) != 2:
+ print('Only argument acceptable is a path.')
+ else:
+ model = NGramTrain(sys.argv[1])
+ model.train()
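
Note: nltk's KneserNeyProbDist is documented for FreqDists of trigrams, so the mixed 2-gram/3-gram lists that _ngrams_max accumulates would likely need to be fed to it one order at a time. A minimal sketch of building and querying such a distribution on trigrams alone, with an illustrative token list:

import nltk
from nltk.util import ngrams

tokens = ['def', 'main', '(', 'root', ')', ':']
freq_dist = nltk.FreqDist(ngrams(tokens, 3))
kn = nltk.KneserNeyProbDist(freq_dist)
for sample in kn.samples():
    print(sample, kn.prob(sample))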
diff --git a/swh/langdetect/utils.py b/swh/langdetect/utils.py
deleted file mode 100644
index 70d84c1..0000000
--- a/swh/langdetect/utils.py
+++ /dev/null
@@ -1,36 +0,0 @@
-"""
-Here regroup basic preprocessing methods
-used in learning stage for different
-approaches.
-
-"""
-
-import re
-
-_re_string = re.compile(r"""("(\\.|[^"\\])*"|'(\\.|[^'\\])*')""")
-_re_number = re.compile(r'([\d]+)|([\d]+.[\d]+)[^A-Za-z]')
-
-def replace_string_and_number(text):
- """ Replace strings and numbers in a file by special tokens
- """
- str_replaced = re.sub(_re_string, '__str__', text)
- str_num_replaced = re.sub(_re_number, '__num__', str_replaced)
- return str_num_replaced
-
-def remove_comment(text):
- # TODO: remove only inline comments and block comments
- # TODO: maybe build a list of comment markers
- pass
-
-def purify(text, lang):
- # TODO: for some language like HTML, remove code other than principal language
- pass
-
-
-
-
-
-
-
-
-
diff --git a/swh/langdetect/utils/common.py b/swh/langdetect/utils/common.py
new file mode 100644
index 0000000..37648f9
--- /dev/null
+++ b/swh/langdetect/utils/common.py
@@ -0,0 +1,75 @@
+"""
+Here regroup basic preprocessing methods
+used in learning stage for different
+approaches.
+
+"""
+
+import re, os
+
+_re_string = re.compile(r"""("(\\.|[^"\\])*"|'(\\.|[^'\\])*')""")
+_re_number = re.compile(r'([\d]+)|([\d]+.[\d]+)[^A-Za-z]')
+_re_separator = re.compile(r'(\W)')
+_not_start_with_point = lambda x: not x.startswith('.')
+
+def tokenizer(text):
+ ''' Splits text into tokens '''
+ return [word for word in _re_separator.split(text) if word.strip(' \t')]
+
+def file_to_string(filename):
+ """ Read a file to a string. """
+ with open(filename, 'r') as f:
+ data = f.read().replace('\n',' ').lower
+ return data
+
+def count_files(root_language):
+ all_folders = natural_sort(filter
+ (_not_start_with_point,
+ os.listdir(root_language)))
+ files = natural_sort(filter
+ (_not_start_with_point,
+ os.listdir(root_language + '/' + all_folders[-1])))
+ (max,_) = os.path.splitext(files[-1])
+ return int(max)
+
+def find_file(root_language, n):
+ '''Find the n-th file in language folder'''
+ if n > count_files(root_language):
+ return ''
+ else:
+ start = (n - 1) // 1000 * 1000 + 1
+ end = start + 999
+ root_count = root_language + '/' + str(start) + '-' + str(end)
+ files = natural_sort(filter
+ (_not_start_with_point,
+ os.listdir(root_count)))
+ return root_count + '/' + files[n - start]
+
+def replace_string_and_number(text):
+ """ Replace strings and numbers in a file by special tokens
+ """
+ str_replaced = re.sub(_re_string, '__str__', text)
+ str_num_replaced = re.sub(_re_number, '__num__', str_replaced)
+ return str_num_replaced
+
+def natural_sort(l):
+ convert = lambda text: int(text) if text.isdigit() else text.lower()
+ alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ]
+ return sorted(l, key = alphanum_key)
+
+def remove_comment(text):
+ # TODO: remove only inline comments and block comments
+ # TODO: maybe build a list of comment markers
+ pass
+
+def purify(text, lang):
+ # TODO: for some language like HTML, remove code other than principal language
+ pass
+
+
+
+
+
+
+
+
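
Note: a short usage sketch of how the helpers above compose; the expected values assume the corrected _re_number pattern:

from swh.langdetect.utils.common import replace_string_and_number, tokenizer

snippet = 'print("hello") # 42'
masked = replace_string_and_number(snippet)
# masked == 'print(__str__) # __num__'
print(tokenizer(masked))
# ['print', '(', '__str__', ')', '#', '__num__']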
diff --git a/swh/langdetect/utils/training.py b/swh/langdetect/utils/training.py
new file mode 100644
index 0000000..07872ab
--- /dev/null
+++ b/swh/langdetect/utils/training.py
@@ -0,0 +1,35 @@
+import os, random
+
+from utils.common import count_files, find_file
+
+def build_training_set(root):
+    root_code = root + '/../code_by_language'
+    root_training = root + '/../training_set'
+    for language in os.listdir(root_code):
+        if not language.startswith('.'):
+            root_language = root_code + '/' + language
+            root_training_language = root_training + '/' + language
+            build_language_training_set(count_files(root_language),
+                                        root_language,
+                                        root_training_language)
+
+def build_language_training_set(total, root_language, root_training_language):
+    # limit defines the size of the training set
+    # upper defines the maximum size
+
+    try:
+        os.mkdir(root_training_language)
+    except FileExistsError:
+        pass
+
+    upper = 4000
+    if total >= upper:
+        limit = upper // 2
+    else:
+        limit = total // 2
+    indices = random.sample(range(1, total + 1), limit)
+    files = map(lambda x: find_file(root_language, x), indices)
+    for src in files:
+        basename = os.path.basename(src)
+        des = root_training_language + '/' + basename
+        os.symlink(src, des)
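
Note: the sampling rule above trains on half of a language's files, capped at upper // 2 = 2000 once a language has at least 4000 files. A quick check of that rule:

for total in (10, 3999, 4000, 10000):
    limit = 4000 // 2 if total >= 4000 else total // 2
    print(total, '->', limit)
# prints: 10 -> 5, 3999 -> 1999, 4000 -> 2000, 10000 -> 2000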
