
diff --git a/swh/indexer/language.py b/swh/indexer/language.py
index a93f8c3..0aa633f 100644
--- a/swh/indexer/language.py
+++ b/swh/indexer/language.py
@@ -1,153 +1,199 @@
 # Copyright (C) 2016-2017  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import io

 from pygments.lexers import guess_lexer
 from pygments.util import ClassNotFound
 from chardet.universaldetector import UniversalDetector

 from .indexer import BaseIndexer


 def _cleanup_classname(classname):
     """Normalize a pygments lexer name into a language identifier
     (lowercase, spaces replaced with hyphens).

     """
     return classname.lower().replace(' ', '-')


 def _read_raw(raw_content, size=2048):
     """Read raw content in chunks of ``size`` bytes.

     """
     bs = io.BytesIO(raw_content)
     while True:
         chunk = bs.read(size)
         if not chunk:
             break
         yield chunk


 def _detect_encoding(raw_content):
     """Given a raw content, try to detect its encoding.

     """
     detector = UniversalDetector()
     for chunk in _read_raw(raw_content):
         detector.feed(chunk)
         if detector.done:
             break
     detector.close()
     return detector.result['encoding']


-def compute_language(raw_content, log=None):
+def compute_language_from_chunk(encoding, length, raw_content, max_size,
+                                log=None):
     """Determine the raw content's language.

     Args:
-        raw_content (bytes): content to determine raw content
+        encoding (str): encoding to use to decode the content
+        length (int): raw_content's length
+        raw_content (bytes): raw content to work with
+        max_size (int): maximum number of bytes of raw_content to decode
+
+    Returns:
+        Dict with keys:
+        - lang: the detected language, or None if none was found
+
+    """
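+    # keep at most max_size bytes of the content; if the cut lands inside
+    # a multi-byte character, the decode below raises UnicodeDecodeError
+    # and the caller retries with a slightly larger window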
+    try:
+        if max_size <= length:
+            raw_content = raw_content[0:max_size]
+
+        content = raw_content.decode(encoding)
+        lang = _cleanup_classname(
+            guess_lexer(content).name)
+    except ClassNotFound:
+        lang = None
+    except UnicodeDecodeError:
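+        # propagate so the caller can retry with a wider byte window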
+        raise
+    except Exception:
+        if log:
+            log.exception('Problem during language detection, skipping')
+        lang = None
+    return {
+        'lang': lang
+    }
+
+
+def compute_language(raw_content, encoding=None, log=None):
+    """Determine the raw content's language.
+
+    Args:
+        raw_content (bytes): raw content to work with
+        encoding (str): optional known encoding; detected when not provided

     Returns:
         Dict with keys:
         - lang: the detected language, or None if none was found
-        - decoding_failure: True if a decoding failure happened

     """
     try:
-        encoding = _detect_encoding(raw_content)
+        if encoding is None:
+            encoding = _detect_encoding(raw_content)
         content = raw_content.decode(encoding)
         lang = _cleanup_classname(
             guess_lexer(content).name)
     except ClassNotFound:
         lang = None
     except Exception:
         if log:
             log.exception('Problem during language detection, skipping')
         lang = None
     return {
         'lang': lang
     }


 class ContentLanguageIndexer(BaseIndexer):
     """Indexer in charge of:

     - filtering out content already indexed
     - reading content from objstorage per the content's id (sha1)
     - computing the language from that content
     - storing the result in storage

     """
     CONFIG_BASE_FILENAME = 'indexer/language'

     ADDITIONAL_CONFIG = {
         'tools': ('dict', {
             'name': 'pygments',
             'version': '2.0.1+dfsg-1.1+deb8u1',
             'configuration': {
                 'type': 'library',
                 'debian-package': 'python3-pygments',
                 'max_content_size': 10240,
             },
         }),
     }

     def __init__(self):
         super().__init__()
         c = self.config
         self.max_content_size = c['tools']['configuration']['max_content_size']

     def filter_contents(self, sha1s):
         """Filter out known sha1s and return only missing ones.

         """
         tools = self.retrieve_tools_information()
         yield from self.storage.content_language_missing((
             {
                 'id': sha1,
                 'indexer_configuration_id': tools['id'],
             } for sha1 in sha1s
         ))

     def index_content(self, sha1, raw_content):
         """Index sha1's content and store the result.

         Args:
             sha1 (bytes): content's identifier
             raw_content (bytes): raw content in bytes

         Returns:
             A dict, representing a content_language, with keys:
             - id (bytes): content's identifier (sha1)
             - lang (bytes): detected language

         """
+        encoding = _detect_encoding(raw_content)
+
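+        # the encoding is detected once on the full content; only the
+        # size of the decode window changes across the retries below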
         l = len(raw_content)
-        if self.max_content_size <= l:
-            raw_content = raw_content[0:self.max_content_size]
-
-        result = compute_language(raw_content, log=self.log)
-        result.update({
-            'id': sha1,
-            'indexer_configuration_id': self.tools['id'],
-        })
+        result = {
+            'id': sha1,
+            'indexer_configuration_id': self.tools['id'],
+            'lang': None,
+        }
+        # Cutting at max_content_size may split a multi-byte character
+        # (a UTF-8 code point spans at most 4 bytes), which would raise
+        # a UnicodeDecodeError, so widen the window by up to 3 bytes
+        # until the decode succeeds.
+        for i in range(0, 4):
+            max_size = self.max_content_size + i
+
+            try:
+                result.update(compute_language_from_chunk(
+                    encoding, l, raw_content, max_size, log=self.log))
+            except UnicodeDecodeError:
+                self.log.warning(
+                    'Decoding failed on wrong byte chunk at [0-%s]'
+                    ', trying again at next ending byte.' % max_size)
+                continue
+
+            # decoding succeeded; keep this detection result
+            break
+
         return result

     def persist_index_computations(self, results, policy_update):
         """Persist the results in storage.

         Args:
             results ([dict]): list of content_language dicts with the
                 following keys:
                 - id (bytes): content's identifier (sha1)
                 - lang (bytes): detected language
             policy_update ([str]): either 'update-dups' or 'ignore-dups' to
                 respectively update duplicates or ignore them

         """
         self.storage.content_language_add(
             results, conflict_update=(policy_update == 'update-dups'))
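
To make the retry logic easier to review in isolation, here is a minimal,
self-contained sketch of the same idea outside the indexer. The function
name detect_language_in_window and the sample input are illustrative only
(they are not part of this diff), and the sketch assumes an encoding such
as UTF-8 where a code point spans at most 4 bytes.

from pygments.lexers import guess_lexer
from pygments.util import ClassNotFound


def detect_language_in_window(raw, max_size, encoding='utf-8'):
    """Decode at most max_size bytes of raw, then guess the language.

    A UTF-8 code point is 1 to 4 bytes long, so a cut at max_size may land
    inside a character; widening the window by up to 3 bytes is enough to
    reach the next character boundary.
    """
    for extra in range(4):  # windows of max_size, max_size+1, ..., +3
        try:
            text = raw[:max_size + extra].decode(encoding)
        except UnicodeDecodeError:
            continue  # cut a character in half; widen the window and retry
        try:
            # same normalization as _cleanup_classname above
            return guess_lexer(text).name.lower().replace(' ', '-')
        except ClassNotFound:
            return None
    return None  # not decodable even after widening; give up


sample = 'def f():\n    return "héllo"\n'.encode('utf-8')
# deliberately cut inside the two-byte 'é': the first decode fails and
# the loop widens the window by one byte
print(detect_language_in_window(sample, sample.index(b'\xc3') + 1))

As in the diff, only UnicodeDecodeError triggers a retry; a ClassNotFound
from pygments means the decode worked but no lexer matched, so widening
the window would not help.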
