diff --git a/swh/indexer/language.py b/swh/indexer/language.py
index a93f8c3..0aa633f 100644
--- a/swh/indexer/language.py
+++ b/swh/indexer/language.py
@@ -1,153 +1,199 @@
 # Copyright (C) 2016-2017 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import io

 from pygments.lexers import guess_lexer
 from pygments.util import ClassNotFound
 from chardet.universaldetector import UniversalDetector

 from .indexer import BaseIndexer


 def _cleanup_classname(classname):
     """Determine the language from the pygments' lexer names.

     """
     return classname.lower().replace(' ', '-')


 def _read_raw(raw_content, size=2048):
-    """Read raw content in chunk.
+    """Read raw content in chunks.

     """
     bs = io.BytesIO(raw_content)
     while True:
         chunk = bs.read(size)
         if not chunk:
             break
         yield chunk


 def _detect_encoding(raw_content):
     """Given a raw content, try and detect its encoding.

     """
     detector = UniversalDetector()
     for chunk in _read_raw(raw_content):
         detector.feed(chunk)
         if detector.done:
             break
     detector.close()
     return detector.result['encoding']


-def compute_language(raw_content, log=None):
+def compute_language_from_chunk(encoding, length, raw_content, max_size,
+                                log=None):
     """Determine the raw content's language.

     Args:
-        raw_content (bytes): content to determine raw content
+        encoding (str): encoding to use to decode the content
+        length (int): raw_content's length
+        raw_content (bytes): raw content to work with
+        max_size (int): max size to split the raw content at
+
+    Returns:
+        Dict with keys:
+        - lang: the detected language, or None if nothing was found
+
+    """
+    try:
+        if max_size <= length:
+            raw_content = raw_content[0:max_size]
+
+        content = raw_content.decode(encoding)
+        lang = _cleanup_classname(
+            guess_lexer(content).name)
+    except ClassNotFound:
+        lang = None
+    except UnicodeDecodeError:
+        # propagate so the caller can retry with a different split boundary
+        raise
+    except Exception:
+        if log:
+            log.exception('Problem during language detection, skipping')
+        lang = None
+    return {
+        'lang': lang
+    }
+
+
+def compute_language(raw_content, encoding=None, log=None):
+    """Determine the raw content's language.
+
+    Args:
+        raw_content (bytes): raw content to work with

     Returns:
         Dict with keys:
         - lang: None if nothing found or the possible language
-        - decoding_failure: True if a decoding failure happened

     """
     try:
-        encoding = _detect_encoding(raw_content)
+        if encoding is None:
+            encoding = _detect_encoding(raw_content)
         content = raw_content.decode(encoding)
         lang = _cleanup_classname(
             guess_lexer(content).name)
     except ClassNotFound:
         lang = None
     except Exception:
         if log:
             log.exception('Problem during language detection, skipping')
         lang = None
     return {
         'lang': lang
     }


 class ContentLanguageIndexer(BaseIndexer):
     """Indexer in charge of:
     - filtering out content already indexed
     - reading content from objstorage per the content's id (sha1)
-    - computing {mimetype, encoding} from that content
-    - store result in storage
+    - computing the language from that content
+    - storing the result in storage

     """
     CONFIG_BASE_FILENAME = 'indexer/language'

     ADDITIONAL_CONFIG = {
         'tools': ('dict', {
             'name': 'pygments',
             'version': '2.0.1+dfsg-1.1+deb8u1',
             'configuration': {
                 'type': 'library',
                 'debian-package': 'python3-pygments',
                 'max_content_size': 10240,
             },
         }),
     }

     def __init__(self):
         super().__init__()
         c = self.config
         self.max_content_size = c['tools']['configuration'][
             'max_content_size']

     def filter_contents(self, sha1s):
         """Filter out known sha1s and return only missing ones.
""" tools = self.retrieve_tools_information() yield from self.storage.content_language_missing(( { 'id': sha1, 'indexer_configuration_id': tools['id'], } for sha1 in sha1s )) def index_content(self, sha1, raw_content): """Index sha1s' content and store result. Args: sha1 (bytes): content's identifier raw_content (bytes): raw content in bytes Returns: A dict, representing a content_mimetype, with keys: - id (bytes): content's identifier (sha1) - lang (bytes): detected language """ + encoding = _detect_encoding(raw_content) + l = len(raw_content) - if self.max_content_size <= l: - raw_content = raw_content[0:self.max_content_size] - - result = compute_language(raw_content, log=self.log) - result.update({ - 'id': sha1, - 'indexer_configuration_id': self.tools['id'], - }) + for i in range(0, 4): # we could split at the wrong index, + # thus raising a UnicodeDecodeError + max_size = self.max_content_size + i + + try: + result = compute_language_from_chunk( + encoding, l, raw_content, max_size, log=self.log) + except UnicodeDecodeError: + self.log.warn('Decoding failed on wrong byte chunk at [0-%s]' + ', trying again at next ending byte.' % max_size) + continue + + # we found something, so we return it + result.update({ + 'id': sha1, + 'indexer_configuration_id': self.tools['id'], + }) + break return result def persist_index_computations(self, results, policy_update): """Persist the results in storage. Args: results ([dict]): list of content_mimetype, dict with the following keys: - id (bytes): content's identifier (sha1) - lang (bytes): detected language policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ self.storage.content_language_add( results, conflict_update=(policy_update == 'update-dups'))