
diff --git a/swh/indexer/language.py b/swh/indexer/language.py
index a93f8c3..0aa633f 100644
--- a/swh/indexer/language.py
+++ b/swh/indexer/language.py
@@ -1,153 +1,199 @@
 # Copyright (C) 2016-2017  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import io

 from pygments.lexers import guess_lexer
 from pygments.util import ClassNotFound
 from chardet.universaldetector import UniversalDetector

 from .indexer import BaseIndexer


 def _cleanup_classname(classname):
     """Normalize a pygments lexer name into a language identifier
     (lowercase, spaces replaced with hyphens).

     """
     return classname.lower().replace(' ', '-')


 def _read_raw(raw_content, size=2048):
     """Read raw content in chunks of ``size`` bytes.

     """
     bs = io.BytesIO(raw_content)
     while True:
         chunk = bs.read(size)
         if not chunk:
             break
         yield chunk


 def _detect_encoding(raw_content):
     """Given a raw content, try to detect its encoding.

     """
     detector = UniversalDetector()
     for chunk in _read_raw(raw_content):
         detector.feed(chunk)
         if detector.done:
             break
     detector.close()
     return detector.result['encoding']


-def compute_language(raw_content, log=None):
+def compute_language_from_chunk(encoding, length, raw_content, max_size,
+                                log=None):
     """Determine the raw content's language.

     Args:
-        raw_content (bytes): content to determine raw content
+        encoding (str): encoding to use to decode the content
+        length (int): raw_content's length
+        raw_content (bytes): raw content to work with
+        max_size (int): maximum number of bytes of raw_content to decode
+
+    Returns:
+        Dict with keys:
+        - lang: the detected language, or None if none was found
+
+    """
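+    # keep at most max_size bytes of the content; if the cut lands inside
+    # a multi-byte character, the decode below raises UnicodeDecodeError
+    # and the caller retries with a slightly larger window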
+    try:
+        if max_size <= length:
+            raw_content = raw_content[0:max_size]
+
+        content = raw_content.decode(encoding)
+        lang = _cleanup_classname(
+            guess_lexer(content).name)
+    except ClassNotFound:
+        lang = None
+    except UnicodeDecodeError:
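+        # propagate so the caller can retry with a wider byte window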
+        raise
+    except Exception:
+        if log:
+            log.exception('Problem during language detection, skipping')
+        lang = None
+    return {
+        'lang': lang
+    }
+
+
+def compute_language(raw_content, encoding=None, log=None):
+    """Determine the raw content's language.
+
+    Args:
+        raw_content (bytes): raw content to work with
+        encoding (str): optional known encoding; detected when not provided

     Returns:
         Dict with keys:
         - lang: the detected language, or None if none was found
-        - decoding_failure: True if a decoding failure happened

     """
     try:
-        encoding = _detect_encoding(raw_content)
+        if encoding is None:
+            encoding = _detect_encoding(raw_content)
         content = raw_content.decode(encoding)
         lang = _cleanup_classname(
             guess_lexer(content).name)
     except ClassNotFound:
         lang = None
     except Exception:
         if log:
             log.exception('Problem during language detection, skipping')
         lang = None
     return {
         'lang': lang
     }


 class ContentLanguageIndexer(BaseIndexer):
     """Indexer in charge of:

     - filtering out content already indexed
     - reading content from objstorage per the content's id (sha1)
     - computing the language from that content
     - storing the result in storage

     """
     CONFIG_BASE_FILENAME = 'indexer/language'

     ADDITIONAL_CONFIG = {
         'tools': ('dict', {
             'name': 'pygments',
             'version': '2.0.1+dfsg-1.1+deb8u1',
             'configuration': {
                 'type': 'library',
                 'debian-package': 'python3-pygments',
                 'max_content_size': 10240,
             },
         }),
     }

     def __init__(self):
         super().__init__()
         c = self.config
         self.max_content_size = c['tools']['configuration']['max_content_size']

     def filter_contents(self, sha1s):
         """Filter out known sha1s and return only missing ones.

         """
         tools = self.retrieve_tools_information()
         yield from self.storage.content_language_missing((
             {
                 'id': sha1,
                 'indexer_configuration_id': tools['id'],
             } for sha1 in sha1s
         ))

     def index_content(self, sha1, raw_content):
         """Index sha1's content and store the result.

         Args:
             sha1 (bytes): content's identifier
             raw_content (bytes): raw content in bytes

         Returns:
             A dict, representing a content_language, with keys:
             - id (bytes): content's identifier (sha1)
             - lang (bytes): detected language

         """
+        encoding = _detect_encoding(raw_content)
+
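+        # the encoding is detected once on the full content; only the
+        # size of the decode window changes across the retries below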
         l = len(raw_content)
-        if self.max_content_size <= l:
-            raw_content = raw_content[0:self.max_content_size]
-
-        result = compute_language(raw_content, log=self.log)
-        result.update({
-            'id': sha1,
-            'indexer_configuration_id': self.tools['id'],
-        })
+        result = {
+            'id': sha1,
+            'indexer_configuration_id': self.tools['id'],
+            'lang': None,
+        }
+        # Cutting at max_content_size may split a multi-byte character
+        # (a UTF-8 code point spans at most 4 bytes), which would raise
+        # a UnicodeDecodeError, so widen the window by up to 3 bytes
+        # until the decode succeeds.
+        for i in range(0, 4):
+            max_size = self.max_content_size + i
+
+            try:
+                result.update(compute_language_from_chunk(
+                    encoding, l, raw_content, max_size, log=self.log))
+            except UnicodeDecodeError:
+                self.log.warning(
+                    'Decoding failed on wrong byte chunk at [0-%s]'
+                    ', trying again at next ending byte.' % max_size)
+                continue
+
+            # decoding succeeded; keep this detection result
+            break
+
         return result

     def persist_index_computations(self, results, policy_update):
         """Persist the results in storage.

         Args:
             results ([dict]): list of content_language dicts with the
                 following keys:
                 - id (bytes): content's identifier (sha1)
                 - lang (bytes): detected language
             policy_update ([str]): either 'update-dups' or 'ignore-dups' to
                 respectively update duplicates or ignore them

         """
         self.storage.content_language_add(
             results, conflict_update=(policy_update == 'update-dups'))
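
To make the retry logic easier to review in isolation, here is a minimal,
self-contained sketch of the same idea outside the indexer. The function
name detect_language_in_window and the sample input are illustrative only
(they are not part of this diff), and the sketch assumes an encoding such
as UTF-8 where a code point spans at most 4 bytes.

from pygments.lexers import guess_lexer
from pygments.util import ClassNotFound


def detect_language_in_window(raw, max_size, encoding='utf-8'):
    """Decode at most max_size bytes of raw, then guess the language.

    A UTF-8 code point is 1 to 4 bytes long, so a cut at max_size may land
    inside a character; widening the window by up to 3 bytes is enough to
    reach the next character boundary.
    """
    for extra in range(4):  # windows of max_size, max_size+1, ..., +3
        try:
            text = raw[:max_size + extra].decode(encoding)
        except UnicodeDecodeError:
            continue  # cut a character in half; widen the window and retry
        try:
            # same normalization as _cleanup_classname above
            return guess_lexer(text).name.lower().replace(' ', '-')
        except ClassNotFound:
            return None
    return None  # not decodable even after widening; give up


sample = 'def f():\n    return "héllo"\n'.encode('utf-8')
# deliberately cut inside the two-byte 'é': the first decode fails and
# the loop widens the window by one byte
print(detect_language_in_window(sample, sample.index(b'\xc3') + 1))

As in the diff, only UnicodeDecodeError triggers a retry; a ClassNotFound
from pygments means the decode worked but no lexer matched, so widening
the window would not help.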
