Rename ContentLanguageIndexer to LanguageIndexer.

The class defined in swh/indexer/language.py is renamed, and its import and
use in swh/indexer/tasks.py and swh/indexer/tests/test_language.py are
updated to match.  The copyright range of language.py is extended to 2018.
Docstring lines that still described mimetype or fossology-license indexing
are corrected to describe language indexing; no executable code other than
the rename is touched.

NOTE(review): compute_language() accepts an `encoding` argument but always
overwrites it with the result of _detect_encoding(raw_content).  Confirm
whether the argument is meant to be honoured.  Left unchanged here.

diff --git a/swh/indexer/language.py b/swh/indexer/language.py
index 7d0e3a4..5076de8 100644
--- a/swh/indexer/language.py
+++ b/swh/indexer/language.py
@@ -1,209 +1,209 @@
-# Copyright (C) 2016-2017 The Software Heritage developers
+# Copyright (C) 2016-2018 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import io
 
 from pygments.lexers import guess_lexer
 from pygments.util import ClassNotFound
 from chardet.universaldetector import UniversalDetector
 
 from .indexer import ContentIndexer
 
 
 def _cleanup_classname(classname):
     """Determine the language from the pygments' lexer names.
 
     """
     return classname.lower().replace(' ', '-')
 
 
 def _read_raw(raw_content, size=2048):
     """Read raw content in chunk.
 
     """
     bs = io.BytesIO(raw_content)
     while True:
         chunk = bs.read(size)
         if not chunk:
             break
         yield chunk
 
 
 def _detect_encoding(raw_content):
     """Given a raw content, try and detect its encoding.
 
     """
     detector = UniversalDetector()
     for chunk in _read_raw(raw_content):
         detector.feed(chunk)
         if detector.done:
             break
     detector.close()
     return detector.result['encoding']
 
 
 def compute_language_from_chunk(encoding, length, raw_content, max_size,
                                 log=None):
     """Determine the raw content's language.
 
     Args:
         encoding (str): Encoding to use to decode the content
         length (int): raw_content's length
         raw_content (bytes): raw content to work with
         max_size (int): max size to split the raw content at
 
     Returns:
         dict: Dict with keys:
         - **lang**: None if nothing found or the possible language
 
     """
     try:
         if max_size <= length:
             raw_content = raw_content[0:max_size]
 
         content = raw_content.decode(encoding)
         lang = _cleanup_classname(
             guess_lexer(content).name)
     except ClassNotFound:
         lang = None
     except UnicodeDecodeError:
         raise
     except Exception:
         if log:
             log.exception('Problem during language detection, skipping')
         lang = None
     return {
         'lang': lang
     }
 
 
 def compute_language(raw_content, encoding=None, log=None):
     """Determine the raw content's language.
 
     Args:
         raw_content (bytes): raw content to work with
 
     Returns:
         dict: Dict with keys:
         - **lang**: None if nothing found or the possible language
 
     """
     try:
         encoding = _detect_encoding(raw_content)
         content = raw_content.decode(encoding)
         lang = _cleanup_classname(
             guess_lexer(content).name)
     except ClassNotFound:
         lang = None
     except Exception:
         if log:
             log.exception('Problem during language detection, skipping')
         lang = None
     return {
         'lang': lang
     }
 
 
-class ContentLanguageIndexer(ContentIndexer):
+class LanguageIndexer(ContentIndexer):
     """Indexer in charge of:
     - filtering out content already indexed
     - reading content from objstorage per the content's id (sha1)
-    - computing {mimetype, encoding} from that content
+    - computing the language from that content
     - store result in storage
 
     """
     CONFIG_BASE_FILENAME = 'indexer/language'
 
     ADDITIONAL_CONFIG = {
         'tools': ('dict', {
             'name': 'pygments',
             'version': '2.0.1+dfsg-1.1+deb8u1',
             'configuration': {
                 'type': 'library',
                 'debian-package': 'python3-pygments',
                 'max_content_size': 10240,
             },
         }),
     }
 
     def prepare(self):
         super().prepare()
         c = self.config
         self.max_content_size = c['tools']['configuration']['max_content_size']
         self.tool = self.tools[0]
 
     def filter(self, ids):
         """Filter out known sha1s and return only missing ones.
 
         """
         yield from self.idx_storage.content_language_missing((
             {
                 'id': sha1,
                 'indexer_configuration_id': self.tool['id']
             } for sha1 in ids
         ))
 
     def index(self, id, data):
         """Index sha1s' content and store result.
 
         Args:
             id (bytes): content's identifier
             data (bytes): raw content in bytes
 
         Returns:
-            dict: Dict that represents a content_mimetype, with keys:
+            dict: Dict that represents a content_language, with keys:
             - id (bytes): content's identifier (sha1)
-            - lang (bytes): detected language
+            - lang (str): detected language, None if nothing was detected
 
         """
         result = {
             'id': id,
             'indexer_configuration_id': self.tool['id'],
             'lang': None,
         }
 
         encoding = _detect_encoding(data)
 
         if not encoding:
             return result
 
         _len = len(data)
         for i in range(0, 9):
             max_size = self.max_content_size + i
 
             try:
                 result = compute_language_from_chunk(
                     encoding, _len, data, max_size, log=self.log)
             except UnicodeDecodeError:
                 self.log.warning(
                     'Decoding failed on wrong byte chunk at [0-%s]'
                     ', trying again at next ending byte.' % max_size)
                 continue
 
             # we found something, so we return it
             result.update({
                 'id': id,
                 'indexer_configuration_id': self.tool['id'],
             })
             break
 
         return result
 
     def persist_index_computations(self, results, policy_update):
         """Persist the results in storage.
 
         Args:
-            results ([dict]): list of content_mimetype, dict with the
+            results ([dict]): list of content_language, dict with the
               following keys:
               - id (bytes): content's identifier (sha1)
-              - lang (bytes): detected language
+              - lang (str): detected language
             policy_update ([str]): either 'update-dups' or 'ignore-dups' to
               respectively update duplicates or ignore them
 
         """
         self.idx_storage.content_language_add(
             results, conflict_update=(policy_update == 'update-dups'))
diff --git a/swh/indexer/tasks.py b/swh/indexer/tasks.py
index da4e585..6b7372f 100644
--- a/swh/indexer/tasks.py
+++ b/swh/indexer/tasks.py
@@ -1,119 +1,119 @@
 # Copyright (C) 2016-2018 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import logging
 
 from swh.scheduler.task import Task as SchedulerTask
 
 from .mimetype import MimetypeIndexer, MimetypeRangeIndexer
-from .language import ContentLanguageIndexer
+from .language import LanguageIndexer
 from .ctags import CtagsIndexer
 from .fossology_license import (
     FossologyLicenseIndexer, FossologyLicenseRangeIndexer
 )
 from .rehash import RecomputeChecksums
 from .metadata import RevisionMetadataIndexer, OriginMetadataIndexer
 from .origin_head import OriginHeadIndexer
 
 logging.basicConfig(level=logging.INFO)
 
 
 class Task(SchedulerTask):
     """Task whose results is needed for other computations.
 
     """
     def run_task(self, *args, **kwargs):
         indexer = self.Indexer().run(*args, **kwargs)
         if hasattr(indexer, 'results'):  # indexer tasks
             return indexer.results
         return indexer
 
 
 class StatusTask(SchedulerTask):
     """Task which returns a status either eventful or uneventful.
 
     """
     def run_task(self, *args, **kwargs):
         results = self.Indexer().run(*args, **kwargs)
         return {'status': 'eventful' if results else 'uneventful'}
 
 
 class RevisionMetadata(Task):
     task_queue = 'swh_indexer_revision_metadata'
 
     serializer = 'msgpack'
 
     Indexer = RevisionMetadataIndexer
 
 
 class OriginMetadata(Task):
     task_queue = 'swh_indexer_origin_intrinsic_metadata'
 
     Indexer = OriginMetadataIndexer
 
 
 class OriginHead(Task):
     task_queue = 'swh_indexer_origin_head'
 
     Indexer = OriginHeadIndexer
 
 
 class ContentMimetype(StatusTask):
     """Compute (mimetype, encoding) on a list of sha1s' content.
 
     """
     task_queue = 'swh_indexer_content_mimetype'
     Indexer = MimetypeIndexer
 
 
 class ContentRangeMimetype(StatusTask):
     """Compute (mimetype, encoding) on a range of sha1s.
 
     """
     task_queue = 'swh_indexer_content_mimetype_range'
     Indexer = MimetypeRangeIndexer
 
 
 class ContentLanguage(Task):
     """Task which computes the language from the sha1's content.
 
     """
     task_queue = 'swh_indexer_content_language'
 
-    Indexer = ContentLanguageIndexer
+    Indexer = LanguageIndexer
 
 
 class Ctags(Task):
     """Task which computes ctags from the sha1's content.
 
     """
     task_queue = 'swh_indexer_content_ctags'
 
     Indexer = CtagsIndexer
 
 
 class ContentFossologyLicense(Task):
     """Compute fossology licenses on a list of sha1s' content.
 
     """
     task_queue = 'swh_indexer_content_fossology_license'
     Indexer = FossologyLicenseIndexer
 
 
 class ContentRangeFossologyLicense(StatusTask):
     """Compute fossology license on a range of sha1s.
 
     """
     task_queue = 'swh_indexer_content_fossology_license_range'
     Indexer = FossologyLicenseRangeIndexer
 
 
 class RecomputeChecksums(Task):
     """Task which recomputes hashes and possibly new ones.
 
     """
     task_queue = 'swh_indexer_content_rehash'
 
     Indexer = RecomputeChecksums
diff --git a/swh/indexer/tests/test_language.py b/swh/indexer/tests/test_language.py
index dbe1e57..c53524e 100644
--- a/swh/indexer/tests/test_language.py
+++ b/swh/indexer/tests/test_language.py
@@ -1,98 +1,98 @@
 # Copyright (C) 2017-2018 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import unittest
 from swh.indexer import language
-from swh.indexer.language import ContentLanguageIndexer
+from swh.indexer.language import LanguageIndexer
 from swh.indexer.tests.test_utils import (
     BasicMockIndexerStorage, MockObjStorage, CommonContentIndexerTest,
     CommonIndexerWithErrorsTest, CommonIndexerNoTool, BASE_TEST_CONFIG
 )
 
 
-class LanguageTestIndexer(ContentLanguageIndexer):
+class LanguageTestIndexer(LanguageIndexer):
     """Specific language whose configuration is enough to satisfy the
        indexing tests.
 
     """
     def parse_config_file(self, *args, **kwargs):
         return {
             **BASE_TEST_CONFIG,
             'tools': {
                 'name': 'pygments',
                 'version': '2.0.1+dfsg-1.1+deb8u1',
                 'configuration': {
                     'type': 'library',
                     'debian-package': 'python3-pygments',
                     'max_content_size': 10240,
                 },
             }
         }
 
     def prepare(self):
         super().prepare()
         self.idx_storage = BasicMockIndexerStorage()
         self.objstorage = MockObjStorage()
         self.tool_config = self.config['tools']['configuration']
 
 
 class Language(unittest.TestCase):
     """Tests pygments tool for language detection
 
     """
     def test_compute_language_none(self):
         # given
         self.content = ""
         self.declared_language = {
             'lang': None
         }
         # when
         result = language.compute_language(self.content)
         # then
         self.assertEqual(self.declared_language, result)
 
 
 class TestLanguageIndexer(CommonContentIndexerTest, unittest.TestCase):
     """Language indexer test scenarios:
     - Known sha1s in the input list have their data indexed
     - Unknown sha1 in the input list are not indexed
 
     """
     def setUp(self):
         self.indexer = LanguageTestIndexer()
 
         self.id0 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
         self.id1 = '103bc087db1d26afc3a0283f38663d081e9b01e6'
         self.id2 = 'd4c647f0fc257591cc9ba1722484229780d1c607'
         tool_id = self.indexer.tool['id']
 
         self.expected_results = {
             self.id0: {
                 'id': self.id0,
                 'indexer_configuration_id': tool_id,
                 'lang': 'python',
             },
             self.id1: {
                 'id': self.id1,
                 'indexer_configuration_id': tool_id,
                 'lang': 'c'
             },
             self.id2: {
                 'id': self.id2,
                 'indexer_configuration_id': tool_id,
                 'lang': 'text-only'
             }
         }
 
 
 class LanguageIndexerUnknownToolTestStorage(
         CommonIndexerNoTool, LanguageTestIndexer):
-    """Fossology license indexer with wrong configuration"""
+    """Language indexer with wrong configuration"""
 
 
 class TestLanguageIndexersErrors(
         CommonIndexerWithErrorsTest, unittest.TestCase):
     """Test the indexer raise the right errors when wrongly initialized"""
     Indexer = LanguageIndexerUnknownToolTestStorage