diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,5 @@ vcversioner -pygments click -chardet file-magic pyld xmltodict diff --git a/swh/indexer/ctags.py b/swh/indexer/ctags.py --- a/swh/indexer/ctags.py +++ b/swh/indexer/ctags.py @@ -8,7 +8,6 @@ from swh.model import hashutil -from .language import compute_language from .indexer import ContentIndexer, write_to_temp @@ -23,6 +22,12 @@ ] +def compute_language(content): + raise NotImplementedError( + 'Language detection was unreliable, so it is currently disabled. ' + 'See https://forge.softwareheritage.org/D1455') + + def run_ctags(path, lang=None, ctags_command='ctags'): """Run ctags on file path with optional language. diff --git a/swh/indexer/language.py b/swh/indexer/language.py deleted file mode 100644 --- a/swh/indexer/language.py +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright (C) 2016-2018 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - - -import io - -from pygments.lexers import guess_lexer -from pygments.util import ClassNotFound -from chardet.universaldetector import UniversalDetector - -from .indexer import ContentIndexer - - -def _cleanup_classname(classname): - """Determine the language from the pygments' lexer names. - - """ - return classname.lower().replace(' ', '-') - - -def _read_raw(raw_content, size=2048): - """Read raw content in chunk. - - """ - bs = io.BytesIO(raw_content) - while True: - chunk = bs.read(size) - if not chunk: - break - yield chunk - - -def _detect_encoding(raw_content): - """Given a raw content, try and detect its encoding. - - """ - detector = UniversalDetector() - for chunk in _read_raw(raw_content): - detector.feed(chunk) - if detector.done: - break - detector.close() - return detector.result['encoding'] - - -def compute_language_from_chunk(encoding, length, raw_content, max_size, - log=None): - """Determine the raw content's language. - - Args: - encoding (str): Encoding to use to decode the content - length (int): raw_content's length - raw_content (bytes): raw content to work with - max_size (int): max size to split the raw content at - - Returns: - dict: Dict with keys: - - **lang**: None if nothing found or the possible language - - """ - try: - if max_size <= length: - raw_content = raw_content[0:max_size] - - content = raw_content.decode(encoding) - lang = _cleanup_classname( - guess_lexer(content).name) - except ClassNotFound: - lang = None - except UnicodeDecodeError: - raise - except Exception: - if log: - log.exception('Problem during language detection, skipping') - lang = None - return { - 'lang': lang - } - - -def compute_language(raw_content, encoding=None, log=None): - """Determine the raw content's language. - - Args: - raw_content (bytes): raw content to work with - - Returns: - dict: Dict with keys: - - **lang**: None if nothing found or the possible language - - """ - try: - encoding = _detect_encoding(raw_content) - content = raw_content.decode(encoding) - lang = _cleanup_classname( - guess_lexer(content).name) - except ClassNotFound: - lang = None - except Exception: - if log: - log.exception('Problem during language detection, skipping') - lang = None - return { - 'lang': lang - } - - -class LanguageIndexer(ContentIndexer): - """Indexer in charge of: - - - filtering out content already indexed - - reading content from objstorage per the content's id (sha1) - - computing {mimetype, encoding} from that content - - store result in storage - - """ - CONFIG_BASE_FILENAME = 'indexer/language' - - ADDITIONAL_CONFIG = { - 'tools': ('dict', { - 'name': 'pygments', - 'version': '2.0.1+dfsg-1.1+deb8u1', - 'configuration': { - 'type': 'library', - 'debian-package': 'python3-pygments', - 'max_content_size': 10240, - }, - }), - } - - @property - def max_content_size(self): - return self.tool['tool_configuration']['max_content_size'] - - def filter(self, ids): - """Filter out known sha1s and return only missing ones. - - """ - yield from self.idx_storage.content_language_missing(( - { - 'id': sha1, - 'indexer_configuration_id': self.tool['id'] - } for sha1 in ids - )) - - def index(self, id, data): - """Index sha1s' content and store result. - - Args: - id (bytes): content's identifier - data (bytes): raw content in bytes - - Returns: - dict: Dict that represents a content_mimetype, with keys: - - id (bytes): content's identifier (sha1) - - lang (bytes): detected language - - """ - result = { - 'id': id, - 'indexer_configuration_id': self.tool['id'], - 'lang': None, - } - - encoding = _detect_encoding(data) - - if not encoding: - return result - - _len = len(data) - for i in range(0, 9): - max_size = self.max_content_size + i - - try: - result = compute_language_from_chunk( - encoding, _len, data, max_size, log=self.log) - except UnicodeDecodeError: - self.log.warning( - 'Decoding failed on wrong byte chunk at [0-%s]' - ', trying again at next ending byte.' % max_size) - continue - - # we found something, so we return it - result.update({ - 'id': id, - 'indexer_configuration_id': self.tool['id'], - }) - break - - return result - - def persist_index_computations(self, results, policy_update): - """Persist the results in storage. - - Args: - results ([dict]): list of content_mimetype, dict with the - following keys: - - id (bytes): content's identifier (sha1) - - lang (bytes): detected language - policy_update ([str]): either 'update-dups' or 'ignore-dups' to - respectively update duplicates or ignore them - - """ - self.idx_storage.content_language_add( - results, conflict_update=(policy_update == 'update-dups')) diff --git a/swh/indexer/tasks.py b/swh/indexer/tasks.py --- a/swh/indexer/tasks.py +++ b/swh/indexer/tasks.py @@ -7,7 +7,6 @@ from celery import current_app as app from .mimetype import MimetypeIndexer, MimetypeRangeIndexer -from .language import LanguageIndexer from .ctags import CtagsIndexer from .fossology_license import ( FossologyLicenseIndexer, FossologyLicenseRangeIndexer @@ -22,12 +21,6 @@ return getattr(results, 'results', results) -@app.task(name=__name__ + '.ContentLanguage') -def content_language(*args, **kwargs): - results = LanguageIndexer().run(*args, **kwargs) - return getattr(results, 'results', results) - - @app.task(name=__name__ + '.Ctags') def ctags(*args, **kwargs): results = CtagsIndexer().run(*args, **kwargs) diff --git a/swh/indexer/tests/test_language.py b/swh/indexer/tests/test_language.py deleted file mode 100644 --- a/swh/indexer/tests/test_language.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (C) 2017-2018 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import unittest -import pytest - -from swh.indexer import language -from swh.indexer.language import LanguageIndexer -from swh.indexer.tests.utils import ( - CommonContentIndexerTest, - BASE_TEST_CONFIG, fill_storage, fill_obj_storage, filter_dict, -) - - -CONFIG = { - **BASE_TEST_CONFIG, - 'tools': { - 'name': 'pygments', - 'version': '2.0.1+dfsg-1.1+deb8u1', - 'configuration': { - 'type': 'library', - 'debian-package': 'python3-pygments', - 'max_content_size': 10240, - }, - } -} - - -class Language(unittest.TestCase): - """Tests pygments tool for language detection - - """ - def test_compute_language_none(self): - # given - self.content = "" - self.declared_language = { - 'lang': None - } - # when - result = language.compute_language(self.content) - # then - self.assertEqual(self.declared_language, result) - - -class TestLanguageIndexer(CommonContentIndexerTest, unittest.TestCase): - """Language indexer test scenarios: - - - Known sha1s in the input list have their data indexed - - Unknown sha1 in the input list are not indexed - - """ - - legacy_get_format = True - - def get_indexer_results(self, ids): - yield from self.indexer.idx_storage.content_language_get(ids) - - def setUp(self): - self.indexer = LanguageIndexer(config=CONFIG) - self.indexer.catch_exceptions = False - fill_storage(self.indexer.storage) - fill_obj_storage(self.indexer.objstorage) - - self.id0 = '02fb2c89e14f7fab46701478c83779c7beb7b069' - self.id1 = '103bc087db1d26afc3a0283f38663d081e9b01e6' - self.id2 = 'd4c647f0fc257591cc9ba1722484229780d1c607' - - tool = {k.replace('tool_', ''): v - for (k, v) in self.indexer.tool.items()} - - self.expected_results = { - self.id0: { - 'id': self.id0, - 'tool': tool, - 'lang': 'python', - }, - self.id1: { - 'id': self.id1, - 'tool': tool, - 'lang': 'c' - }, - self.id2: { - 'id': self.id2, - 'tool': tool, - 'lang': 'text-only' - } - } - - -def test_language_w_no_tool(): - with pytest.raises(ValueError): - LanguageIndexer(config=filter_dict(CONFIG, 'tools'))