diff --git a/requirements.txt b/requirements.txt
index a578b91..84e7278 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,5 @@
 vcversioner
-pygments
 click
-chardet
 file-magic
 pyld
 xmltodict
diff --git a/swh/indexer/ctags.py b/swh/indexer/ctags.py
index dbf7e15..28c82cf 100644
--- a/swh/indexer/ctags.py
+++ b/swh/indexer/ctags.py
@@ -1,146 +1,163 @@
 # Copyright (C) 2015-2017 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import subprocess
 import json
 
 from swh.model import hashutil
 
-from .language import compute_language
 from .indexer import ContentIndexer, write_to_temp
 
 
 # Options used to compute tags
 __FLAGS = [
     '--fields=+lnz',  # +l: language
                       # +n: line number of tag definition
                       # +z: include the symbol's kind (function, variable, ...)
     '--sort=no',      # sort output on tag name
     '--links=no',     # do not follow symlinks
     '--output-format=json',  # outputs in json
 ]
 
 
+def compute_language(content, log=None):
+    """Placeholder for the removed pygments-based language detection.
+
+    Args:
+        content (bytes): raw content whose language should be guessed
+        log: optional logger; accepted (and ignored) so that existing
+            callers passing ``log=`` reach the explicit error below
+            instead of failing with a TypeError
+
+    Raises:
+        NotImplementedError: always, detection is currently disabled
+
+    """
+    raise NotImplementedError(
+        'Language detection was unreliable, so it is currently disabled. '
+        'See https://forge.softwareheritage.org/D1455')
+
+
 def run_ctags(path, lang=None, ctags_command='ctags'):
     """Run ctags on file path with optional language.
Args: path: path to the file lang: language for that path (optional) Yields: dict: ctags' output """ optional = [] if lang: optional = ['--language-force=%s' % lang] cmd = [ctags_command] + __FLAGS + optional + [path] output = subprocess.check_output(cmd, universal_newlines=True) for symbol in output.split('\n'): if not symbol: continue js_symbol = json.loads(symbol) yield { 'name': js_symbol['name'], 'kind': js_symbol['kind'], 'line': js_symbol['line'], 'lang': js_symbol['language'], } class CtagsIndexer(ContentIndexer): CONFIG_BASE_FILENAME = 'indexer/ctags' ADDITIONAL_CONFIG = { 'workdir': ('str', '/tmp/swh/indexer.ctags'), 'tools': ('dict', { 'name': 'universal-ctags', 'version': '~git7859817b', 'configuration': { 'command_line': '''ctags --fields=+lnz --sort=no --links=no ''' '''--output-format=json ''' }, }), 'languages': ('dict', { 'ada': 'Ada', 'adl': None, 'agda': None, # ... }) } def prepare(self): super().prepare() self.working_directory = self.config['workdir'] self.language_map = self.config['languages'] def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.idx_storage.content_ctags_missing(( { 'id': sha1, 'indexer_configuration_id': self.tool['id'], } for sha1 in ids )) def index(self, id, data): """Index sha1s' content and store result. 
Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: dict: a dict representing a content_mimetype with keys: - **id** (bytes): content's identifier (sha1) - **ctags** ([dict]): ctags list of symbols """ lang = compute_language(data, log=self.log)['lang'] if not lang: return None ctags_lang = self.language_map.get(lang) if not ctags_lang: return None ctags = { 'id': id, } filename = hashutil.hash_to_hex(id) with write_to_temp( filename=filename, data=data, working_directory=self.working_directory) as content_path: result = run_ctags(content_path, lang=ctags_lang) ctags.update({ 'ctags': list(result), 'indexer_configuration_id': self.tool['id'], }) return ctags def persist_index_computations(self, results, policy_update): """Persist the results in storage. Args: results ([dict]): list of content_mimetype, dict with the following keys: - id (bytes): content's identifier (sha1) - ctags ([dict]): ctags list of symbols policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ self.idx_storage.content_ctags_add( results, conflict_update=(policy_update == 'update-dups')) diff --git a/swh/indexer/language.py b/swh/indexer/language.py deleted file mode 100644 index c69b1dc..0000000 --- a/swh/indexer/language.py +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright (C) 2016-2018 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - - -import io - -from pygments.lexers import guess_lexer -from pygments.util import ClassNotFound -from chardet.universaldetector import UniversalDetector - -from .indexer import ContentIndexer - - -def _cleanup_classname(classname): - """Determine the language from the pygments' lexer names. 
- - """ - return classname.lower().replace(' ', '-') - - -def _read_raw(raw_content, size=2048): - """Read raw content in chunk. - - """ - bs = io.BytesIO(raw_content) - while True: - chunk = bs.read(size) - if not chunk: - break - yield chunk - - -def _detect_encoding(raw_content): - """Given a raw content, try and detect its encoding. - - """ - detector = UniversalDetector() - for chunk in _read_raw(raw_content): - detector.feed(chunk) - if detector.done: - break - detector.close() - return detector.result['encoding'] - - -def compute_language_from_chunk(encoding, length, raw_content, max_size, - log=None): - """Determine the raw content's language. - - Args: - encoding (str): Encoding to use to decode the content - length (int): raw_content's length - raw_content (bytes): raw content to work with - max_size (int): max size to split the raw content at - - Returns: - dict: Dict with keys: - - **lang**: None if nothing found or the possible language - - """ - try: - if max_size <= length: - raw_content = raw_content[0:max_size] - - content = raw_content.decode(encoding) - lang = _cleanup_classname( - guess_lexer(content).name) - except ClassNotFound: - lang = None - except UnicodeDecodeError: - raise - except Exception: - if log: - log.exception('Problem during language detection, skipping') - lang = None - return { - 'lang': lang - } - - -def compute_language(raw_content, encoding=None, log=None): - """Determine the raw content's language. 
- - Args: - raw_content (bytes): raw content to work with - - Returns: - dict: Dict with keys: - - **lang**: None if nothing found or the possible language - - """ - try: - encoding = _detect_encoding(raw_content) - content = raw_content.decode(encoding) - lang = _cleanup_classname( - guess_lexer(content).name) - except ClassNotFound: - lang = None - except Exception: - if log: - log.exception('Problem during language detection, skipping') - lang = None - return { - 'lang': lang - } - - -class LanguageIndexer(ContentIndexer): - """Indexer in charge of: - - - filtering out content already indexed - - reading content from objstorage per the content's id (sha1) - - computing {mimetype, encoding} from that content - - store result in storage - - """ - CONFIG_BASE_FILENAME = 'indexer/language' - - ADDITIONAL_CONFIG = { - 'tools': ('dict', { - 'name': 'pygments', - 'version': '2.0.1+dfsg-1.1+deb8u1', - 'configuration': { - 'type': 'library', - 'debian-package': 'python3-pygments', - 'max_content_size': 10240, - }, - }), - } - - @property - def max_content_size(self): - return self.tool['tool_configuration']['max_content_size'] - - def filter(self, ids): - """Filter out known sha1s and return only missing ones. - - """ - yield from self.idx_storage.content_language_missing(( - { - 'id': sha1, - 'indexer_configuration_id': self.tool['id'] - } for sha1 in ids - )) - - def index(self, id, data): - """Index sha1s' content and store result. 
- - Args: - id (bytes): content's identifier - data (bytes): raw content in bytes - - Returns: - dict: Dict that represents a content_mimetype, with keys: - - id (bytes): content's identifier (sha1) - - lang (bytes): detected language - - """ - result = { - 'id': id, - 'indexer_configuration_id': self.tool['id'], - 'lang': None, - } - - encoding = _detect_encoding(data) - - if not encoding: - return result - - _len = len(data) - for i in range(0, 9): - max_size = self.max_content_size + i - - try: - result = compute_language_from_chunk( - encoding, _len, data, max_size, log=self.log) - except UnicodeDecodeError: - self.log.warning( - 'Decoding failed on wrong byte chunk at [0-%s]' - ', trying again at next ending byte.' % max_size) - continue - - # we found something, so we return it - result.update({ - 'id': id, - 'indexer_configuration_id': self.tool['id'], - }) - break - - return result - - def persist_index_computations(self, results, policy_update): - """Persist the results in storage. 
- - Args: - results ([dict]): list of content_mimetype, dict with the - following keys: - - id (bytes): content's identifier (sha1) - - lang (bytes): detected language - policy_update ([str]): either 'update-dups' or 'ignore-dups' to - respectively update duplicates or ignore them - - """ - self.idx_storage.content_language_add( - results, conflict_update=(policy_update == 'update-dups')) diff --git a/swh/indexer/tasks.py b/swh/indexer/tasks.py index 97f921c..dc47146 100644 --- a/swh/indexer/tasks.py +++ b/swh/indexer/tasks.py @@ -1,64 +1,57 @@ # Copyright (C) 2016-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from celery import current_app as app from .mimetype import MimetypeIndexer, MimetypeRangeIndexer -from .language import LanguageIndexer from .ctags import CtagsIndexer from .fossology_license import ( FossologyLicenseIndexer, FossologyLicenseRangeIndexer ) from .rehash import RecomputeChecksums from .metadata import OriginMetadataIndexer @app.task(name=__name__ + '.OriginMetadata') def origin_metadata(*args, **kwargs): results = OriginMetadataIndexer().run(*args, **kwargs) return getattr(results, 'results', results) -@app.task(name=__name__ + '.ContentLanguage') -def content_language(*args, **kwargs): - results = LanguageIndexer().run(*args, **kwargs) - return getattr(results, 'results', results) - - @app.task(name=__name__ + '.Ctags') def ctags(*args, **kwargs): results = CtagsIndexer().run(*args, **kwargs) return getattr(results, 'results', results) @app.task(name=__name__ + '.ContentFossologyLicense') def fossology_license(*args, **kwargs): results = FossologyLicenseIndexer().run(*args, **kwargs) return getattr(results, 'results', results) @app.task(name=__name__ + '.RecomputeChecksums') def recompute_checksums(*args, **kwargs): results = RecomputeChecksums().run(*args, 
**kwargs) return getattr(results, 'results', results) @app.task(name=__name__ + '.ContentMimetype') def mimetype(*args, **kwargs): results = MimetypeIndexer().run(*args, **kwargs) return {'status': 'eventful' if results else 'uneventful'} @app.task(name=__name__ + '.ContentRangeMimetype') def range_mimetype(*args, **kwargs): results = MimetypeRangeIndexer().run(*args, **kwargs) return {'status': 'eventful' if results else 'uneventful'} @app.task(name=__name__ + '.ContentRangeFossologyLicense') def range_license(*args, **kwargs): results = FossologyLicenseRangeIndexer().run(*args, **kwargs) return {'status': 'eventful' if results else 'uneventful'} diff --git a/swh/indexer/tests/test_language.py b/swh/indexer/tests/test_language.py deleted file mode 100644 index dc4e0c0..0000000 --- a/swh/indexer/tests/test_language.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (C) 2017-2018 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import unittest -import pytest - -from swh.indexer import language -from swh.indexer.language import LanguageIndexer -from swh.indexer.tests.utils import ( - CommonContentIndexerTest, - BASE_TEST_CONFIG, fill_storage, fill_obj_storage, filter_dict, -) - - -CONFIG = { - **BASE_TEST_CONFIG, - 'tools': { - 'name': 'pygments', - 'version': '2.0.1+dfsg-1.1+deb8u1', - 'configuration': { - 'type': 'library', - 'debian-package': 'python3-pygments', - 'max_content_size': 10240, - }, - } -} - - -class Language(unittest.TestCase): - """Tests pygments tool for language detection - - """ - def test_compute_language_none(self): - # given - self.content = "" - self.declared_language = { - 'lang': None - } - # when - result = language.compute_language(self.content) - # then - self.assertEqual(self.declared_language, result) - - -class TestLanguageIndexer(CommonContentIndexerTest, 
unittest.TestCase): - """Language indexer test scenarios: - - - Known sha1s in the input list have their data indexed - - Unknown sha1 in the input list are not indexed - - """ - - legacy_get_format = True - - def get_indexer_results(self, ids): - yield from self.indexer.idx_storage.content_language_get(ids) - - def setUp(self): - self.indexer = LanguageIndexer(config=CONFIG) - self.indexer.catch_exceptions = False - fill_storage(self.indexer.storage) - fill_obj_storage(self.indexer.objstorage) - - self.id0 = '02fb2c89e14f7fab46701478c83779c7beb7b069' - self.id1 = '103bc087db1d26afc3a0283f38663d081e9b01e6' - self.id2 = 'd4c647f0fc257591cc9ba1722484229780d1c607' - - tool = {k.replace('tool_', ''): v - for (k, v) in self.indexer.tool.items()} - - self.expected_results = { - self.id0: { - 'id': self.id0, - 'tool': tool, - 'lang': 'python', - }, - self.id1: { - 'id': self.id1, - 'tool': tool, - 'lang': 'c' - }, - self.id2: { - 'id': self.id2, - 'tool': tool, - 'lang': 'text-only' - } - } - - -def test_language_w_no_tool(): - with pytest.raises(ValueError): - LanguageIndexer(config=filter_dict(CONFIG, 'tools'))