# Copyright (C) 2016 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from .mimetype import ContentMimetypeIndexer
from .language import ContentLanguageIndexer
from .ctags import CtagsIndexer


# Indexer implementations, keyed by the short name of the index they compute.
INDEXER_CLASSES = dict(
    mimetype=ContentMimetypeIndexer,
    language=ContentLanguageIndexer,
    ctags=CtagsIndexer,
)


# Dotted paths of the task classes (see swh/indexer/tasks.py), keyed by the
# short task name. The indexer keys match those of INDEXER_CLASSES.
TASK_NAMES = dict(
    orchestrator_all='swh.indexer.tasks.SWHOrchestratorAllContentsTask',
    orchestrator_text='swh.indexer.tasks.SWHOrchestratorTextContentsTask',
    mimetype='swh.indexer.tasks.SWHContentMimetypeTask',
    language='swh.indexer.tasks.SWHContentLanguageTask',
    ctags='swh.indexer.tasks.SWHCtagsTask',
)


# Public names of the package.
__all__ = [
    'INDEXER_CLASSES',
    'TASK_NAMES',
    'ContentMimetypeIndexer',
    'ContentLanguageIndexer',
    'CtagsIndexer',
]
# Copyright (C) 2015-2016 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import click
import subprocess
import json

from swh.core import hashutil

from .language import compute_language
from .indexer import BaseIndexer, DiskIndexer


# Options used to compute tags
__FLAGS = [
    '--fields=+nz',  # +n: line number of tag definition
                     # +z: include the symbol's kind (function, variable, ...)
    '--sort=no',     # do not sort the output on tag name
    '--links=no',    # do not follow symlinks
    '--output-format=json',  # outputs in json
]


def run_ctags(path, lang=None, ctags_binary='ctags'):
    """Run ctags on a file and yield the symbols it reports.

    Args:
        path: path to the file
        lang: language to force for that path (optional); when falsy, no
            ``--language-force`` option is passed to ctags
        ctags_binary: name or path of the ctags executable to run

    Yields:
        dict: one per non-empty line of ctags' output, i.e. the decoded
        JSON object without its '_type' and 'path' keys

    Raises:
        subprocess.CalledProcessError: if ctags exits with a non-zero
            status
        ValueError: if a non-empty output line is not valid JSON

    """
    optional = []
    if lang:
        optional = ['--language-force=%s' % lang]

    cmd = [ctags_binary] + __FLAGS + optional + [path]
    output = subprocess.check_output(cmd, universal_newlines=True)

    # Split on '\n' only: each symbol is one JSON document per line
    for symbol in output.split('\n'):
        if not symbol:
            continue
        js_symbol = json.loads(symbol)
        yield {
            k: v for k, v in js_symbol.items()
            if k not in ('_type', 'path')
        }


class CtagsIndexer(BaseIndexer, DiskIndexer):
    """Indexer which computes the ctags symbols of a content.

    The content's language is detected first, then mapped to a ctags
    language through the 'languages' configuration entry; contents whose
    language has no mapping get an empty list of symbols.

    NOTE(review): self.config and self.storage, as well as write_to_temp
    and cleanup, are presumably provided by BaseIndexer / DiskIndexer --
    confirm against swh/indexer/indexer.py.

    """
    CONFIG_BASE_FILENAME = 'indexer/ctags'

    ADDITIONAL_CONFIG = {
        'ctags': ('str', '/usr/bin/ctags'),
        'workdir': ('str', '/tmp/swh/indexer.ctags'),
        # detected language -> ctags language (None: not handled by ctags)
        'languages': ('dict', {
            'ada': 'Ada',
            'adl': None,
            'agda': None,
            # ...
        })
    }

    def __init__(self):
        super().__init__()
        self.working_directory = self.config['workdir']
        self.language_map = self.config['languages']
        self.ctags_binary = self.config['ctags']

    def filter_contents(self, sha1s):
        """Filter out known sha1s and yield only the missing ones.

        Args:
            sha1s: content identifiers to check against the storage

        Yields:
            whatever self.storage.content_ctags_missing yields for sha1s

        """
        yield from self.storage.content_ctags_missing(sha1s)

    def index_content(self, sha1, raw_content):
        """Compute the ctags symbols of one content.

        Args:
            sha1 (bytes): content's identifier
            raw_content (bytes): raw content in bytes

        Returns:
            A dict, representing a content_ctags, with keys:
              - id (bytes): content's identifier (sha1)
              - ctags ([dict]): ctags list of symbols, empty when the
                language is not detected or has no ctags mapping

        """
        lang = compute_language(raw_content)['lang']

        ctags = {
            'id': sha1,
            'ctags': [],
        }

        # Only contents whose detected language maps to a ctags language
        # are worth running ctags on.
        ctags_lang = self.language_map.get(lang) if lang else None
        if not ctags_lang:
            return ctags

        filename = hashutil.hash_to_hex(sha1)
        content_path = self.write_to_temp(
            filename=filename,
            data=raw_content)

        try:
            # run_ctags is a generator: consume it while the file exists
            ctags['ctags'] = list(run_ctags(content_path,
                                            lang=ctags_lang,
                                            ctags_binary=self.ctags_binary))
        finally:
            # Clean up even when ctags fails, so a failing content does
            # not leave its temporary file behind.
            self.cleanup(content_path)

        return ctags

    def persist_index_computations(self, results, policy_update):
        """Persist the results in storage.

        Args:
            results ([dict]): list of content_ctags, dict with the
              following keys:
              - id (bytes): content's identifier (sha1)
              - ctags ([dict]): ctags list of symbols
            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
              respectively update duplicates or ignore them

        """
        self.storage.content_ctags_add(
            results, conflict_update=(policy_update == 'update-dups'))


# Command-line entry point: run ctags on --path with the default binary and
# print the resulting list of symbols.
@click.command()
@click.option('--path', help="Path to execute index on")
def main(path):
    r = list(run_ctags(path))
    print(r)


if __name__ == '__main__':
    main()
# Copyright (C) 2016 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from swh.scheduler.task import Task

from .orchestrator import OrchestratorAllContentsIndexer
from .orchestrator import OrchestratorTextContentsIndexer
from . import ContentMimetypeIndexer, ContentLanguageIndexer, CtagsIndexer


class SWHOrchestratorAllContentsTask(Task):
    """Orchestrating task: reads batches of contents (of any type) and
    broadcasts them back to other tasks.

    """
    task_queue = 'swh_indexer_orchestrator_content_all'

    def run(self, *args, **kwargs):
        # A fresh indexer is built for every run; its result is not returned
        indexer = OrchestratorAllContentsIndexer()
        indexer.run(*args, **kwargs)


class SWHOrchestratorTextContentsTask(Task):
    """Orchestrating task: reads batches of contents (of type text) and
    broadcasts them back to other tasks.

    """
    task_queue = 'swh_indexer_orchestrator_content_text'

    def run(self, *args, **kwargs):
        # A fresh indexer is built for every run; its result is not returned
        indexer = OrchestratorTextContentsIndexer()
        indexer.run(*args, **kwargs)


class SWHContentMimetypeTask(Task):
    """Task computing the mimetype and encoding from the sha1's content.

    """
    task_queue = 'swh_indexer_content_mimetype'

    def run(self, *args, **kwargs):
        # A fresh indexer is built for every run; its result is not returned
        indexer = ContentMimetypeIndexer()
        indexer.run(*args, **kwargs)


class SWHContentLanguageTask(Task):
    """Task computing the language from the sha1's content.

    """
    task_queue = 'swh_indexer_content_language'

    def run(self, *args, **kwargs):
        # A fresh indexer is built for every run; its result is not returned
        indexer = ContentLanguageIndexer()
        indexer.run(*args, **kwargs)


class SWHCtagsTask(Task):
    """Task computing the ctags from the sha1's content.

    """
    task_queue = 'swh_indexer_content_ctags'

    def run(self, *args, **kwargs):
        # A fresh indexer is built for every run; its result is not returned
        indexer = CtagsIndexer()
        indexer.run(*args, **kwargs)