# Copyright (C) 2015-2016  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import click
import subprocess
import json

from swh.core import hashutil

from .language import compute_language
from .indexer import BaseIndexer, DiskIndexer


# Options used to compute tags
__FLAGS = [
    '--fields=+lnz',  # +l: language
                      # +n: line number of tag definition
                      # +z: include the symbol's kind (function, variable, ...)
    '--sort=no',      # do NOT sort the output on tag name
    '--links=no',     # do not follow symlinks
    '--output-format=json',  # outputs in json
]


def run_ctags(path, lang=None, ctags_command='ctags'):
    """Run ctags on file path with optional language.

    Args:
        path: path to the file
        lang: language for that path (optional); when given, passed to
            ctags through --language-force
        ctags_command: the ctags executable to invoke (defaults to
            'ctags' resolved on the PATH)

    Yields:
        One dict per symbol emitted by ctags, with keys:
        - name (str): symbol name
        - kind (str): symbol's kind (function, variable, ...)
        - line (int): line number of the tag definition
        - lang (str): language detected by ctags

    Raises:
        subprocess.CalledProcessError: if ctags exits with a non-zero
            status.

    """
    optional = []
    if lang:
        optional = ['--language-force=%s' % lang]

    cmd = [ctags_command] + __FLAGS + optional + [path]
    output = subprocess.check_output(cmd, universal_newlines=True)

    # --output-format=json emits one json object per line
    for symbol in output.split('\n'):
        if not symbol:
            continue
        js_symbol = json.loads(symbol)
        yield {
            'name': js_symbol['name'],
            'kind': js_symbol['kind'],
            'line': js_symbol['line'],
            'lang': js_symbol['language'],
        }


class CtagsIndexer(BaseIndexer, DiskIndexer):
    """Indexer in charge of:
    - filtering out content already indexed
    - computing ctags symbols from contents written to disk
    - storing the results

    """
    CONFIG_BASE_FILENAME = 'indexer/ctags'

    ADDITIONAL_CONFIG = {
        'workdir': ('str', '/tmp/swh/indexer.ctags'),
        # Tool identity (name/version) is persisted alongside results so
        # that recomputation with a newer tool is possible.
        'tool': ('dict', {
            'name': 'universal-ctags',
            'version': '~git7859817b',
            'command': '/usr/bin/ctags',
        }),
        # Map from swh-detected language to the name ctags expects for
        # --language-force; None means "no ctags support for it".
        'languages': ('dict', {
            'ada': 'Ada',
            'adl': None,
            'agda': None,
            # ...
        })
    }

    def __init__(self):
        super().__init__()
        self.working_directory = self.config['workdir']
        self.language_map = self.config['languages']
        self.ctags_command = self.config['tool']['command']
        self.tool_name = self.config['tool']['name']
        self.tool_version = self.config['tool']['version']

    def filter_contents(self, sha1s):
        """Filter out known sha1s and return only missing ones.

        """
        yield from self.storage.content_ctags_missing((
            {
                'id': sha1,
                'tool_name': self.tool_name,
                'tool_version': self.tool_version
            } for sha1 in sha1s
        ))

    def index_content(self, sha1, raw_content):
        """Index sha1s' content and store result.

        Args:
            sha1 (bytes): content's identifier
            raw_content (bytes): raw content in bytes

        Returns:
            A dict, representing a content_ctags, with keys:
            - id (bytes): content's identifier (sha1)
            - ctags ([dict]): ctags list of symbols
            or None when no (supported) language could be detected.

        """
        lang = compute_language(raw_content)['lang']

        if not lang:
            return None

        # ctags needs its own language name; unsupported languages map
        # to None and are skipped.
        ctags_lang = self.language_map.get(lang)

        if not ctags_lang:
            return None

        ctags = {
            'id': sha1,
        }

        filename = hashutil.hash_to_hex(sha1)
        content_path = self.write_to_temp(
            filename=filename,
            data=raw_content)

        result = run_ctags(content_path, lang=ctags_lang,
                           ctags_command=self.ctags_command)
        ctags.update({
            'ctags': list(result),
            'tool_name': self.tool_name,
            'tool_version': self.tool_version,
        })

        self.cleanup(content_path)

        return ctags

    def persist_index_computations(self, results, policy_update):
        """Persist the results in storage.

        Args:
            results ([dict]): list of content_ctags, dict with the
            following keys:
            - id (bytes): content's identifier (sha1)
            - ctags ([dict]): ctags list of symbols
            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
            respectively update duplicates or ignore them

        """
        self.storage.content_ctags_add(
            results, conflict_update=(policy_update == 'update-dups'))


@click.command()
@click.option('--path', help="Path to execute index on")
def main(path):
    r = list(run_ctags(path))
    print(r)


if __name__ == '__main__':
    main()
# Copyright (C) 2016  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import click
import subprocess

from swh.core import hashutil

from .indexer import BaseIndexer, DiskIndexer


def compute_license(tool, path):
    """Determine license from file at path.

    Args:
        tool: path to the license-detection tool (nomossa) to execute
        path: filepath to determine the license

    Returns:
        A dict with the following keys:
        - licenses ([str]): associated detected licenses to path
        - path (bytes): content filepath

        When the tool produces no output, 'licenses' is an empty list.

    Raises:
        subprocess.CalledProcessError: if the tool exits with a non-zero
            status.

    """
    properties = subprocess.check_output([tool, path],
                                         universal_newlines=True)
    if properties:
        # nomossa's output looks like:
        #   "File <name> contains license(s) <l1>,<l2>"
        res = properties.rstrip().split(' contains license(s) ')
        licenses = res[1].split(',')

        return {
            'licenses': licenses,
            'path': path,
        }

    # Always return a dict so callers can safely .update() the result
    # (the previous implicit None crashed index_content).
    return {
        'licenses': [],
        'path': path,
    }


class ContentFossologyLicenseIndexer(BaseIndexer, DiskIndexer):
    """Indexer in charge of:
    - filtering out content already indexed
    - reading content from objstorage per the content's id (sha1)
    - computing the licenses from that content
    - storing the result in storage

    """
    ADDITIONAL_CONFIG = {
        'workdir': ('str', '/tmp/swh/indexer.fossology.license'),
        # Tool identity (name/version) is persisted alongside results.
        'tool': ('dict', {
            'name': 'nomos',
            'version': '3.1.0rc2-31-ga2cbb8c',
            'command': '/usr/bin/nomossa',
        }),
    }

    CONFIG_BASE_FILENAME = 'indexer/fossology_license'

    def __init__(self):
        super().__init__()
        self.working_directory = self.config['workdir']
        self.tool = self.config['tool']['command']
        self.tool_name = self.config['tool']['name']
        self.tool_version = self.config['tool']['version']

    def filter_contents(self, sha1s):
        """Filter out known sha1s and return only missing ones.

        """
        yield from self.storage.content_fossology_license_missing((
            {
                'id': sha1,
                'tool_name': self.tool_name,
                'tool_version': self.tool_version
            } for sha1 in sha1s
        ))

    def index_content(self, sha1, content):
        """Index sha1s' content and store result.

        Args:
            sha1 (bytes): content's identifier
            content (bytes): raw content in bytes

        Returns:
            A dict, representing a content_license, with keys:
            - id (bytes): content's identifier (sha1)
            - licenses ([str]): detected licenses
            - path (bytes): path
            - tool_name (str), tool_version (str): tool identity

        """
        filename = hashutil.hash_to_hex(sha1)
        content_path = self.write_to_temp(
            filename=filename,
            data=content)

        properties = compute_license(self.tool, path=content_path)
        properties.update({
            'id': sha1,
            'tool_name': self.tool_name,
            'tool_version': self.tool_version,
        })

        self.cleanup(content_path)

        return properties

    def persist_index_computations(self, results, policy_update):
        """Persist the results in storage.

        Args:
            results ([dict]): list of content_license, dict with the
            following keys:
            - id (bytes): content's identifier (sha1)
            - license (bytes): license in bytes
            - path (bytes): path
            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
            respectively update duplicates or ignore them

        """
        wrong_licenses = self.storage.content_fossology_license_add(
            results,
            conflict_update=(policy_update == 'update-dups'))

        if wrong_licenses:
            for l in wrong_licenses:
                self.log.warn('Content %s has some unknown licenses: %s' % (
                    hashutil.hash_to_hex(l['id']),
                    ','.join((name for name in l['licenses'])))
                )


@click.command(help='Compute license for path using tool')
@click.option('--tool', default='nomossa', help="Path to tool")
@click.option('--path', required=True, help="Path to execute index on")
def main(tool, path):
    print(compute_license(tool, path))


if __name__ == '__main__':
    main()
# Copyright (C) 2016  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from pygments.lexers import guess_lexer
from chardet import detect

from .indexer import BaseIndexer


def _cleanup_classname(classname):
    """Normalize a pygments lexer name into a language identifier
    (lowercase, spaces replaced by dashes).

    """
    return classname.lower().replace(' ', '-')


def compute_language(raw_content):
    """Determine the raw content's language.

    Args:
        raw_content (bytes): content to determine raw content

    Returns:
        Dict with keys:
        - lang: None if nothing found or the possible language

    """
    try:
        stats = detect(raw_content)
        encoding = stats['encoding']
        content = raw_content.decode(encoding)
        lang = _cleanup_classname(
            guess_lexer(content).name)

        return {
            'lang': lang
        }
    except Exception:
        # Best effort: any failure (undetectable encoding, decoding
        # error, lexer guess failure) yields "no language" rather than
        # aborting the indexing run.
        return {
            'lang': None
        }


class ContentLanguageIndexer(BaseIndexer):
    """Indexer in charge of:
    - filtering out content already indexed
    - reading content from objstorage per the content's id (sha1)
    - computing the language from that content
    - storing the result in storage

    """
    CONFIG_BASE_FILENAME = 'indexer/language'

    # Tool identity (name/version) is persisted alongside results.
    ADDITIONAL_CONFIG = {
        'tool': ('dict', {
            'name': 'pygments',
            'version': '2.0.1+dfsg-1.1+deb8u1',
        }),
    }

    def __init__(self):
        super().__init__()
        self.tool_name = self.config['tool']['name']
        self.tool_version = self.config['tool']['version']

    def filter_contents(self, sha1s):
        """Filter out known sha1s and return only missing ones.

        """
        yield from self.storage.content_language_missing((
            {
                'id': sha1,
                'tool_name': self.tool_name,
                'tool_version': self.tool_version
            } for sha1 in sha1s
        ))

    def index_content(self, sha1, raw_content):
        """Index sha1s' content and store result.

        Args:
            sha1 (bytes): content's identifier
            raw_content (bytes): raw content in bytes

        Returns:
            A dict, representing a content_language, with keys:
            - id (bytes): content's identifier (sha1)
            - lang (bytes): detected language
            - tool_name (str), tool_version (str): tool identity

        """
        result = compute_language(raw_content)
        result.update({
            'id': sha1,
            'tool_name': self.tool_name,
            'tool_version': self.tool_version,
        })

        return result

    def persist_index_computations(self, results, policy_update):
        """Persist the results in storage.

        Args:
            results ([dict]): list of content_language, dict with the
            following keys:
            - id (bytes): content's identifier (sha1)
            - lang (bytes): detected language
            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
            respectively update duplicates or ignore them

        """
        self.storage.content_language_add(
            results, conflict_update=(policy_update == 'update-dups'))
# Copyright (C) 2016  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import click
import subprocess

from swh.core import hashutil
from swh.scheduler.celery_backend.config import app

from .indexer import BaseIndexer, DiskIndexer


def compute_mimetype_encoding(path):
    """Determine mimetype and encoding from file at path.

    Args:
        path: filepath to determine the mime type

    Returns:
        A dict with 'mimetype' and 'encoding' keys (both bytes).  When
        the `file` command output cannot be parsed, falls back to
        b'application/octet-stream' / b'binary'.

    Raises:
        subprocess.CalledProcessError: if `file` exits with a non-zero
            status.

    """
    cmd = ['file', '--mime', path]
    properties = subprocess.check_output(cmd)
    if properties:
        # `file --mime` output: b"<path>: <mimetype>; charset=<encoding>"
        res = properties.split(b': ')[1].strip().split(b'; ')
        mimetype = res[0]
        encoding = res[1].split(b'=')[1]
        return {
            'mimetype': mimetype,
            'encoding': encoding
        }

    # Always return a dict so callers can safely .update() the result
    # (the previous implicit None crashed index_content).
    return {
        'mimetype': b'application/octet-stream',
        'encoding': b'binary'
    }


class ContentMimetypeIndexer(BaseIndexer, DiskIndexer):
    """Indexer in charge of:
    - filtering out content already indexed
    - reading content from objstorage per the content's id (sha1)
    - computing {mimetype, encoding} from that content
    - storing the result in storage

    """
    ADDITIONAL_CONFIG = {
        'workdir': ('str', '/tmp/swh/indexer.mimetype'),
        # Celery task that receives the ids of text contents (next_step)
        'destination_queue': (
            'str', 'swh.indexer.tasks.SWHOrchestratorTextContentsTask'),
        # Tool identity (name/version) is persisted alongside results.
        'tool': ('dict', {
            'name': 'file',
            'version': '5.22'
        }),
    }

    CONFIG_BASE_FILENAME = 'indexer/mimetype'

    def __init__(self):
        super().__init__()
        self.working_directory = self.config['workdir']
        destination_queue = self.config['destination_queue']
        self.task_destination = app.tasks[destination_queue]
        self.tool_name = self.config['tool']['name']
        self.tool_version = self.config['tool']['version']

    def filter_contents(self, sha1s):
        """Filter out known sha1s and return only missing ones.

        """
        yield from self.storage.content_mimetype_missing((
            {
                'id': sha1,
                'tool_name': self.tool_name,
                'tool_version': self.tool_version
            } for sha1 in sha1s
        ))

    def index_content(self, sha1, content):
        """Index sha1s' content and store result.

        Args:
            sha1 (bytes): content's identifier
            content (bytes): raw content in bytes

        Returns:
            A dict, representing a content_mimetype, with keys:
            - id (bytes): content's identifier (sha1)
            - mimetype (bytes): mimetype in bytes
            - encoding (bytes): encoding in bytes
            - tool_name (str), tool_version (str): tool identity

        """
        filename = hashutil.hash_to_hex(sha1)
        content_path = self.write_to_temp(
            filename=filename,
            data=content)

        properties = compute_mimetype_encoding(content_path)
        properties.update({
            'id': sha1,
            'tool_name': self.tool_name,
            'tool_version': self.tool_version,
        })

        self.cleanup(content_path)

        return properties

    def persist_index_computations(self, results, policy_update):
        """Persist the results in storage.

        Args:
            results ([dict]): list of content_mimetype, dict with the
            following keys:
            - id (bytes): content's identifier (sha1)
            - mimetype (bytes): mimetype in bytes
            - encoding (bytes): encoding in bytes
            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
            respectively update duplicates or ignore them

        """
        self.storage.content_mimetype_add(
            results, conflict_update=(policy_update == 'update-dups'))

    def _filter_text(self, results):
        """Yield the ids of results whose raw content is text (i.e. whose
        encoding is not binary).

        """
        for result in results:
            if b'binary' in result['encoding']:
                continue
            yield result['id']

    def next_step(self, results):
        """When the computations is done, we'd like to send over only text
        contents to the text content orchestrator.

        Args:
            results ([dict]): List of content_mimetype results, dict
            with the following keys:
            - id (bytes): content's identifier (sha1)
            - mimetype (bytes): mimetype in bytes
            - encoding (bytes): encoding in bytes

        """
        self.task_destination.delay(list(self._filter_text(results)))


@click.command()
@click.option('--path', help="Path to execute index on")
def main(path):
    print(compute_mimetype_encoding(path))


if __name__ == '__main__':
    main()