diff --git a/swh/indexer/__init__.py b/swh/indexer/__init__.py index 014c42b..90e662a 100644 --- a/swh/indexer/__init__.py +++ b/swh/indexer/__init__.py @@ -1,22 +1,25 @@ # Copyright (C) 2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from .file_properties import ContentMimetypeIndexer +from .language import ContentLanguageIndexer INDEXER_CLASSES = { 'mimetype': ContentMimetypeIndexer, + 'language': ContentLanguageIndexer, } TASK_NAMES = { 'orchestrator': 'swh.indexer.tasks.SWHOrchestratorTask', 'mimetype': 'swh.indexer.tasks.SWHContentMimetypeTask', + 'language': 'swh.indexer.tasks.SWHContentLanguageTask', } __all__ = [ 'INDEXER_CLASSES', 'TASK_NAMES' ] diff --git a/swh/indexer/language.py b/swh/indexer/language.py index 2a7855c..eba2cd5 100644 --- a/swh/indexer/language.py +++ b/swh/indexer/language.py @@ -1,48 +1,100 @@ # Copyright (C) 2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from pygments.lexers import guess_lexer -from pygments.util import ClassNotFound from chardet import detect +from .indexer import BaseIndexer -def cleanup_classname(classname): + +def _cleanup_classname(classname): """Determine the language from the pygments' lexer names. """ return classname.lower().replace(' ', '-') -def run_language(raw_content): +def compute_language(raw_content): """Determine the raw content's language. 
Args: raw_content (bytes): content to determine raw content Returns: Dict with keys: - lang: None if nothing found or the possible language - decoding_failure: True if a decoding failure happened """ try: - encoding = detect(raw_content)['encoding'] + stats = detect(raw_content) + encoding = stats['encoding'] content = raw_content.decode(encoding) - lang = cleanup_classname( + lang = _cleanup_classname( guess_lexer(content).name) - return { 'lang': lang } - except ClassNotFound as e: - return { - 'lang': None - } - except LookupError as e: # Unknown encoding + except Exception: return { - 'decoding_failure': True, 'lang': None } + + +class ContentLanguageIndexer(BaseIndexer): + """Indexer in charge of: + - filtering out content already indexed + - reading content from objstorage per the content's id (sha1) + - computing the language of that content + - storing the result in storage + + """ + ADDITIONAL_CONFIG = { + 'workdir': ('str', '/tmp/swh/worker.file.properties'), + } + + def __init__(self): + super().__init__() + self.working_directory = self.config['workdir'] + + def filter_contents(self, sha1s): + """Filter out known sha1s and return only missing ones. + + """ + yield from self.storage.content_language_missing(sha1s) + + def index_content(self, sha1, raw_content): + """Compute the language of the content identified by sha1. + + Args: + sha1 (bytes): content's identifier + raw_content (bytes): raw content in bytes + + Returns: + A dict, representing a content_language, with keys: + - id (bytes): content's identifier (sha1) + - lang (str or None): detected language, None if detection failed + + """ + result = compute_language(raw_content) + result.update({ + 'id': sha1, + }) + + return result + + def persist_index_computations(self, results): + """Persist the results in storage. 
+ + Args: + + results ([dict]): list of content_language, dict with the + following keys: + - id (bytes): content's identifier (sha1) + - lang (str or None): detected language, None if detection failed + + """ + self.storage.content_language_add(results) diff --git a/swh/indexer/tasks.py b/swh/indexer/tasks.py index a487ad8..957a857 100644 --- a/swh/indexer/tasks.py +++ b/swh/indexer/tasks.py @@ -1,30 +1,41 @@ # Copyright (C) 2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.scheduler.task import Task from .orchestrator import OrchestratorIndexer from .file_properties import ContentMimetypeIndexer +from .language import ContentLanguageIndexer class SWHOrchestratorTask(Task): """Main task in charge of reading messages and broadcasting them back to other tasks. """ task_queue = 'swh_indexer_orchestrator' def run(self, *args, **kwargs): OrchestratorIndexer().run(*args, **kwargs) class SWHContentMimetypeTask(Task): """Task which computes the mimetype, encoding from the sha1's content. """ task_queue = 'swh_indexer_content_mimetype' def run(self, *args, **kwargs): ContentMimetypeIndexer().run(*args, **kwargs) + + +class SWHContentLanguageTask(Task): + """Task which computes the language from the sha1's content. + + """ + task_queue = 'swh_indexer_content_language' + + def run(self, *args, **kwargs): + ContentLanguageIndexer().run(*args, **kwargs)