diff --git a/swh/indexer/__init__.py b/swh/indexer/__init__.py index 90e662a..5b71c6d 100644 --- a/swh/indexer/__init__.py +++ b/swh/indexer/__init__.py @@ -1,25 +1,26 @@ # Copyright (C) 2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from .file_properties import ContentMimetypeIndexer +from .mimetype import ContentMimetypeIndexer from .language import ContentLanguageIndexer INDEXER_CLASSES = { 'mimetype': ContentMimetypeIndexer, 'language': ContentLanguageIndexer, } TASK_NAMES = { 'orchestrator': 'swh.indexer.tasks.SWHOrchestratorTask', 'mimetype': 'swh.indexer.tasks.SWHContentMimetypeTask', 'language': 'swh.indexer.tasks.SWHContentLanguageTask', } __all__ = [ - 'INDEXER_CLASSES', 'TASK_NAMES' + 'INDEXER_CLASSES', 'TASK_NAMES', 'ContentMimetypeIndexer', + 'ContentLanguageIndexer' ] diff --git a/swh/indexer/file_properties.py b/swh/indexer/mimetype.py similarity index 96% rename from swh/indexer/file_properties.py rename to swh/indexer/mimetype.py index a570cbf..9a28ebd 100644 --- a/swh/indexer/file_properties.py +++ b/swh/indexer/mimetype.py @@ -1,107 +1,109 @@ # Copyright (C) 2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click import subprocess from swh.core import hashutil from .indexer import BaseIndexer, DiskIndexer def compute_mimetype_encoding(path): """Determine mimetype and encoding from file at path. Args: path: filepath to determine the mime type Returns: A dict with mimetype and encoding key and corresponding values. """ cmd = ['file', '--mime-type', '--mime-encoding', path] properties = subprocess.check_output(cmd) if properties: res = properties.split(b': ')[1].strip().split(b'; ') mimetype = res[0] encoding = res[1].split(b'=')[1] return { 'mimetype': mimetype, 'encoding': encoding } class ContentMimetypeIndexer(BaseIndexer, DiskIndexer): """Indexer in charge of: - filtering out content already indexed - reading content from objstorage per the content's id (sha1) - computing {mimetype, encoding} from that content - store result in storage """ ADDITIONAL_CONFIG = { - 'workdir': ('str', '/tmp/swh/worker.file.properties'), + 'workdir': ('str', '/tmp/swh/indexer.mimetype'), } + CONFIG_BASE_FILENAME = 'indexer/mimetype' + def __init__(self): super().__init__() self.working_directory = self.config['workdir'] def filter_contents(self, sha1s): """Filter out known sha1s and return only missing ones. """ yield from self.storage.content_mimetype_missing(sha1s) def index_content(self, sha1, content): """Index sha1s' content and store result. Args: sha1 (bytes): content's identifier content (bytes): raw content in bytes Returns: A dict, representing a content_mimetype, with keys: - id (bytes): content's identifier (sha1) - mimetype (bytes): mimetype in bytes - encoding (bytes): encoding in bytes """ filename = hashutil.hash_to_hex(sha1) content_path = self.write_to_temp( filename=filename, data=content) properties = compute_mimetype_encoding(content_path) properties.update({ 'id': sha1, }) self.cleanup(content_path) return properties def persist_index_computations(self, results): """Persist the results in storage. Args: results ([dict]): list of content_mimetype, dict with the following keys: - id (bytes): content's identifier (sha1) - mimetype (bytes): mimetype in bytes - encoding (bytes): encoding in bytes """ self.storage.content_mimetype_add(results) @click.command() @click.option('--path', help="Path to execute index on") def main(path): print(compute_mimetype_encoding(path)) if __name__ == '__main__': main() diff --git a/swh/indexer/tasks.py b/swh/indexer/tasks.py index 957a857..914e886 100644 --- a/swh/indexer/tasks.py +++ b/swh/indexer/tasks.py @@ -1,41 +1,40 @@ # Copyright (C) 2016 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.scheduler.task import Task from .orchestrator import OrchestratorIndexer -from .file_properties import ContentMimetypeIndexer -from .language import ContentLanguageIndexer +from . import ContentMimetypeIndexer, ContentLanguageIndexer class SWHOrchestratorTask(Task): """Main task in charge of reading messages and broadcasting them back to other tasks. """ task_queue = 'swh_indexer_orchestrator' def run(self, *args, **kwargs): OrchestratorIndexer().run(*args, **kwargs) class SWHContentMimetypeTask(Task): """Task which computes the mimetype, encoding from the sha1's content. """ task_queue = 'swh_indexer_content_mimetype' def run(self, *args, **kwargs): ContentMimetypeIndexer().run(*args, **kwargs) class SWHContentLanguageTask(Task): """Task which computes the language from the sha1's content. """ task_queue = 'swh_indexer_content_language' def run(self, *args, **kwargs): ContentLanguageIndexer().run(*args, **kwargs)