Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/mimetype.py
# Copyright (C) 2016-2017 The Software Heritage developers | # Copyright (C) 2016-2018 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import click | import click | ||||
import magic | import magic | ||||
from swh.model import hashutil | from swh.model import hashutil | ||||
from swh.scheduler import get_scheduler | from swh.scheduler import get_scheduler | ||||
from swh.scheduler.utils import create_task_dict | |||||
from .indexer import ContentIndexer | from .indexer import ContentIndexer | ||||
def compute_mimetype_encoding(raw_content): | def compute_mimetype_encoding(raw_content): | ||||
"""Determine mimetype and encoding from the raw content. | """Determine mimetype and encoding from the raw content. | ||||
Args: | Args: | ||||
Show All 22 Lines | class ContentMimetypeIndexer(ContentIndexer): | ||||
""" | """ | ||||
ADDITIONAL_CONFIG = { | ADDITIONAL_CONFIG = { | ||||
'scheduler': { | 'scheduler': { | ||||
'cls': 'remote', | 'cls': 'remote', | ||||
'args': { | 'args': { | ||||
'url': 'http://localhost:5008', | 'url': 'http://localhost:5008', | ||||
}, | }, | ||||
}, | }, | ||||
'destination_task': ('str', None), | |||||
'tools': ('dict', { | 'tools': ('dict', { | ||||
'name': 'file', | 'name': 'file', | ||||
'version': '1:5.30-1+deb9u1', | 'version': '1:5.30-1+deb9u1', | ||||
'configuration': { | 'configuration': { | ||||
"type": "library", | "type": "library", | ||||
"debian-package": "python3-magic" | "debian-package": "python3-magic" | ||||
}, | }, | ||||
}), | }), | ||||
} | } | ||||
CONFIG_BASE_FILENAME = 'indexer/mimetype' | CONFIG_BASE_FILENAME = 'indexer/mimetype' | ||||
def prepare(self):
    """Prepare the indexer, then set the mimetype-specific attributes.

    After the parent class preparation, this reads the optional
    'destination_task' configuration entry, instantiates the scheduler
    from the 'scheduler' configuration entry and keeps the first
    entry of ``self.tools`` as the tool used by this indexer.

    """
    super().prepare()
    config = self.config
    self.destination_task = config.get('destination_task')
    scheduler_config = config['scheduler']
    self.scheduler = get_scheduler(**scheduler_config)
    self.tool = self.tools[0]
def filter(self, ids): | def filter(self, ids): | ||||
"""Filter out known sha1s and return only missing ones. | """Filter out known sha1s and return only missing ones. | ||||
""" | """ | ||||
yield from self.idx_storage.content_mimetype_missing(( | yield from self.idx_storage.content_mimetype_missing(( | ||||
▲ Show 20 Lines • Show All 44 Lines • ▼ Show 20 Lines | def persist_index_computations(self, results, policy_update): | ||||
policy_update ([str]): either 'update-dups' or 'ignore-dups' to | policy_update ([str]): either 'update-dups' or 'ignore-dups' to | ||||
respectively update duplicates or ignore them | respectively update duplicates or ignore them | ||||
""" | """ | ||||
self.idx_storage.content_mimetype_add( | self.idx_storage.content_mimetype_add( | ||||
results, conflict_update=(policy_update == 'update-dups')) | results, conflict_update=(policy_update == 'update-dups')) | ||||
def _filter_text(self, results): | |||||
"""Filter sha1 whose raw content is text. | |||||
""" | |||||
for result in results: | |||||
if b'binary' in result['encoding']: | |||||
continue | |||||
ardumont: as before [1]
[1] https://forge.softwareheritage.org/D677#inline-3602 | |||||
yield result['id'] | |||||
def next_step(self, results):
    """Once the computations are done, send only the text contents
    over to the text content orchestrator.

    Nothing is scheduled when no destination task is configured, or
    when none of the results is a text content (this avoids creating
    a task carrying an empty list of identifiers).

    Args:
        results ([dict]): List of content_mimetype results, dict
          with the following keys:
            - id (bytes): content's identifier (sha1)
            - mimetype (bytes): mimetype in bytes
            - encoding (bytes): encoding in bytes

    """
    if not self.destination_task:
        return

    # Invariant check only: prepare() is what sets self.scheduler.
    assert self.scheduler

    text_ids = list(self._filter_text(results))
    if not text_ids:
        # No text content in this batch: nothing to forward.
        return

    task = create_task_dict(self.destination_task, 'oneshot', text_ids)
    self.scheduler.create_tasks([task])
@click.command()
@click.option('--path', required=True, help="Path to execute index on")
def main(path):
    """Print the mimetype and encoding determined for a file's content.

    The option is required: without it, `path` would be None and
    opening it would fail with an opaque TypeError instead of a
    proper usage error.

    Args:
        path (str): path to the file whose raw content is read (in
          binary mode) and passed to compute_mimetype_encoding

    """
    with open(path, 'rb') as f:
        raw_content = f.read()

    print(compute_mimetype_encoding(raw_content))
if __name__ == '__main__': | if __name__ == '__main__': | ||||
main() | main() |
as before [1]
[1] https://forge.softwareheritage.org/D677#inline-3602