diff --git a/debian/control b/debian/control index 5072015..a2b8550 100644 --- a/debian/control +++ b/debian/control @@ -1,29 +1,30 @@ Source: swh-indexer Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python (>= 2), python3-all, python3-chardet (>= 2.3.0~), python3-click, python3-nose, python3-pygments, + python3-magic, python3-setuptools, python3-swh.core (>= 0.0.27~), python3-swh.model (>= 0.0.15~), python3-swh.objstorage (>= 0.0.13~), python3-swh.scheduler (>= 0.0.14~), python3-swh.storage (>= 0.0.85~), python3-vcversioner Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/78/ Package: python3-swh.indexer Architecture: all Depends: fossology-nomossa (>= 3.1~), python3-swh.scheduler (>= 0.0.14~), universal-ctags (>= 0.8~), ${misc:Depends}, ${python3:Depends} Description: Software Heritage Content Indexer diff --git a/requirements.txt b/requirements.txt index a8e7eca..b97c809 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,9 @@ # Add here external Python modules dependencies, one per line. Module names # should match https://pypi.python.org/pypi names. For the full spec or # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html vcversioner pygments click chardet +file_magic diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py index f2dd87b..6131c03 100644 --- a/swh/indexer/mimetype.py +++ b/swh/indexer/mimetype.py @@ -1,165 +1,152 @@ # Copyright (C) 2016-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import click +import magic -from subprocess import Popen, PIPE from swh.scheduler import utils from .indexer import ContentIndexer def compute_mimetype_encoding(raw_content): """Determine mimetype and encoding from the raw content. Args: raw_content (bytes): content's raw data Returns: - A dict with mimetype and encoding key and corresponding values. + A dict with mimetype and encoding key and corresponding values + (as bytes). """ - if raw_content is b'': - return { - 'mimetype': b'application/x-empty', - 'encoding': b'binary' - } - - with Popen(['file', '--mime', '-'], stdin=PIPE, - stdout=PIPE, stderr=PIPE) as p: - properties, _ = p.communicate(raw_content) - - if properties: - res = properties.split(b': ')[1].strip().split(b'; ') - mimetype = res[0] - encoding = res[1].split(b'=')[1] - return { - 'mimetype': mimetype, - 'encoding': encoding - } + r = magic.detect_from_content(raw_content) + return { + 'mimetype': r.mime_type.encode('utf-8'), + 'encoding': r.encoding.encode('utf-8'), + } class ContentMimetypeIndexer(ContentIndexer): """Indexer in charge of: - filtering out content already indexed - reading content from objstorage per the content's id (sha1) - computing {mimetype, encoding} from that content - store result in storage """ ADDITIONAL_CONFIG = { - # chained queue message, e.g: - # swh.indexer.tasks.SWHOrchestratorTextContentsTask 'destination_queue': ('str', None), 'tools': ('dict', { 'name': 'file', - 'version': '5.22', + 'version': '1:5.30-1+deb9u1', 'configuration': { - 'command_line': 'file --mime ', + "type": "library", + "debian-package": "python3-magic" }, }), } CONFIG_BASE_FILENAME = 'indexer/mimetype' def prepare(self): super().prepare() destination_queue = self.config.get('destination_queue') if destination_queue: self.task_destination = utils.get_task(destination_queue) else: self.task_destination = None self.tools = self.retrieve_tools_information() def filter(self, ids): """Filter out known sha1s and return only missing ones. """ yield from self.storage.content_mimetype_missing(( { 'id': sha1, 'indexer_configuration_id': self.tools['id'], } for sha1 in ids )) def index(self, id, data): """Index sha1s' content and store result. Args: id (bytes): content's identifier data (bytes): raw content in bytes Returns: A dict, representing a content_mimetype, with keys: - id (bytes): content's identifier (sha1) - mimetype (bytes): mimetype in bytes - encoding (bytes): encoding in bytes """ properties = compute_mimetype_encoding(data) properties.update({ 'id': id, 'indexer_configuration_id': self.tools['id'], }) return properties def persist_index_computations(self, results, policy_update): """Persist the results in storage. Args: results ([dict]): list of content_mimetype, dict with the following keys: - id (bytes): content's identifier (sha1) - mimetype (bytes): mimetype in bytes - encoding (bytes): encoding in bytes policy_update ([str]): either 'update-dups' or 'ignore-dups' to respectively update duplicates or ignore them """ self.storage.content_mimetype_add( results, conflict_update=(policy_update == 'update-dups')) def _filter_text(self, results): """Filter sha1 whose raw content is text. """ for result in results: if b'binary' in result['encoding']: continue yield result['id'] def next_step(self, results): """When the computations is done, we'd like to send over only text contents to the text content orchestrator. Args: results ([dict]): List of content_mimetype results, dict with the following keys: - id (bytes): content's identifier (sha1) - mimetype (bytes): mimetype in bytes - encoding (bytes): encoding in bytes """ if self.task_destination: self.task_destination.delay(list(self._filter_text(results))) @click.command() @click.option('--path', help="Path to execute index on") def main(path): with open(path, 'rb') as f: raw_content = f.read() print(compute_mimetype_encoding(raw_content)) if __name__ == '__main__': main() diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py index 7ddfa59..976e291 100644 --- a/swh/indexer/tests/test_mimetype.py +++ b/swh/indexer/tests/test_mimetype.py @@ -1,158 +1,161 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest import logging from nose.tools import istest from swh.indexer.mimetype import ContentMimetypeIndexer from swh.indexer.tests.test_utils import MockObjStorage class _MockStorage(): """Mock storage to simplify reading indexers' outputs. """ def content_mimetype_add(self, mimetypes, conflict_update=None): self.state = mimetypes self.conflict_update = conflict_update def indexer_configuration_get(self, tool): return { 'id': 10, } class TestMimetypeIndexer(ContentMimetypeIndexer): """Specific mimetype whose configuration is enough to satisfy the indexing tests. """ def prepare(self): self.config = { 'destination_queue': None, 'rescheduling_task': None, 'tools': { 'name': 'file', - 'version': '5.22', - 'configuration': 'file --mime ', + 'version': '1:5.30-1+deb9u1', + 'configuration': { + "type": "library", + "debian-package": "python3-magic" + }, }, } self.storage = _MockStorage() self.log = logging.getLogger('swh.indexer') self.objstorage = MockObjStorage() self.task_destination = None self.rescheduling_task = self.config['rescheduling_task'] self.destination_queue = self.config['destination_queue'] self.tools = self.retrieve_tools_information() class TestMimetypeIndexerWrongStorage(TestMimetypeIndexer): """Specific mimetype whose configuration is not enough to satisfy the indexing tests. """ def prepare(self): super().prepare() self.tools = None class TestMimetypeIndexerWithErrors(unittest.TestCase): @istest def test_index_fail_because_wrong_tool(self): try: TestMimetypeIndexerWrongStorage() except ValueError: pass else: self.fail('An error should be raised about wrong tool being used.') class TestMimetypeIndexerTest(unittest.TestCase): def setUp(self): self.indexer = TestMimetypeIndexer() @istest def test_index_no_update(self): # given sha1s = [ '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', '688a5ef812c53907562fe379d4b3851e69c7cb15', ] # when self.indexer.run(sha1s, policy_update='ignore-dups') # then expected_results = [{ 'id': '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', 'indexer_configuration_id': 10, 'mimetype': b'text/plain', 'encoding': b'us-ascii', }, { 'id': '688a5ef812c53907562fe379d4b3851e69c7cb15', 'indexer_configuration_id': 10, 'mimetype': b'text/plain', 'encoding': b'us-ascii', }] self.assertFalse(self.indexer.storage.conflict_update) self.assertEquals(expected_results, self.indexer.storage.state) @istest def test_index_update(self): # given sha1s = [ '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', '688a5ef812c53907562fe379d4b3851e69c7cb15', 'da39a3ee5e6b4b0d3255bfef95601890afd80709', # empty content ] # when self.indexer.run(sha1s, policy_update='update-dups') # then expected_results = [{ 'id': '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', 'indexer_configuration_id': 10, 'mimetype': b'text/plain', 'encoding': b'us-ascii', }, { 'id': '688a5ef812c53907562fe379d4b3851e69c7cb15', 'indexer_configuration_id': 10, 'mimetype': b'text/plain', 'encoding': b'us-ascii', }, { 'id': 'da39a3ee5e6b4b0d3255bfef95601890afd80709', 'indexer_configuration_id': 10, 'mimetype': b'application/x-empty', 'encoding': b'binary', }] self.assertTrue(self.indexer.storage.conflict_update) self.assertEquals(expected_results, self.indexer.storage.state) @istest def test_index_one_unknown_sha1(self): # given sha1s = ['688a5ef812c53907562fe379d4b3851e69c7cb15', '799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown '800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown # when self.indexer.run(sha1s, policy_update='update-dups') # then expected_results = [{ 'id': '688a5ef812c53907562fe379d4b3851e69c7cb15', 'indexer_configuration_id': 10, 'mimetype': b'text/plain', 'encoding': b'us-ascii', }] self.assertTrue(self.indexer.storage.conflict_update) self.assertEquals(expected_results, self.indexer.storage.state)