diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 vcversioner
 click
-file-magic
+python-magic >= 0.4.13
 pyld
 xmltodict
diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py
--- a/swh/indexer/mimetype.py
+++ b/swh/indexer/mimetype.py
@@ -3,13 +3,24 @@
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
-import magic
+import pkgutil
 
-from swh.model import hashutil
+import pkg_resources
 
 from .indexer import ContentIndexer, ContentRangeIndexer
 
 
+def _import_python_magic():
+    """Imports python-magic (NOT file_magic; both are made available
+    with 'import magic')"""
+    magic_dist = pkg_resources.get_distribution('python-magic')
+    magic_importer = pkgutil.get_importer(magic_dist.module_path)
+    return magic_importer.find_module('magic').load_module()
+
+
+magic = _import_python_magic()
+
+
 def compute_mimetype_encoding(raw_content):
     """Determine mimetype and encoding from the raw content.
 
@@ -21,10 +32,14 @@
         (as bytes).
 
     """
-    r = magic.detect_from_content(raw_content)
+    m = magic.Magic(mime=True, mime_encoding=True)
+    res = m.from_buffer(raw_content)
+    # partition() (unlike split + unpacking) cannot raise ValueError when
+    # the '; charset=' separator is absent; encoding is then ''.
+    mimetype, _, encoding = res.partition('; charset=')
     return {
-        'mimetype': r.mime_type,
-        'encoding': r.encoding,
+        'mimetype': mimetype,
+        'encoding': encoding,
     }
 
 
@@ -63,17 +78,11 @@
             - **encoding** (bytes): encoding in bytes
 
         """
-        try:
-            properties = compute_mimetype_encoding(data)
-            properties.update({
-                'id': id,
-                'indexer_configuration_id': self.tool['id'],
-            })
-        except TypeError:
-            self.log.error('Detecting mimetype error for id %s' % (
-                hashutil.hash_to_hex(id), ))
-            return None
-
+        properties = compute_mimetype_encoding(data)
+        properties.update({
+            'id': id,
+            'indexer_configuration_id': self.tool['id'],
+        })
         return properties
 
     def persist_index_computations(self, results, policy_update):