diff --git a/debian/control b/debian/control --- a/debian/control +++ b/debian/control @@ -10,7 +10,7 @@ python3-hypothesis (>= 3.11.0~), python3-pytest, python3-pygments, - python3-magic, + python3-magic (>= 2:0.4.3), python3-pyld, python3-setuptools, python3-swh.core (>= 0.0.44~), diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,6 @@ pygments click chardet -file_magic +python-magic pyld xmltodict diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py --- a/swh/indexer/mimetype.py +++ b/swh/indexer/mimetype.py @@ -3,13 +3,24 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import magic +import pkgutil -from swh.model import hashutil +import pkg_resources from .indexer import ContentIndexer, ContentRangeIndexer +def _import_python_magic(): + """Imports python-magic (NOT file_magic; both are made available + with 'import magic')""" + magic_dist = pkg_resources.get_distribution('python-magic') + magic_importer = pkgutil.get_importer(magic_dist.module_path) + return magic_importer.find_module('magic').load_module() + + +magic = _import_python_magic() + + def compute_mimetype_encoding(raw_content): """Determine mimetype and encoding from the raw content. @@ -21,10 +32,12 @@ (as bytes). """ - r = magic.detect_from_content(raw_content) + m = magic.Magic(mime=True, mime_encoding=True) + res = m.from_buffer(raw_content) + (mimetype, encoding) = res.split('; charset=') return { - 'mimetype': r.mime_type, - 'encoding': r.encoding, + 'mimetype': mimetype, + 'encoding': encoding, } @@ -67,17 +80,11 @@ - **encoding** (bytes): encoding in bytes """ - try: - properties = compute_mimetype_encoding(data) - properties.update({ - 'id': id, - 'indexer_configuration_id': self.tool['id'], - }) - except TypeError: - self.log.error('Detecting mimetype error for id %s' % ( - hashutil.hash_to_hex(id), )) - return None - + properties = compute_mimetype_encoding(data) + properties.update({ + 'id': id, + 'indexer_configuration_id': self.tool['id'], + }) return properties def persist_index_computations(self, results, policy_update): diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py --- a/swh/indexer/origin_head.py +++ b/swh/indexer/origin_head.py @@ -29,8 +29,8 @@ 'configuration': {}, }), 'tasks': ('dict', { - 'revision_metadata': 'revision_metadata', - 'origin_intrinsic_metadata': 'origin_metadata', + 'revision_metadata': 'indexer_revision_metadata', + 'origin_intrinsic_metadata': 'indexer_origin_metadata', }) }