diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ vcversioner click -file-magic +python-magic >= 0.4.13 pyld xmltodict diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py --- a/swh/indexer/mimetype.py +++ b/swh/indexer/mimetype.py @@ -5,10 +5,13 @@ import magic -from swh.model import hashutil - from .indexer import ContentIndexer, ContentRangeIndexer +if not hasattr(magic.Magic, 'from_buffer'): + raise ImportError( + 'Expected "import magic" to import python-magic, but file_magic ' + 'was imported instead.') + def compute_mimetype_encoding(raw_content): """Determine mimetype and encoding from the raw content. @@ -21,10 +24,12 @@ (as bytes). """ - r = magic.detect_from_content(raw_content) + m = magic.Magic(mime=True, mime_encoding=True) + res = m.from_buffer(raw_content) + (mimetype, encoding) = res.split('; charset=') return { - 'mimetype': r.mime_type, - 'encoding': r.encoding, + 'mimetype': mimetype, + 'encoding': encoding, } @@ -63,17 +68,11 @@ - **encoding** (bytes): encoding in bytes """ - try: - properties = compute_mimetype_encoding(data) - properties.update({ - 'id': id, - 'indexer_configuration_id': self.tool['id'], - }) - except TypeError: - self.log.error('Detecting mimetype error for id %s' % ( - hashutil.hash_to_hex(id), )) - return None - + properties = compute_mimetype_encoding(data) + properties.update({ + 'id': id, + 'indexer_configuration_id': self.tool['id'], + }) return properties def persist_index_computations(self, results, policy_update):