Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/mimetype.py
# Copyright (C) 2016-2018 The Software Heritage developers | # Copyright (C) 2016-2018 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import magic | import magic | ||||
from swh.model import hashutil | |||||
from .indexer import ContentIndexer, ContentRangeIndexer | from .indexer import ContentIndexer, ContentRangeIndexer | ||||
if not hasattr(magic.Magic, 'from_buffer'): | |||||
raise ImportError( | |||||
'Expected "import magic" to import python-magic, but file_magic ' | |||||
'was imported instead.') | |||||
def compute_mimetype_encoding(raw_content): | def compute_mimetype_encoding(raw_content): | ||||
"""Determine mimetype and encoding from the raw content. | """Determine mimetype and encoding from the raw content. | ||||
Args: | Args: | ||||
raw_content (bytes): content's raw data | raw_content (bytes): content's raw data | ||||
olasd: If they're both available with `import magic`, they're incompatible and shouldn't really be installed at the same time. | |||||
Returns: | Returns: | ||||
dict: mimetype and encoding key and corresponding values | dict: mimetype and encoding key and corresponding values | ||||
(as bytes). | (as bytes). | ||||
""" | """ | ||||
r = magic.detect_from_content(raw_content) | m = magic.Magic(mime=True, mime_encoding=True) | ||||
res = m.from_buffer(raw_content) | |||||
(mimetype, encoding) = res.split('; charset=') | |||||
return { | return { | ||||
'mimetype': r.mime_type, | 'mimetype': mimetype, | ||||
'encoding': r.encoding, | 'encoding': encoding, | ||||
} | } | ||||
Not Done Inline Actions — With this, the tool registration mechanism must change. With this, that becomes dynamic. Right now, the configuration is the default one, which references python3-magic [1]. [1] https://forge.softwareheritage.org/source/swh-indexer/browse/master/swh/indexer/mimetype.py$38-44 ardumont: With this, the tool registration mechanism must change.
Right now, the tool used is declared in… | |||||
Done Inline Actions — Agreed. Let's put this diff on hold while we migrate to introspecting tools. vlorentz: Agreed. Let's put this diff on hold while we migrate to introspecting tools. | |||||
class MixinMimetypeIndexer: | class MixinMimetypeIndexer: | ||||
"""Mixin mimetype indexer. | """Mixin mimetype indexer. | ||||
See :class:`MimetypeIndexer` and :class:`MimetypeRangeIndexer` | See :class:`MimetypeIndexer` and :class:`MimetypeRangeIndexer` | ||||
""" | """ | ||||
ADDITIONAL_CONFIG = { | ADDITIONAL_CONFIG = { | ||||
Show All 24 Lines | def index(self, id, data): | ||||
Returns: | Returns: | ||||
dict: content's mimetype; dict keys being | dict: content's mimetype; dict keys being | ||||
- **id** (bytes): content's identifier (sha1) | - **id** (bytes): content's identifier (sha1) | ||||
- **mimetype** (bytes): mimetype in bytes | - **mimetype** (bytes): mimetype in bytes | ||||
- **encoding** (bytes): encoding in bytes | - **encoding** (bytes): encoding in bytes | ||||
""" | """ | ||||
try: | |||||
properties = compute_mimetype_encoding(data) | properties = compute_mimetype_encoding(data) | ||||
properties.update({ | properties.update({ | ||||
'id': id, | 'id': id, | ||||
'indexer_configuration_id': self.tool['id'], | 'indexer_configuration_id': self.tool['id'], | ||||
}) | }) | ||||
except TypeError: | |||||
self.log.error('Detecting mimetype error for id %s' % ( | |||||
hashutil.hash_to_hex(id), )) | |||||
return None | |||||
return properties | return properties | ||||
def persist_index_computations(self, results, policy_update): | def persist_index_computations(self, results, policy_update): | ||||
"""Persist the results in storage. | """Persist the results in storage. | ||||
Args: | Args: | ||||
results ([dict]): list of content's mimetype dicts | results ([dict]): list of content's mimetype dicts | ||||
(see :meth:`.index`) | (see :meth:`.index`) | ||||
▲ Show 20 Lines • Show All 62 Lines • Show Last 20 Lines |
If they're both available with `import magic`, they're incompatible and shouldn't really be installed at the same time.
Could we just check that the magic module has an attribute that's only available in the version we want, and fail importing if it doesn't?