diff --git a/debian/control b/debian/control --- a/debian/control +++ b/debian/control @@ -10,7 +10,7 @@ python3-hypothesis (>= 3.11.0~), python3-pytest, python3-pygments, - python3-magic, + python3-magic (>= 2:0.4.13), python3-pyld, python3-setuptools, python3-swh.core (>= 0.0.44~), diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,6 @@ pygments click chardet -file_magic +python-magic >= 0.4.13 pyld xmltodict diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py --- a/swh/indexer/mimetype.py +++ b/swh/indexer/mimetype.py @@ -5,10 +5,13 @@ import magic -from swh.model import hashutil - from .indexer import ContentIndexer, ContentRangeIndexer +if not hasattr(magic.Magic, 'from_buffer'): + raise ImportError( + 'Expected "import magic" to import python-magic, but file_magic ' + 'was imported instead.') + def compute_mimetype_encoding(raw_content): """Determine mimetype and encoding from the raw content. @@ -21,10 +24,12 @@ (as bytes). """ - r = magic.detect_from_content(raw_content) + m = magic.Magic(mime=True, mime_encoding=True) + res = m.from_buffer(raw_content) + (mimetype, encoding) = res.split('; charset=') return { - 'mimetype': r.mime_type, - 'encoding': r.encoding, + 'mimetype': mimetype, + 'encoding': encoding, } @@ -67,17 +72,11 @@ - **encoding** (bytes): encoding in bytes """ - try: - properties = compute_mimetype_encoding(data) - properties.update({ - 'id': id, - 'indexer_configuration_id': self.tool['id'], - }) - except TypeError: - self.log.error('Detecting mimetype error for id %s' % ( - hashutil.hash_to_hex(id), )) - return None - + properties = compute_mimetype_encoding(data) + properties.update({ + 'id': id, + 'indexer_configuration_id': self.tool['id'], + }) return properties def persist_index_computations(self, results, policy_update): diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py --- a/swh/indexer/origin_head.py +++ b/swh/indexer/origin_head.py @@ -29,8 +29,8 @@ 'configuration': {}, }), 'tasks': ('dict', { - 'revision_metadata': 'revision_metadata', - 'origin_intrinsic_metadata': 'origin_metadata', + 'revision_metadata': 'indexer_revision_metadata', + 'origin_intrinsic_metadata': 'indexer_origin_metadata', }) }