Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/mimetype.py
# Copyright (C) 2016-2020 The Software Heritage developers | # Copyright (C) 2016-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from typing import Any, Dict, List, Optional, Union | from typing import Any, Dict, List, Optional | ||||
import magic | import magic | ||||
from swh.core.config import merge_configs | from swh.core.config import merge_configs | ||||
from swh.indexer.storage.interface import IndexerStorageInterface, PagedResult, Sha1 | from swh.indexer.storage.interface import IndexerStorageInterface, PagedResult, Sha1 | ||||
from swh.indexer.storage.model import ContentMimetypeRow | from swh.indexer.storage.model import ContentMimetypeRow | ||||
from swh.model.model import Revision | |||||
from .indexer import ContentIndexer, ContentPartitionIndexer | from .indexer import ContentIndexer, ContentPartitionIndexer | ||||
if not hasattr(magic.Magic, "from_buffer"): | if not hasattr(magic.Magic, "from_buffer"): | ||||
raise ImportError( | raise ImportError( | ||||
'Expected "import magic" to import python-magic, but file_magic ' | 'Expected "import magic" to import python-magic, but file_magic ' | ||||
"was imported instead." | "was imported instead." | ||||
) | ) | ||||
▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines | class MixinMimetypeIndexer: | ||||
tool: Any | tool: Any | ||||
idx_storage: IndexerStorageInterface | idx_storage: IndexerStorageInterface | ||||
def __init__(self, *args, **kwargs): | def __init__(self, *args, **kwargs): | ||||
super().__init__(*args, **kwargs) | super().__init__(*args, **kwargs) | ||||
self.config = merge_configs(DEFAULT_CONFIG, self.config) | self.config = merge_configs(DEFAULT_CONFIG, self.config) | ||||
def index( | def index( | ||||
self, id: Union[bytes, Dict, Revision], data: Optional[bytes] = None, **kwargs | self, id: Sha1, data: Optional[bytes] = None, **kwargs | ||||
) -> List[ContentMimetypeRow]: | ) -> List[ContentMimetypeRow]: | ||||
"""Index sha1s' content and store result. | """Index sha1s' content and store result. | ||||
Args: | Args: | ||||
id: content's identifier | id: content's identifier | ||||
data: raw content in bytes | data: raw content in bytes | ||||
Returns: | Returns: | ||||
dict: content's mimetype; dict keys being | dict: content's mimetype; dict keys being | ||||
- id: content's identifier (sha1) | - id: content's identifier (sha1) | ||||
- mimetype: mimetype in bytes | - mimetype: mimetype in bytes | ||||
- encoding: encoding in bytes | - encoding: encoding in bytes | ||||
""" | """ | ||||
assert isinstance(id, bytes) | |||||
assert data is not None | assert data is not None | ||||
properties = compute_mimetype_encoding(data) | properties = compute_mimetype_encoding(data) | ||||
return [ | return [ | ||||
ContentMimetypeRow( | ContentMimetypeRow( | ||||
id=id, | id=id, | ||||
indexer_configuration_id=self.tool["id"], | indexer_configuration_id=self.tool["id"], | ||||
mimetype=properties["mimetype"], | mimetype=properties["mimetype"], | ||||
encoding=properties["encoding"], | encoding=properties["encoding"], | ||||
▲ Show 20 Lines • Show All 76 Lines • Show Last 20 Lines |