Changeset View
Changeset View
Standalone View
Standalone View
swh/indexer/mimetype.py
# Copyright (C) 2016-2018 The Software Heritage developers | # Copyright (C) 2016-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from typing import Optional, Dict, Any, List | |||||
import magic | import magic | ||||
from typing import Optional | |||||
from .indexer import ContentIndexer, ContentRangeIndexer | from .indexer import ContentIndexer, ContentRangeIndexer | ||||
if not hasattr(magic.Magic, 'from_buffer'): | if not hasattr(magic.Magic, 'from_buffer'): | ||||
raise ImportError( | raise ImportError( | ||||
'Expected "import magic" to import python-magic, but file_magic ' | 'Expected "import magic" to import python-magic, but file_magic ' | ||||
'was imported instead.') | 'was imported instead.') | ||||
def compute_mimetype_encoding(raw_content): | def compute_mimetype_encoding(raw_content: bytes) -> Dict[str, bytes]: | ||||
"""Determine mimetype and encoding from the raw content. | """Determine mimetype and encoding from the raw content. | ||||
Args: | Args: | ||||
raw_content (bytes): content's raw data | raw_content: content's raw data | ||||
Returns: | Returns: | ||||
dict: mimetype and encoding key and corresponding values | dict: mimetype and encoding key and corresponding values. | ||||
(as bytes). | |||||
""" | """ | ||||
m = magic.Magic(mime=True, mime_encoding=True) | m = magic.Magic(mime=True, mime_encoding=True) | ||||
res = m.from_buffer(raw_content) | res = m.from_buffer(raw_content) | ||||
(mimetype, encoding) = res.split('; charset=') | (mimetype, encoding) = res.split('; charset=') | ||||
return { | return { | ||||
'mimetype': mimetype, | 'mimetype': mimetype, | ||||
'encoding': encoding, | 'encoding': encoding, | ||||
} | } | ||||
class MixinMimetypeIndexer: | class MixinMimetypeIndexer: | ||||
"""Mixin mimetype indexer. | """Mixin mimetype indexer. | ||||
ardumont: Please, below the docstring | |||||
Done Inline Actions — vlorentz: Sorry I didn't notice this sooner, but attributes must be after the docstring, else the docstring is associated with the last attribute. | |||||
See :class:`MimetypeIndexer` and :class:`MimetypeRangeIndexer` | See :class:`MimetypeIndexer` and :class:`MimetypeRangeIndexer` | ||||
""" | """ | ||||
tool: Dict[str, Any] | |||||
idx_storage: Any | |||||
ADDITIONAL_CONFIG = { | ADDITIONAL_CONFIG = { | ||||
'tools': ('dict', { | 'tools': ('dict', { | ||||
'name': 'file', | 'name': 'file', | ||||
'version': '1:5.30-1+deb9u1', | 'version': '1:5.30-1+deb9u1', | ||||
'configuration': { | 'configuration': { | ||||
"type": "library", | "type": "library", | ||||
"debian-package": "python3-magic" | "debian-package": "python3-magic" | ||||
}, | }, | ||||
}), | }), | ||||
'write_batch_size': ('int', 1000), | 'write_batch_size': ('int', 1000), | ||||
} | } | ||||
CONFIG_BASE_FILENAME = 'indexer/mimetype' # type: Optional[str] | CONFIG_BASE_FILENAME = 'indexer/mimetype' # type: Optional[str] | ||||
def index(self, id, data): | def index(self, id: bytes, data: bytes) -> Dict[str, Any]: | ||||
"""Index sha1s' content and store result. | """Index sha1s' content and store result. | ||||
Args: | Args: | ||||
id (bytes): content's identifier | id: content's identifier | ||||
data (bytes): raw content in bytes | data: raw content in bytes | ||||
Returns: | Returns: | ||||
dict: content's mimetype; dict keys being | dict: content's mimetype; dict keys being | ||||
- **id** (bytes): content's identifier (sha1) | - id: content's identifier (sha1) | ||||
- **mimetype** (bytes): mimetype in bytes | - mimetype: mimetype in bytes | ||||
- **encoding** (bytes): encoding in bytes | - encoding: encoding in bytes | ||||
""" | """ | ||||
properties = compute_mimetype_encoding(data) | properties = compute_mimetype_encoding(data) | ||||
properties.update({ | properties.update({ | ||||
'id': id, | 'id': id, | ||||
'indexer_configuration_id': self.tool['id'], | 'indexer_configuration_id': self.tool['id'], | ||||
}) | }) | ||||
return properties | return properties | ||||
def persist_index_computations(self, results, policy_update): | def persist_index_computations( | ||||
self, results: List[Dict], policy_update: List[str] | |||||
) -> None: | |||||
"""Persist the results in storage. | """Persist the results in storage. | ||||
Args: | Args: | ||||
results ([dict]): list of content's mimetype dicts | results: list of content's mimetype dicts | ||||
(see :meth:`.index`) | (see :meth:`.index`) | ||||
policy_update ([str]): either 'update-dups' or 'ignore-dups' to | policy_update: either 'update-dups' or 'ignore-dups' to | ||||
respectively update duplicates or ignore them | respectively update duplicates or ignore them | ||||
""" | """ | ||||
self.idx_storage.content_mimetype_add( | self.idx_storage.content_mimetype_add( | ||||
results, conflict_update=(policy_update == 'update-dups')) | results, conflict_update=(policy_update == 'update-dups')) | ||||
class MimetypeIndexer(MixinMimetypeIndexer, ContentIndexer): | class MimetypeIndexer(MixinMimetypeIndexer, ContentIndexer): | ||||
"""Mimetype Indexer working on list of content identifiers. | """Mimetype Indexer working on list of content identifiers. | ||||
It: | It: | ||||
- (optionally) filters out content already indexed (cf. | - (optionally) filters out content already indexed (cf. | ||||
:meth:`.filter`) | :meth:`.filter`) | ||||
- reads content from objstorage per the content's id (sha1) | - reads content from objstorage per the content's id (sha1) | ||||
- computes {mimetype, encoding} from that content | - computes {mimetype, encoding} from that content | ||||
- stores result in storage | - stores result in storage | ||||
""" | """ | ||||
def filter(self, ids): | def filter(self, ids): | ||||
"""Filter out known sha1s and return only missing ones. | """Filter out known sha1s and return only missing ones. | ||||
Done Inline Actions — vlorentz: Break line before `)`, so the return type fits on a single line | |||||
""" | """ | ||||
yield from self.idx_storage.content_mimetype_missing(( | yield from self.idx_storage.content_mimetype_missing(( | ||||
{ | { | ||||
'id': sha1, | 'id': sha1, | ||||
'indexer_configuration_id': self.tool['id'], | 'indexer_configuration_id': self.tool['id'], | ||||
} for sha1 in ids | } for sha1 in ids | ||||
)) | )) | ||||
class MimetypeRangeIndexer(MixinMimetypeIndexer, ContentRangeIndexer): | class MimetypeRangeIndexer(MixinMimetypeIndexer, ContentRangeIndexer): | ||||
"""Mimetype Range Indexer working on range of content identifiers. | """Mimetype Range Indexer working on range of content identifiers. | ||||
It: | It: | ||||
- (optionally) filters out content already indexed (cf | - (optionally) filters out content already indexed (cf | ||||
:meth:`.indexed_contents_in_range`) | :meth:`.indexed_contents_in_range`) | ||||
- reads content from objstorage per the content's id (sha1) | - reads content from objstorage per the content's id (sha1) | ||||
- computes {mimetype, encoding} from that content | - computes {mimetype, encoding} from that content | ||||
- stores result in storage | - stores result in storage | ||||
""" | """ | ||||
def indexed_contents_in_range(self, start, end): | |||||
def indexed_contents_in_range( | |||||
self, start: bytes, end: bytes | |||||
Done Inline Actions — ardumont: `str` without the quote :) | |||||
) -> Dict[str, Optional[bytes]]: | |||||
Done Inline Actions — vlorentz: Our code style is to not have too many lines if possible. | |||||
"""Retrieve indexed content id within range [start, end]. | """Retrieve indexed content id within range [start, end]. | ||||
Args: | Args: | ||||
start (bytes): Starting bound from range identifier | start: Starting bound from range identifier | ||||
end (bytes): End range identifier | end: End range identifier | ||||
Returns: | Returns: | ||||
dict: a dict with keys: | dict: a dict with keys: | ||||
- **ids** [bytes]: iterable of content ids within the range. | - ids: iterable of content ids within the range. | ||||
- **next** (Optional[bytes]): The next range of sha1 starts at | - next: The next range of sha1 starts at | ||||
this sha1 if any | this sha1 if any | ||||
""" | """ | ||||
return self.idx_storage.content_mimetype_get_range( | return self.idx_storage.content_mimetype_get_range( | ||||
start, end, self.tool['id']) | start, end, self.tool['id']) |
Please, below the docstring