Differential D2622 Diff 9469 swh/indexer/mimetype.py

Changeset View

Standalone View

swh/indexer/mimetype.py

	# Copyright (C) 2016-2018 The Software Heritage developers			# Copyright (C) 2016-2020 The Software Heritage developers
	# See the AUTHORS file at the top-level directory of this distribution			# See the AUTHORS file at the top-level directory of this distribution
	# License: GNU General Public License version 3, or any later version			# License: GNU General Public License version 3, or any later version
	# See top-level LICENSE file for more information			# See top-level LICENSE file for more information

				from typing import Optional, Dict, Any, List
	import magic			import magic

	from typing import Optional

	from .indexer import ContentIndexer, ContentRangeIndexer			from .indexer import ContentIndexer, ContentRangeIndexer

	if not hasattr(magic.Magic, 'from_buffer'):			if not hasattr(magic.Magic, 'from_buffer'):
	raise ImportError(			raise ImportError(
	'Expected "import magic" to import python-magic, but file_magic '			'Expected "import magic" to import python-magic, but file_magic '
	'was imported instead.')			'was imported instead.')


	def compute_mimetype_encoding(raw_content):			def compute_mimetype_encoding(raw_content: bytes) -> Dict[str, bytes]:
	"""Determine mimetype and encoding from the raw content.			"""Determine mimetype and encoding from the raw content.

	Args:			Args:
	raw_content (bytes): content's raw data			raw_content: content's raw data

	Returns:			Returns:
	dict: mimetype and encoding key and corresponding values			dict: mimetype and encoding key and corresponding values.
	(as bytes).

	"""			"""
	m = magic.Magic(mime=True, mime_encoding=True)			m = magic.Magic(mime=True, mime_encoding=True)
	res = m.from_buffer(raw_content)			res = m.from_buffer(raw_content)
	(mimetype, encoding) = res.split('; charset=')			(mimetype, encoding) = res.split('; charset=')
	return {			return {
	'mimetype': mimetype,			'mimetype': mimetype,
	'encoding': encoding,			'encoding': encoding,
	}			}


	class MixinMimetypeIndexer:			class MixinMimetypeIndexer:
	"""Mixin mimetype indexer.			"""Mixin mimetype indexer.
				ardumontUnsubmitted Done Inline Actions Please, below the docstring ardumont: Please, below the docstring
				vlorentzUnsubmitted Done Inline Actions Sorry I didn't notice this sooner, but attributes must be after the docstring, else the docstring is associated with the last attribute. vlorentz: Sorry I didn't notice this sooner, but attributes must be after the docstring, else the…

	See :class:`MimetypeIndexer` and :class:`MimetypeRangeIndexer`			See :class:`MimetypeIndexer` and :class:`MimetypeRangeIndexer`

	"""			"""
				tool: Dict[str, Any]
				idx_storage: Any
	ADDITIONAL_CONFIG = {			ADDITIONAL_CONFIG = {
	'tools': ('dict', {			'tools': ('dict', {
	'name': 'file',			'name': 'file',
	'version': '1:5.30-1+deb9u1',			'version': '1:5.30-1+deb9u1',
	'configuration': {			'configuration': {
	"type": "library",			"type": "library",
	"debian-package": "python3-magic"			"debian-package": "python3-magic"
	},			},
	}),			}),
	'write_batch_size': ('int', 1000),			'write_batch_size': ('int', 1000),
	}			}

	CONFIG_BASE_FILENAME = 'indexer/mimetype' # type: Optional[str]			CONFIG_BASE_FILENAME = 'indexer/mimetype' # type: Optional[str]

	def index(self, id, data):			def index(self, id: bytes, data: bytes) -> Dict[str, Any]:
	"""Index sha1s' content and store result.			"""Index sha1s' content and store result.

	Args:			Args:
	id (bytes): content's identifier			id: content's identifier
	data (bytes): raw content in bytes			data: raw content in bytes

	Returns:			Returns:
	dict: content's mimetype; dict keys being			dict: content's mimetype; dict keys being

	- id (bytes): content's identifier (sha1)			- id: content's identifier (sha1)
	- mimetype (bytes): mimetype in bytes			- mimetype: mimetype in bytes
	- encoding (bytes): encoding in bytes			- encoding: encoding in bytes

	"""			"""
	properties = compute_mimetype_encoding(data)			properties = compute_mimetype_encoding(data)
	properties.update({			properties.update({
	'id': id,			'id': id,
	'indexer_configuration_id': self.tool['id'],			'indexer_configuration_id': self.tool['id'],
	})			})
	return properties			return properties

	def persist_index_computations(self, results, policy_update):			def persist_index_computations(
				self, results: List[Dict], policy_update: List[str]
				) -> None:
	"""Persist the results in storage.			"""Persist the results in storage.

	Args:			Args:
	results ([dict]): list of content's mimetype dicts			results: list of content's mimetype dicts
	(see :meth:`.index`)			(see :meth:`.index`)

	policy_update ([str]): either 'update-dups' or 'ignore-dups' to			policy_update: either 'update-dups' or 'ignore-dups' to
	respectively update duplicates or ignore them			respectively update duplicates or ignore them

	"""			"""
	self.idx_storage.content_mimetype_add(			self.idx_storage.content_mimetype_add(
	results, conflict_update=(policy_update == 'update-dups'))			results, conflict_update=(policy_update == 'update-dups'))


	class MimetypeIndexer(MixinMimetypeIndexer, ContentIndexer):			class MimetypeIndexer(MixinMimetypeIndexer, ContentIndexer):
	"""Mimetype Indexer working on list of content identifiers.			"""Mimetype Indexer working on list of content identifiers.

	It:			It:

	- (optionally) filters out content already indexed (cf.			- (optionally) filters out content already indexed (cf.
	:meth:`.filter`)			:meth:`.filter`)
	- reads content from objstorage per the content's id (sha1)			- reads content from objstorage per the content's id (sha1)
	- computes {mimetype, encoding} from that content			- computes {mimetype, encoding} from that content
	- stores result in storage			- stores result in storage

	"""			"""
	def filter(self, ids):			def filter(self, ids):
	"""Filter out known sha1s and return only missing ones.			"""Filter out known sha1s and return only missing ones.
				vlorentzUnsubmitted Done Inline Actions Break line before `)`, so the return type fits on a single line vlorentz: Break line before `)`, so the return type fits on a single line

	"""			"""
	yield from self.idx_storage.content_mimetype_missing((			yield from self.idx_storage.content_mimetype_missing((
	{			{
	'id': sha1,			'id': sha1,
	'indexer_configuration_id': self.tool['id'],			'indexer_configuration_id': self.tool['id'],
	} for sha1 in ids			} for sha1 in ids
	))			))


	class MimetypeRangeIndexer(MixinMimetypeIndexer, ContentRangeIndexer):			class MimetypeRangeIndexer(MixinMimetypeIndexer, ContentRangeIndexer):
	"""Mimetype Range Indexer working on range of content identifiers.			"""Mimetype Range Indexer working on range of content identifiers.

	It:			It:

	- (optionally) filters out content already indexed (cf			- (optionally) filters out content already indexed (cf
	:meth:`.indexed_contents_in_range`)			:meth:`.indexed_contents_in_range`)
	- reads content from objstorage per the content's id (sha1)			- reads content from objstorage per the content's id (sha1)
	- computes {mimetype, encoding} from that content			- computes {mimetype, encoding} from that content
	- stores result in storage			- stores result in storage

	"""			"""
	def indexed_contents_in_range(self, start, end):
				def indexed_contents_in_range(
				self, start: bytes, end: bytes
				ardumontUnsubmitted Done Inline Actions `str` without the quotes :) ardumont: `str` without the quotes :)
				) -> Dict[str, Optional[bytes]]:
				vlorentzUnsubmitted Done Inline Actions Our code style is to not have too many lines if possible. vlorentz: Our code style is to not have too many lines if possible.
	"""Retrieve indexed content id within range [start, end].			"""Retrieve indexed content id within range [start, end].

	Args:			Args:
	start (bytes): Starting bound from range identifier			start: Starting bound from range identifier
	end (bytes): End range identifier			end: End range identifier

	Returns:			Returns:
	dict: a dict with keys:			dict: a dict with keys:

	- ids [bytes]: iterable of content ids within the range.			- ids: iterable of content ids within the range.
	- next (Optional[bytes]): The next range of sha1 starts at			- next: The next range of sha1 starts at
	this sha1 if any			this sha1 if any

	"""			"""
	return self.idx_storage.content_mimetype_get_range(			return self.idx_storage.content_mimetype_get_range(
	start, end, self.tool['id'])			start, end, self.tool['id'])