D2622: Add type annotations to indexer classes
D2622.diff

This patch adds PEP 484 type annotations across the Software Heritage indexer
modules: swh/indexer/indexer.py, metadata.py, mimetype.py, origin_head.py and
rehash.py.
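Several classes in the patch gain bare attribute annotations at class scope
(for example results: List[Dict] and scheduler: Any in indexer.py). A minimal
sketch of that pattern, with illustrative class and method names:

from typing import Any, Dict, List


class ExampleIndexer:
    # Bare class-level annotations declare the attribute's type for mypy
    # without assigning a value; the attribute is filled in later.
    results: List[Dict]
    scheduler: Any

    def prepare(self) -> None:
        self.results = []


indexer = ExampleIndexer()
indexer.prepare()
print(indexer.results)  # []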
diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2018 The Software Heritage developers
+# Copyright (C) 2016-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -12,7 +12,8 @@
from copy import deepcopy
from contextlib import contextmanager
-from typing import Any, Dict, Tuple
+from typing import Any, Dict, Tuple, Generator, Union, List, Optional
+from typing import Set
from swh.scheduler import get_scheduler
from swh.scheduler import CONFIG as SWH_CONFIG
@@ -27,13 +28,17 @@
@contextmanager
-def write_to_temp(filename, data, working_directory):
+def write_to_temp(
+ filename: str, data: bytes, working_directory: str
+) -> Generator[str, None, None]:
"""Write the sha1's content in a temporary file.
Args:
- filename (str): one of sha1's many filenames
- data (bytes): the sha1's content to write in temporary
+ filename: one of sha1's many filenames
+ data: the sha1's content to write in temporary
file
+ working_directory: the directory into which the
+ file is written
Returns:
The path to the temporary file created. That file is
@@ -103,6 +108,8 @@
filtering.
"""
+ results: List[Dict]
+
CONFIG = 'indexer/base'
DEFAULT_CONFIG = {
@@ -134,7 +141,9 @@
"""Prevents exceptions in `index()` from raising too high. Set to False
in tests to properly catch all exceptions."""
- def __init__(self, config=None, **kw):
+ scheduler: Any
+
+ def __init__(self, config=None, **kw) -> None:
"""Prepare and check that the indexer is ready to run.
"""
@@ -155,7 +164,7 @@
self.check()
self.log.debug('%s: config=%s', self, self.config)
- def prepare(self):
+ def prepare(self) -> None:
"""Prepare the indexer's needed runtime configuration.
Without this step, the indexer cannot possibly run.
@@ -181,10 +190,10 @@
self.results = []
@property
- def tool(self):
+ def tool(self) -> Dict:
return self.tools[0]
- def check(self):
+ def check(self) -> None:
"""Check the indexer's configuration is ok before proceeding.
If ok, does nothing. If not raise error.
@@ -193,13 +202,15 @@
raise ValueError('Tools %s is unknown, cannot continue' %
self.tools)
- def _prepare_tool(self, tool):
+ def _prepare_tool(self, tool: Dict[str, Any]) -> Dict[str, Any]:
"""Prepare the tool dict to be compliant with the storage api.
"""
return {'tool_%s' % key: value for key, value in tool.items()}
- def register_tools(self, tools):
+ def register_tools(
+ self, tools: Union[Dict[str, Any], List[Dict[str, Any]]]
+ ) -> List[Dict[str, Any]]:
"""Permit to register tools to the storage.
Add a sensible default which can be overridden if not
@@ -209,7 +220,7 @@
one or more tools.
Args:
- tools (dict/[dict]): Either a dict or a list of dict.
+ tools: Either a dict or a list of dict.
Returns:
list: List of dicts with additional id key.
@@ -230,12 +241,14 @@
else:
return []
- def index(self, id, data):
+ def index(
+ self, id: bytes, data: bytes
+ ) -> Dict[str, Any]:
"""Index computation for the id and associated raw data.
Args:
- id (bytes): identifier
- data (bytes): id's data from storage or objstorage depending on
+ id: identifier
+ data: id's data from storage or objstorage depending on
object type
Returns:
@@ -245,11 +258,11 @@
"""
raise NotImplementedError()
- def filter(self, ids):
+ def filter(self, ids: List[bytes]) -> Generator[bytes, None, None]:
"""Filter missing ids for that particular indexer.
Args:
- ids ([bytes]): list of ids
+ ids: list of ids
Yields:
iterator of missing ids
@@ -274,16 +287,18 @@
"""
pass
- def next_step(self, results, task):
+ def next_step(
+ self, results: List[Dict], task: Optional[Dict[str, Any]]
+ ) -> None:
"""Do something else with computations results (e.g. send to another
queue, ...).
(This is not an abstractmethod since it is optional).
Args:
- results ([result]): List of results (dict) as returned
+ results: List of results (dict) as returned
by index function.
- task (dict): a dict in the form expected by
+ task: a dict in the form expected by
`scheduler.backend.SchedulerBackend.create_tasks`
without `next_run`, plus an optional `result_name` key.
@@ -394,12 +409,14 @@
"""
@abc.abstractmethod
- def indexed_contents_in_range(self, start, end):
+ def indexed_contents_in_range(
+ self, start: bytes, end: bytes
+ ) -> Any:
"""Retrieve indexed contents within range [start, end].
Args:
- start (bytes): Starting bound from range identifier
- end (bytes): End range identifier
+ start: Starting bound from range identifier
+ end: End range identifier
Yields:
bytes: Content identifier present in the range ``[start, end]``
@@ -407,14 +424,16 @@
"""
pass
- def _list_contents_to_index(self, start, end, indexed):
+ def _list_contents_to_index(
+ self, start: bytes, end: bytes, indexed: Set[bytes]
+ ) -> Generator[bytes, None, None]:
"""Compute from storage the new contents to index in the range [start,
end]. The already indexed contents are skipped.
Args:
- start (bytes): Starting bound from range identifier
- end (bytes): End range identifier
- indexed (Set[bytes]): Set of content already indexed.
+ start: Starting bound from range identifier
+ end: End range identifier
+ indexed: Set of content already indexed.
Yields:
bytes: Identifier of contents to index.
@@ -433,13 +452,15 @@
yield _id
start = result['next']
- def _index_contents(self, start, end, indexed, **kwargs):
+ def _index_contents(
+ self, start: bytes, end: bytes, indexed: Set[bytes], **kwargs: Any
+ ) -> Generator[Dict, None, None]:
"""Index the contents from within range [start, end]
Args:
- start (bytes): Starting bound from range identifier
- end (bytes): End range identifier
- indexed (Set[bytes]): Set of content already indexed.
+ start: Starting bound from range identifier
+ end: End range identifier
+ indexed: Set of content already indexed.
Yields:
dict: Data indexed to persist using the indexer storage
@@ -452,7 +473,7 @@
self.log.warning('Content %s not found in objstorage' %
hashutil.hash_to_hex(sha1))
continue
- res = self.index(sha1, raw_content, **kwargs)
+ res = self.index(sha1, raw_content, **kwargs) # type: ignore
if res:
if not isinstance(res['id'], bytes):
raise TypeError(
@@ -460,15 +481,17 @@
(self.__class__.__name__, res['id']))
yield res
- def _index_with_skipping_already_done(self, start, end):
+ def _index_with_skipping_already_done(
+ self, start: bytes, end: bytes
+ ) -> Generator[Dict, None, None]:
"""Index not already indexed contents in range [start, end].
Args:
- start** (Union[bytes, str]): Starting range identifier
- end (Union[bytes, str]): Ending range identifier
+ start: Starting range identifier
+ end: Ending range identifier
Yields:
- bytes: Content identifier present in the range
+ dict: Content identifier present in the range
``[start, end]`` which are not already indexed.
"""
@@ -558,7 +581,7 @@
self.results = results
return self.next_step(results, task=next_step)
- def index_list(self, origins, **kwargs):
+ def index_list(self, origins: List[Any], **kwargs: Any) -> List[Dict]:
results = []
for origin in origins:
try:
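The write_to_temp hunk above shows the typed context-manager pattern used by
the patch: a @contextmanager function is annotated with the type of the value
it yields. A minimal, self-contained sketch; only the signature comes from the
patch, the body is an illustrative reconstruction:

import os
import tempfile
from contextlib import contextmanager
from typing import Generator


@contextmanager
def write_to_temp(
    filename: str, data: bytes, working_directory: str
) -> Generator[str, None, None]:
    # The annotation names the yielded type first:
    # Generator[YieldType, SendType, ReturnType].
    temp_dir = tempfile.mkdtemp(dir=working_directory)
    content_path = os.path.join(temp_dir, filename)
    try:
        with open(content_path, 'wb') as f:
            f.write(data)
        yield content_path  # becomes the `as` target of the with-statement
    finally:
        os.remove(content_path)
        os.rmdir(temp_dir)


# usage:
with write_to_temp('report.txt', b'payload', tempfile.gettempdir()) as path:
    print(path)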
diff --git a/swh/indexer/metadata.py b/swh/indexer/metadata.py
--- a/swh/indexer/metadata.py
+++ b/swh/indexer/metadata.py
@@ -1,10 +1,12 @@
-# Copyright (C) 2017-2018 The Software Heritage developers
+# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from copy import deepcopy
+from typing import Any, List, Dict, Tuple, Callable, Generator
+
from swh.core.utils import grouper
from swh.indexer.codemeta import merge_documents
@@ -21,7 +23,10 @@
ORIGIN_GET_BATCH_SIZE = 10
-def call_with_batches(f, args, batch_size):
+def call_with_batches(
+ f: Callable[[List[Dict[str, Any]]], Dict[str, Any]],
+ args: List[Dict[str, str]], batch_size: int
+) -> Generator[str, None, None]:
"""Calls a function with batches of args, and concatenates the results.
"""
groups = grouper(args, batch_size)
@@ -82,15 +87,17 @@
return None
return result
- def persist_index_computations(self, results, policy_update):
+ def persist_index_computations(
+ self, results: List[Dict], policy_update: str
+ ) -> None:
"""Persist the results in storage.
Args:
- results ([dict]): list of content_metadata, dict with the
+ results: list of content_metadata, dict with the
following keys:
- id (bytes): content's identifier (sha1)
- metadata (jsonb): detected metadata
- policy_update ([str]): either 'update-dups' or 'ignore-dups' to
+ policy_update: either 'update-dups' or 'ignore-dups' to
respectively update duplicates or ignore them
"""
@@ -179,16 +186,18 @@
'Problem when indexing rev: %r', e)
return result
- def persist_index_computations(self, results, policy_update):
+ def persist_index_computations(
+ self, results: List[Dict], policy_update: str
+ ) -> None:
"""Persist the results in storage.
Args:
- results ([dict]): list of content_mimetype, dict with the
+ results: list of content_mimetype, dict with the
following keys:
- id (bytes): content's identifier (sha1)
- mimetype (bytes): mimetype in bytes
- encoding (bytes): encoding in bytes
- policy_update ([str]): either 'update-dups' or 'ignore-dups' to
+ policy_update: either 'update-dups' or 'ignore-dups' to
respectively update duplicates or ignore them
"""
@@ -198,13 +207,14 @@
results, conflict_update=(policy_update == 'update-dups'))
def translate_revision_intrinsic_metadata(
- self, detected_files, log_suffix):
+ self, detected_files: Dict[str, List[Any]], log_suffix: str
+ ) -> Tuple[List[Any], List[Any]]:
"""
Determine plan of action to translate metadata when containing
one or multiple detected files:
Args:
- detected_files (dict): dictionary mapping context names (e.g.,
+ detected_files: dictionary mapping context names (e.g.,
"npm", "authors") to list of sha1
Returns:
@@ -272,7 +282,7 @@
USE_TOOLS = False
- def __init__(self, config=None, **kwargs):
+ def __init__(self, config=None, **kwargs) -> None:
super().__init__(config=config, **kwargs)
self.origin_head_indexer = OriginHeadIndexer(config=config)
self.revision_metadata_indexer = RevisionMetadataIndexer(config=config)
@@ -313,14 +323,16 @@
results.append((orig_metadata, rev_metadata))
return results
- def persist_index_computations(self, results, policy_update):
+ def persist_index_computations(
+ self, results: List[Dict], policy_update: str
+ ) -> None:
conflict_update = (policy_update == 'update-dups')
# Deduplicate revisions
- rev_metadata = []
- orig_metadata = []
- revs_to_delete = []
- origs_to_delete = []
+ rev_metadata: List[Any] = []
+ orig_metadata: List[Any] = []
+ revs_to_delete: List[Any] = []
+ origs_to_delete: List[Any] = []
for (orig_item, rev_item) in results:
assert rev_item['metadata'] == orig_item['metadata']
if not rev_item['metadata'] or \
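The four annotated accumulators above (rev_metadata and friends) illustrate a
recurring mypy requirement: an empty list literal carries no element type, so
the variable must be annotated explicitly. A minimal sketch, with an
illustrative element value:

from typing import Any, List

rev_metadata: List[Any] = []   # accepted: the element type is declared
# rev_metadata = []            # mypy: Need type annotation for 'rev_metadata'
rev_metadata.append({'id': b'\x00', 'metadata': {}})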
diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py
--- a/swh/indexer/mimetype.py
+++ b/swh/indexer/mimetype.py
@@ -1,12 +1,11 @@
-# Copyright (C) 2016-2018 The Software Heritage developers
+# Copyright (C) 2016-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from typing import Optional, Dict, Any, List
import magic
-from typing import Optional
-
from .indexer import ContentIndexer, ContentRangeIndexer
if not hasattr(magic.Magic, 'from_buffer'):
@@ -15,15 +14,14 @@
'was imported instead.')
-def compute_mimetype_encoding(raw_content):
+def compute_mimetype_encoding(raw_content: bytes) -> Dict[str, bytes]:
"""Determine mimetype and encoding from the raw content.
Args:
- raw_content (bytes): content's raw data
+ raw_content: content's raw data
Returns:
- dict: mimetype and encoding key and corresponding values
- (as bytes).
+ dict: mimetype and encoding key and corresponding values.
"""
m = magic.Magic(mime=True, mime_encoding=True)
@@ -41,6 +39,8 @@
See :class:`MimetypeIndexer` and :class:`MimetypeRangeIndexer`
"""
+ tool: Dict[str, Any]
+ idx_storage: Any
ADDITIONAL_CONFIG = {
'tools': ('dict', {
'name': 'file',
@@ -55,36 +55,38 @@
CONFIG_BASE_FILENAME = 'indexer/mimetype' # type: Optional[str]
- def index(self, id, data):
+ def index(self, id: bytes, data: bytes) -> Dict[str, Any]:
"""Index sha1s' content and store result.
Args:
- id (bytes): content's identifier
- data (bytes): raw content in bytes
+ id: content's identifier
+ data: raw content in bytes
Returns:
dict: content's mimetype; dict keys being
- - **id** (bytes): content's identifier (sha1)
- - **mimetype** (bytes): mimetype in bytes
- - **encoding** (bytes): encoding in bytes
+ - id: content's identifier (sha1)
+ - mimetype: mimetype in bytes
+ - encoding: encoding in bytes
"""
properties = compute_mimetype_encoding(data)
properties.update({
'id': id,
'indexer_configuration_id': self.tool['id'],
- })
+ })
return properties
- def persist_index_computations(self, results, policy_update):
+ def persist_index_computations(
+ self, results: List[Dict], policy_update: str
+ ) -> None:
"""Persist the results in storage.
Args:
- results ([dict]): list of content's mimetype dicts
+ results: list of content's mimetype dicts
(see :meth:`.index`)
- policy_update ([str]): either 'update-dups' or 'ignore-dups' to
+ policy_update: either 'update-dups' or 'ignore-dups' to
respectively update duplicates or ignore them
"""
@@ -128,18 +130,21 @@
- stores result in storage
"""
- def indexed_contents_in_range(self, start, end):
+
+ def indexed_contents_in_range(
+ self, start: bytes, end: bytes
+ ) -> Dict[str, Optional[bytes]]:
"""Retrieve indexed content id within range [start, end].
Args:
- start (bytes): Starting bound from range identifier
- end (bytes): End range identifier
+ start: Starting bound from range identifier
+ end: End range identifier
Returns:
dict: a dict with keys:
- - **ids** [bytes]: iterable of content ids within the range.
- - **next** (Optional[bytes]): The next range of sha1 starts at
+ - ids: iterable of content ids within the range.
+ - next: The next range of sha1 starts at
this sha1 if any
"""
diff --git a/swh/indexer/origin_head.py b/swh/indexer/origin_head.py
--- a/swh/indexer/origin_head.py
+++ b/swh/indexer/origin_head.py
@@ -1,8 +1,10 @@
-# Copyright (C) 2018 The Software Heritage developers
+# Copyright (C) 2018-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from typing import List, Tuple, Any, Dict, Union
+
import re
import click
import logging
@@ -20,7 +22,9 @@
USE_TOOLS = False
- def persist_index_computations(self, results, policy_update):
+ def persist_index_computations(
+ self, results: Any, policy_update: str
+ ) -> None:
"""Do nothing. The indexer's results are not persistent, they
should only be piped to another indexer."""
pass
@@ -58,7 +62,9 @@
rb'$')
@classmethod
- def _parse_version(cls, filename):
+ def _parse_version(
+ cls: Any, filename: str
+ ) -> Tuple[Union[float, int], ...]:
"""Extracts the release version from an archive filename,
to get an ordering whose maximum is likely to be the last
version of the software
@@ -92,7 +98,7 @@
assert False, res.group('preversion')
return tuple(version)
- def _try_get_ftp_head(self, snapshot):
+ def _try_get_ftp_head(self, snapshot: Dict[str, Any]) -> Any:
archive_names = list(snapshot['branches'])
max_archive_name = max(archive_names, key=self._parse_version)
r = self._try_resolve_target(snapshot['branches'], max_archive_name)
@@ -100,7 +106,9 @@
# Generic
- def _try_get_head_generic(self, snapshot):
+ def _try_get_head_generic(
+ self, snapshot: Dict[str, Any]
+ ) -> Any:
# Works on 'deposit', 'pypi', and VCSs.
try:
branches = snapshot['branches']
@@ -112,7 +120,7 @@
self._try_resolve_target(branches, b'master')
)
- def _try_resolve_target(self, branches, target_name):
+ def _try_resolve_target(self, branches: Dict, target_name: bytes) -> Any:
try:
target = branches[target_name]
if target is None:
@@ -140,7 +148,7 @@
@click.option('--origins', '-i',
help='Origins to lookup, in the "type+url" format',
multiple=True)
-def main(origins):
+def main(origins: List[str]) -> None:
rev_metadata_indexer = OriginHeadIndexer()
rev_metadata_indexer.run(origins)
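The new return type of _parse_version uses a variadic tuple:
Tuple[Union[float, int], ...] means a tuple of any length whose items are
each an int or a float. A sketch with a deliberately simplified parser (not
the patch's logic):

from typing import Tuple, Union


def parse_dotted(version: str) -> Tuple[Union[float, int], ...]:
    # Simplified: split "1.2.10" into (1, 2, 10) so tuples compare
    # element-wise, as max(..., key=self._parse_version) relies on.
    return tuple(int(part) for part in version.split('.'))


assert parse_dotted('1.2.10') > parse_dotted('1.2.9')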
diff --git a/swh/indexer/rehash.py b/swh/indexer/rehash.py
--- a/swh/indexer/rehash.py
+++ b/swh/indexer/rehash.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2018 The Software Heritage developers
+# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -7,6 +7,7 @@
import itertools
from collections import defaultdict
+from typing import Dict, Any, Tuple, List, Generator
from swh.core import utils
from swh.core.config import SWHConfig
@@ -64,7 +65,7 @@
CONFIG_BASE_FILENAME = 'indexer/rehash'
- def __init__(self):
+ def __init__(self) -> None:
self.config = self.parse_config_file()
self.storage = get_storage(**self.config['storage'])
self.objstorage = get_objstorage(**self.config['objstorage'])
@@ -80,7 +81,9 @@
if not self.compute_checksums:
raise ValueError('Checksums list should not be empty.')
- def _read_content_ids(self, contents):
+ def _read_content_ids(
+ self, contents: List[Dict[str, Any]]
+ ) -> Generator[bytes, Any, None]:
"""Read the content identifiers from the contents.
"""
@@ -91,14 +94,15 @@
yield h
- def get_new_contents_metadata(self, all_contents):
+ def get_new_contents_metadata(
+ self, all_contents: List[Dict[str, Any]]
+ ) -> Generator[Tuple[Dict[str, Any], List[Any]], Any, None]:
"""Retrieve raw contents and compute new checksums on the
contents. Unknown or corrupted contents are skipped.
Args:
- all_contents ([dict]): List of contents as dictionary with
+ all_contents: List of contents as dictionary with
the necessary primary keys
- checksum_algorithms ([str]): List of checksums to compute
Yields:
tuple: tuple of (content to update, list of checksums computed)
@@ -141,7 +145,7 @@
content.update(content_hashes)
yield content, checksums_to_compute
- def run(self, contents):
+ def run(self, contents: List[Dict[str, Any]]) -> None:
"""Given a list of content:
- (re)compute a given set of checksums on contents available in our
@@ -149,7 +153,7 @@
- update those contents with the new metadata
Args:
- contents (dict): contents as dictionary with necessary keys.
+ contents: contents as dictionary with necessary keys.
key present in such dictionary should be the ones defined in
the 'primary_key' option.
@@ -158,7 +162,7 @@
self.get_new_contents_metadata(contents),
self.batch_size_update):
- groups = defaultdict(list)
+ groups: Dict[str, List[Any]] = defaultdict(list)
for content, keys_to_update in data:
keys = ','.join(keys_to_update)
groups[keys].append(content)
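The final rehash.py hunk annotates a defaultdict with the plain Dict type,
which type-checks because defaultdict subclasses dict. A runnable sketch with
an illustrative key:

from collections import defaultdict
from typing import Any, Dict, List

# Declared as Dict, constructed as defaultdict: valid, since defaultdict
# is a subclass of dict.
groups: Dict[str, List[Any]] = defaultdict(list)
groups['sha256,blake2s256'].append({'id': b'\x00'})
print(dict(groups))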