Differential D2324 Diff 8700 swh/storage/storage.py

Changeset View

Standalone View

swh/storage/storage.py

# Copyright (C) 2015-2019 The Software Heritage developers		# Copyright (C) 2015-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution		# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version		# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information		# See top-level LICENSE file for more information

import copy		import copy
import datetime		import datetime
import itertools		import itertools
import json		import json

from collections import defaultdict		from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor		from concurrent.futures import ThreadPoolExecutor
from contextlib import contextmanager		from contextlib import contextmanager
from typing import Any, Dict, Mapping		from typing import Any, Dict, Mapping, Optional

import dateutil.parser		import dateutil.parser
import psycopg2		import psycopg2
import psycopg2.pool		import psycopg2.pool

from swh.core.api import remote_api_endpoint		from swh.core.api import remote_api_endpoint
from swh.model.hashutil import ALGORITHMS, hash_to_bytes		from swh.model.hashutil import ALGORITHMS, hash_to_bytes
from swh.objstorage import get_objstorage		from swh.objstorage import get_objstorage
▲ Show 20 Lines • Show All 1,662 Lines • ▼ Show 20 Lines	def origin_get_range(self, origin_from=1, origin_count=100,

Yields:		Yields:
dicts containing origin information as returned		dicts containing origin information as returned
by :meth:`swh.storage.storage.Storage.origin_get`.		by :meth:`swh.storage.storage.Storage.origin_get`.
"""		"""
for origin in db.origin_get_range(origin_from, origin_count, cur):		for origin in db.origin_get_range(origin_from, origin_count, cur):
yield dict(zip(db.origin_get_range_cols, origin))		yield dict(zip(db.origin_get_range_cols, origin))

		@remote_api_endpoint('origin/list')
		@timed
		@db_transaction()
		def origin_list(self, page_token: Optional[str] = None, limit: int = 100,
		*, db=None, cur=None) -> dict:
		"""Returns the list of origins

		Args:
		page_token: opaque token used for pagination.
		limit: the maximum number of results to return

		Returns:
		dict: dict with the following keys:
		- next_page_token (str, optional): opaque token to be used as
		`page_token` for retrieving the next page. If absent, there are
		ardumontUnsubmitted Done Inline Actions Same remark as for the in-memory implementation (type, name, typo). ardumont: Same remark as for the in-memory implementation (type, name, typo).
		no more pages to gather.
		- origins (List[dict]): list of origins, as returned by
		`origin_get`.
		ardumontUnsubmitted Not Done Inline Actions Why do we force the string type here? ardumont: Why do we force the string type here?
		vlorentzAuthorUnsubmitted Done Inline Actions Because it's an opaque token: clients shouldn't know what it actually contains, so we can change it at any time (e.g. for Cassandra, it will be a large byte array). vlorentz: Because it's an opaque token: clients shouldn't know what it actually contains, so we can…
		"""
		page_token = page_token or '0'
		if not isinstance(page_token, str):
		raise TypeError('page_token must be a string.')
		origin_from = int(page_token)
		result: Dict[str, Any] = {
		'origins': [
		dict(zip(db.origin_get_range_cols, origin))
		for origin in db.origin_get_range(origin_from, limit, cur)
		],
		}

		assert len(result['origins']) <= limit
		if len(result['origins']) == limit:
		ardumontUnsubmitted Not Done Inline Actions Why don't you change the `db.origin_get_range` implementation so that it returns what you want instead? ardumont: Why don't you change the `db.origin_get_range` implementation so that it returns what you want…
		vlorentzAuthorUnsubmitted Done Inline Actions Because I need the origin id to compute the `next_page_token`. vlorentz: Because I need the origin id to compute the `next_page_token`.
		result['next_page_token'] = str(result['origins'][limit-1]['id']+1)

		for origin in result['origins']:
		del origin['id']

		return result

@remote_api_endpoint('origin/search')		@remote_api_endpoint('origin/search')
@timed		@timed
@db_transaction_generator()		@db_transaction_generator()
def origin_search(self, url_pattern, offset=0, limit=50,		def origin_search(self, url_pattern, offset=0, limit=50,
regexp=False, with_visit=False, db=None, cur=None):		regexp=False, with_visit=False, db=None, cur=None):
"""Search for origins whose urls contain a provided string pattern		"""Search for origins whose urls contain a provided string pattern
or match a provided regular expression.		or match a provided regular expression.
The search is performed in a case insensitive way.		The search is performed in a case insensitive way.
▲ Show 20 Lines • Show All 347 Lines • Show Last 20 Lines