F7123289: D2609.diff (76 KB)
diff --git a/requirements-swh.txt b/requirements-swh.txt
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,4 +1,4 @@
-swh.core[db,http] >= 0.0.65
+swh.core[db,http] >= 0.0.87
swh.model >= 0.0.15
swh.objstorage >= 0.0.28
swh.scheduler >= 0.0.47
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2018 The Software Heritage developers
+# Copyright (C) 2015-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -9,7 +9,6 @@
from collections import defaultdict
-from swh.core.api import remote_api_endpoint
from swh.storage.common import db_transaction_generator, db_transaction
from swh.storage.exc import StorageDBError
from .db import Db
@@ -108,10 +107,8 @@
if db is not self._db:
db.put_conn()
- @remote_api_endpoint('check_config')
@db_transaction()
def check_config(self, *, check_write, db=None, cur=None):
- """Check that the storage is configured and ready to go."""
# Check permissions on one of the tables
if check_write:
check = 'INSERT'
@@ -124,22 +121,8 @@
)
return cur.fetchone()[0]
- @remote_api_endpoint('content_mimetype/missing')
@db_transaction_generator()
def content_mimetype_missing(self, mimetypes, db=None, cur=None):
- """Generate mimetypes missing from storage.
-
- Args:
- mimetypes (iterable): iterable of dict with keys:
-
- - **id** (bytes): sha1 identifier
- - **indexer_configuration_id** (int): tool used to compute the
- results
-
- Yields:
- tuple (id, indexer_configuration_id): missing id
-
- """
for obj in db.content_mimetype_missing_from_list(mimetypes, cur):
yield obj[0]
@@ -147,34 +130,6 @@
indexer_configuration_id, limit=1000,
with_textual_data=False,
db=None, cur=None):
- """Retrieve ids of type content_type within range [start, end] bound
- by limit.
-
- Args:
- **content_type** (str): content's type (mimetype, language, etc...)
- **start** (bytes): Starting identifier range (expected smaller
- than end)
- **end** (bytes): Ending identifier range (expected larger
- than start)
- **indexer_configuration_id** (int): The tool used to index data
- **limit** (int): Limit result (default to 1000)
- **with_textual_data** (bool): Deal with only textual
- content (True) or all
- content (all contents by
- defaults, False)
-
- Raises:
- ValueError for;
- - limit to None
- - wrong content_type provided
-
- Returns:
- a dict with keys:
- - **ids** [bytes]: iterable of content ids within the range.
- - **next** (Optional[bytes]): The next range of sha1 starts at
- this sha1 if any
-
- """
if limit is None:
raise ValueError('Development error: limit should not be None')
if content_type not in db.content_indexer_names:
@@ -199,53 +154,16 @@
'next': next_id
}
- @remote_api_endpoint('content_mimetype/range')
@db_transaction()
def content_mimetype_get_range(self, start, end, indexer_configuration_id,
limit=1000, db=None, cur=None):
- """Retrieve mimetypes within range [start, end] bound by limit.
-
- Args:
- **start** (bytes): Starting identifier range (expected smaller
- than end)
- **end** (bytes): Ending identifier range (expected larger
- than start)
- **indexer_configuration_id** (int): The tool used to index data
- **limit** (int): Limit result (default to 1000)
-
- Raises:
- ValueError for limit to None
-
- Returns:
- a dict with keys:
- - **ids** [bytes]: iterable of content ids within the range.
- - **next** (Optional[bytes]): The next range of sha1 starts at
- this sha1 if any
-
- """
return self._content_get_range('mimetype', start, end,
indexer_configuration_id, limit=limit,
db=db, cur=cur)
- @remote_api_endpoint('content_mimetype/add')
@db_transaction()
def content_mimetype_add(self, mimetypes, conflict_update=False, db=None,
cur=None):
- """Add mimetypes not present in storage.
-
- Args:
- mimetypes (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1 identifier
- - **mimetype** (bytes): raw content's mimetype
- - **encoding** (bytes): raw content's encoding
- - **indexer_configuration_id** (int): tool's id used to
- compute the results
- - **conflict_update** (bool): Flag to determine if we want to
- overwrite (``True``) or skip duplicates (``False``, the
- default)
-
- """
_check_id_duplicates(mimetypes)
mimetypes.sort(key=lambda m: m['id'])
db.mktemp_content_mimetype(cur)
@@ -254,84 +172,26 @@
cur)
db.content_mimetype_add_from_temp(conflict_update, cur)
- @remote_api_endpoint('content_mimetype')
@db_transaction_generator()
def content_mimetype_get(self, ids, db=None, cur=None):
- """Retrieve full content mimetype per ids.
-
- Args:
- ids (iterable): sha1 identifier
-
- Yields:
- mimetypes (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1 identifier
- - **mimetype** (bytes): raw content's mimetype
- - **encoding** (bytes): raw content's encoding
- - **tool** (dict): Tool used to compute the language
-
- """
for c in db.content_mimetype_get_from_list(ids, cur):
yield converters.db_to_mimetype(
dict(zip(db.content_mimetype_cols, c)))
- @remote_api_endpoint('content_language/missing')
@db_transaction_generator()
def content_language_missing(self, languages, db=None, cur=None):
- """List languages missing from storage.
-
- Args:
- languages (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1 identifier
- - **indexer_configuration_id** (int): tool used to compute
- the results
-
- Yields:
- an iterable of missing id for the tuple (id,
- indexer_configuration_id)
-
- """
for obj in db.content_language_missing_from_list(languages, cur):
yield obj[0]
- @remote_api_endpoint('content_language')
@db_transaction_generator()
def content_language_get(self, ids, db=None, cur=None):
- """Retrieve full content language per ids.
-
- Args:
- ids (iterable): sha1 identifier
-
- Yields:
- languages (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1 identifier
- - **lang** (bytes): raw content's language
- - **tool** (dict): Tool used to compute the language
-
- """
for c in db.content_language_get_from_list(ids, cur):
yield converters.db_to_language(
dict(zip(db.content_language_cols, c)))
- @remote_api_endpoint('content_language/add')
@db_transaction()
def content_language_add(self, languages, conflict_update=False, db=None,
cur=None):
- """Add languages not present in storage.
-
- Args:
- languages (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1
- - **lang** (bytes): language detected
-
- conflict_update (bool): Flag to determine if we want to
- overwrite (true) or skip duplicates (false, the
- default)
-
- """
_check_id_duplicates(languages)
languages.sort(key=lambda m: m['id'])
db.mktemp_content_language(cur)
@@ -347,62 +207,19 @@
db.content_language_add_from_temp(conflict_update, cur)
- @remote_api_endpoint('content/ctags/missing')
@db_transaction_generator()
def content_ctags_missing(self, ctags, db=None, cur=None):
- """List ctags missing from storage.
-
- Args:
- ctags (iterable): dicts with keys:
-
- - **id** (bytes): sha1 identifier
- - **indexer_configuration_id** (int): tool used to compute
- the results
-
- Yields:
- an iterable of missing id for the tuple (id,
- indexer_configuration_id)
-
- """
for obj in db.content_ctags_missing_from_list(ctags, cur):
yield obj[0]
- @remote_api_endpoint('content/ctags')
@db_transaction_generator()
def content_ctags_get(self, ids, db=None, cur=None):
- """Retrieve ctags per id.
-
- Args:
- ids (iterable): sha1 checksums
-
- Yields:
- Dictionaries with keys:
-
- - **id** (bytes): content's identifier
- - **name** (str): symbol's name
- - **kind** (str): symbol's kind
- - **lang** (str): language for that content
- - **tool** (dict): tool used to compute the ctags' info
-
-
- """
for c in db.content_ctags_get_from_list(ids, cur):
yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, c)))
- @remote_api_endpoint('content/ctags/add')
@db_transaction()
def content_ctags_add(self, ctags, conflict_update=False, db=None,
cur=None):
- """Add ctags not present in storage
-
- Args:
- ctags (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1
- - **ctags** ([list): List of dictionary with keys: name, kind,
- line, lang
-
- """
_check_id_duplicates(ctags)
ctags.sort(key=lambda m: m['id'])
@@ -422,41 +239,15 @@
db.content_ctags_add_from_temp(conflict_update, cur)
- @remote_api_endpoint('content/ctags/search')
@db_transaction_generator()
def content_ctags_search(self, expression,
limit=10, last_sha1=None, db=None, cur=None):
- """Search through content's raw ctags symbols.
-
- Args:
- expression (str): Expression to search for
- limit (int): Number of rows to return (default to 10).
- last_sha1 (str): Offset from which retrieving data (default to '').
-
- Yields:
- rows of ctags including id, name, lang, kind, line, etc...
-
- """
for obj in db.content_ctags_search(expression, last_sha1, limit,
cur=cur):
yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, obj)))
- @remote_api_endpoint('content/fossology_license')
@db_transaction_generator()
def content_fossology_license_get(self, ids, db=None, cur=None):
- """Retrieve licenses per id.
-
- Args:
- ids (iterable): sha1 checksums
-
- Yields:
- dict: ``{id: facts}`` where ``facts`` is a dict with the
- following keys:
-
- - **licenses** ([str]): associated licenses for that content
- - **tool** (dict): Tool used to compute the license
-
- """
d = defaultdict(list)
for c in db.content_fossology_license_get_from_list(ids, cur):
license = dict(zip(db.content_fossology_license_cols, c))
@@ -467,26 +258,9 @@
for id_, facts in d.items():
yield {id_: facts}
- @remote_api_endpoint('content/fossology_license/add')
@db_transaction()
def content_fossology_license_add(self, licenses, conflict_update=False,
db=None, cur=None):
- """Add licenses not present in storage.
-
- Args:
- licenses (iterable): dictionaries with keys:
-
- - **id**: sha1
- - **licenses** ([bytes]): List of licenses associated to sha1
- - **tool** (str): nomossa
-
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
- Returns:
- list: content_license entries which failed due to unknown licenses
-
- """
_check_id_duplicates(licenses)
licenses.sort(key=lambda m: m['id'])
db.mktemp_content_fossology_license(cur)
@@ -502,90 +276,28 @@
cur=cur)
db.content_fossology_license_add_from_temp(conflict_update, cur)
- @remote_api_endpoint('content/fossology_license/range')
@db_transaction()
def content_fossology_license_get_range(
self, start, end, indexer_configuration_id,
limit=1000, db=None, cur=None):
- """Retrieve licenses within range [start, end] bound by limit.
-
- Args:
- **start** (bytes): Starting identifier range (expected smaller
- than end)
- **end** (bytes): Ending identifier range (expected larger
- than start)
- **indexer_configuration_id** (int): The tool used to index data
- **limit** (int): Limit result (default to 1000)
-
- Raises:
- ValueError for limit to None
-
- Returns:
- a dict with keys:
- - **ids** [bytes]: iterable of content ids within the range.
- - **next** (Optional[bytes]): The next range of sha1 starts at
- this sha1 if any
-
- """
return self._content_get_range('fossology_license', start, end,
indexer_configuration_id, limit=limit,
with_textual_data=True, db=db, cur=cur)
- @remote_api_endpoint('content_metadata/missing')
@db_transaction_generator()
def content_metadata_missing(self, metadata, db=None, cur=None):
- """List metadata missing from storage.
-
- Args:
- metadata (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1 identifier
- - **indexer_configuration_id** (int): tool used to compute
- the results
-
- Yields:
- missing sha1s
-
- """
for obj in db.content_metadata_missing_from_list(metadata, cur):
yield obj[0]
- @remote_api_endpoint('content_metadata')
@db_transaction_generator()
def content_metadata_get(self, ids, db=None, cur=None):
- """Retrieve metadata per id.
-
- Args:
- ids (iterable): sha1 checksums
-
- Yields:
- dictionaries with the following keys:
-
- id (bytes)
- metadata (str): associated metadata
- tool (dict): tool used to compute metadata
-
- """
for c in db.content_metadata_get_from_list(ids, cur):
yield converters.db_to_metadata(
dict(zip(db.content_metadata_cols, c)))
- @remote_api_endpoint('content_metadata/add')
@db_transaction()
def content_metadata_add(self, metadata, conflict_update=False, db=None,
cur=None):
- """Add metadata not present in storage.
-
- Args:
- metadata (iterable): dictionaries with keys:
-
- - **id**: sha1
- - **metadata**: arbitrary dict
-
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
- """
_check_id_duplicates(metadata)
metadata.sort(key=lambda m: m['id'])
@@ -596,67 +308,21 @@
cur)
db.content_metadata_add_from_temp(conflict_update, cur)
- @remote_api_endpoint('revision_intrinsic_metadata/missing')
@db_transaction_generator()
def revision_intrinsic_metadata_missing(self, metadata, db=None, cur=None):
- """List metadata missing from storage.
-
- Args:
- metadata (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1_git revision identifier
- - **indexer_configuration_id** (int): tool used to compute
- the results
-
- Yields:
- missing ids
-
- """
for obj in db.revision_intrinsic_metadata_missing_from_list(
metadata, cur):
yield obj[0]
- @remote_api_endpoint('revision_intrinsic_metadata')
@db_transaction_generator()
def revision_intrinsic_metadata_get(self, ids, db=None, cur=None):
- """Retrieve revision metadata per id.
-
- Args:
- ids (iterable): sha1 checksums
-
- Yields:
- : dictionaries with the following keys:
-
- - **id** (bytes)
- - **metadata** (str): associated metadata
- - **tool** (dict): tool used to compute metadata
- - **mappings** (List[str]): list of mappings used to translate
- these metadata
-
- """
for c in db.revision_intrinsic_metadata_get_from_list(ids, cur):
yield converters.db_to_metadata(
dict(zip(db.revision_intrinsic_metadata_cols, c)))
- @remote_api_endpoint('revision_intrinsic_metadata/add')
@db_transaction()
def revision_intrinsic_metadata_add(self, metadata, conflict_update=False,
db=None, cur=None):
- """Add metadata not present in storage.
-
- Args:
- metadata (iterable): dictionaries with keys:
-
- - **id**: sha1_git of revision
- - **metadata**: arbitrary dict
- - **indexer_configuration_id**: tool used to compute metadata
- - **mappings** (List[str]): list of mappings used to translate
- these metadata
-
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
- """
_check_id_duplicates(metadata)
metadata.sort(key=lambda m: m['id'])
@@ -668,66 +334,20 @@
cur)
db.revision_intrinsic_metadata_add_from_temp(conflict_update, cur)
- @remote_api_endpoint('revision_intrinsic_metadata/delete')
@db_transaction()
def revision_intrinsic_metadata_delete(self, entries, db=None, cur=None):
- """Remove revision metadata from the storage.
-
- Args:
- entries (dict): dictionaries with the following keys:
-
- - **id** (bytes): revision identifier
- - **indexer_configuration_id** (int): tool used to compute
- metadata
- """
db.revision_intrinsic_metadata_delete(entries, cur)
- @remote_api_endpoint('origin_intrinsic_metadata')
@db_transaction_generator()
def origin_intrinsic_metadata_get(self, ids, db=None, cur=None):
- """Retrieve origin metadata per id.
-
- Args:
- ids (iterable): origin identifiers
-
- Yields:
- list: dictionaries with the following keys:
-
- - **id** (str): origin url
- - **from_revision** (bytes): which revision this metadata
- was extracted from
- - **metadata** (str): associated metadata
- - **tool** (dict): tool used to compute metadata
- - **mappings** (List[str]): list of mappings used to translate
- these metadata
-
- """
for c in db.origin_intrinsic_metadata_get_from_list(ids, cur):
yield converters.db_to_metadata(
dict(zip(db.origin_intrinsic_metadata_cols, c)))
- @remote_api_endpoint('origin_intrinsic_metadata/add')
@db_transaction()
def origin_intrinsic_metadata_add(self, metadata,
conflict_update=False, db=None,
cur=None):
- """Add origin metadata not present in storage.
-
- Args:
- metadata (iterable): dictionaries with keys:
-
- - **id**: origin urls
- - **from_revision**: sha1 id of the revision used to generate
- these metadata.
- - **metadata**: arbitrary dict
- - **indexer_configuration_id**: tool used to compute metadata
- - **mappings** (List[str]): list of mappings used to translate
- these metadata
-
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
- """
_check_id_duplicates(metadata)
metadata.sort(key=lambda m: m['id'])
@@ -740,81 +360,24 @@
cur)
db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur)
- @remote_api_endpoint('origin_intrinsic_metadata/delete')
@db_transaction()
def origin_intrinsic_metadata_delete(
self, entries, db=None, cur=None):
- """Remove origin metadata from the storage.
-
- Args:
- entries (dict): dictionaries with the following keys:
-
- - **id** (str): origin urls
- - **indexer_configuration_id** (int): tool used to compute
- metadata
- """
db.origin_intrinsic_metadata_delete(entries, cur)
- @remote_api_endpoint('origin_intrinsic_metadata/search/fulltext')
@db_transaction_generator()
def origin_intrinsic_metadata_search_fulltext(
self, conjunction, limit=100, db=None, cur=None):
- """Returns the list of origins whose metadata contain all the terms.
-
- Args:
- conjunction (List[str]): List of terms to be searched for.
- limit (int): The maximum number of results to return
-
- Yields:
- list: dictionaries with the following keys:
-
- - **id** (str): origin urls
- - **from_revision**: sha1 id of the revision used to generate
- these metadata.
- - **metadata** (str): associated metadata
- - **tool** (dict): tool used to compute metadata
- - **mappings** (List[str]): list of mappings used to translate
- these metadata
-
- """
for c in db.origin_intrinsic_metadata_search_fulltext(
conjunction, limit=limit, cur=cur):
yield converters.db_to_metadata(
dict(zip(db.origin_intrinsic_metadata_cols, c)))
- @remote_api_endpoint('origin_intrinsic_metadata/search/by_producer')
@db_transaction()
def origin_intrinsic_metadata_search_by_producer(
self, page_token='', limit=100, ids_only=False,
mappings=None, tool_ids=None,
db=None, cur=None):
- """Returns the list of origins whose metadata contain all the terms.
-
- Args:
- page_token (str): Opaque token used for pagination.
- limit (int): The maximum number of results to return
- ids_only (bool): Determines whether only origin urls are
- returned or the content as well
- mappings (List[str]): Returns origins whose intrinsic metadata
- were generated using at least one of these mappings.
-
- Returns:
- dict: dict with the following keys:
- - **next_page_token** (str, optional): opaque token to be used as
- `page_token` for retrieving the next page. If absent, there is
- no more pages to gather.
- - **origins** (list): list of origin url (str) if `ids_only=True`
- else dictionaries with the following keys:
-
- - **id** (str): origin urls
- - **from_revision**: sha1 id of the revision used to generate
- these metadata.
- - **metadata** (str): associated metadata
- - **tool** (dict): tool used to compute metadata
- - **mappings** (List[str]): list of mappings used to translate
- these metadata
-
- """
assert isinstance(page_token, str)
# we go to limit+1 to check whether we should add next_page_token in
# the response
@@ -834,25 +397,9 @@
result['next_page_token'] = result['origins'][-1]['id']
return result
- @remote_api_endpoint('origin_intrinsic_metadata/stats')
@db_transaction()
def origin_intrinsic_metadata_stats(
self, db=None, cur=None):
- """Returns counts of indexed metadata per origins, broken down
- into metadata types.
-
- Returns:
- dict: dictionary with keys:
-
- - total (int): total number of origins that were indexed
- (possibly yielding an empty metadata dictionary)
- - non_empty (int): total number of origins that we extracted
- a non-empty metadata dictionary from
- - per_mapping (dict): a dictionary with mapping names as
- keys and number of origins whose indexing used this
- mapping. Note that indexing a given origin may use
- 0, 1, or many mappings.
- """
mapping_names = [m for m in MAPPING_NAMES]
select_parts = []
@@ -880,26 +427,8 @@
'per_mapping': results,
}
- @remote_api_endpoint('indexer_configuration/add')
@db_transaction_generator()
def indexer_configuration_add(self, tools, db=None, cur=None):
- """Add new tools to the storage.
-
- Args:
- tools ([dict]): List of dictionary representing tool to
- insert in the db. Dictionary with the following keys:
-
- - **tool_name** (str): tool's name
- - **tool_version** (str): tool's version
- - **tool_configuration** (dict): tool's configuration
- (free form dict)
-
- Returns:
- List of dict inserted in the db (holding the id key as
- well). The order of the list is not guaranteed to match
- the order of the initial list.
-
- """
db.mktemp_indexer_configuration(cur)
db.copy_to(tools, 'tmp_indexer_configuration',
['tool_name', 'tool_version', 'tool_configuration'],
@@ -909,24 +438,8 @@
for line in tools:
yield dict(zip(db.indexer_configuration_cols, line))
- @remote_api_endpoint('indexer_configuration/data')
@db_transaction()
def indexer_configuration_get(self, tool, db=None, cur=None):
- """Retrieve tool information.
-
- Args:
- tool (dict): Dictionary representing a tool with the
- following keys:
-
- - **tool_name** (str): tool's name
- - **tool_version** (str): tool's version
- - **tool_configuration** (dict): tool's configuration
- (free form dict)
-
- Returns:
- The same dictionary with an `id` key, None otherwise.
-
- """
tool_conf = tool['tool_configuration']
if isinstance(tool_conf, dict):
tool_conf = json.dumps(tool_conf)
diff --git a/swh/indexer/storage/api/client.py b/swh/indexer/storage/api/client.py
--- a/swh/indexer/storage/api/client.py
+++ b/swh/indexer/storage/api/client.py
@@ -7,11 +7,11 @@
from swh.storage.exc import StorageAPIError
-from .. import IndexerStorage
+from ..interface import IndexerStorageInterface
class RemoteStorage(RPCClient):
"""Proxy to a remote storage API"""
- backend_class = IndexerStorage
+ backend_class = IndexerStorageInterface
api_exception = StorageAPIError
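
The client change above swaps the concrete IndexerStorage for the new IndexerStorageInterface as backend_class, so the RPC proxy methods are generated from the abstract declarations alone. The following is a rough sketch of that idea only, not the actual swh.core.api.RPCClient implementation; ToyRPCClient, ToyInterface, _make_proxy and the returned tuple are made up for illustration.

import inspect

def remote_api_endpoint(path):
    # Stand-in for swh.core.api.remote_api_endpoint: record the URL path
    # an interface method is served under.
    def decorator(meth):
        meth._endpoint_path = path
        return meth
    return decorator

class ToyInterface:
    @remote_api_endpoint('content_mimetype/missing')
    def content_mimetype_missing(self, mimetypes):
        ...

class ToyRPCClient:
    backend_class = None  # subclasses point this at an interface class

    def __init__(self, url):
        self.url = url.rstrip('/')
        # Build one proxy method per decorated interface method.
        for name, meth in inspect.getmembers(self.backend_class, callable):
            path = getattr(meth, '_endpoint_path', None)
            if path is not None:
                setattr(self, name, self._make_proxy(path))

    def _make_proxy(self, path):
        def proxy(*args, **kwargs):
            # A real client would serialize the arguments and POST them to
            # this URL; here we only show the call shape.
            return ('POST', '%s/%s' % (self.url, path), args, kwargs)
        return proxy

class ToyRemoteStorage(ToyRPCClient):
    backend_class = ToyInterface

client = ToyRemoteStorage('http://localhost:5007')
print(client.content_mimetype_missing([]))
# ('POST', 'http://localhost:5007/content_mimetype/missing', ([],), {})

With this pattern, RemoteStorage itself only needs the one-line declaration backend_class = IndexerStorageInterface, which is exactly what the hunk above does.
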
diff --git a/swh/indexer/storage/api/server.py b/swh/indexer/storage/api/server.py
--- a/swh/indexer/storage/api/server.py
+++ b/swh/indexer/storage/api/server.py
@@ -10,8 +10,9 @@
from swh.core.api import (RPCServerApp, error_handler,
encode_data_server as encode_data)
from swh.indexer.storage import (
- get_indexer_storage, INDEXER_CFG_KEY, IndexerStorage
+ get_indexer_storage, INDEXER_CFG_KEY
)
+from swh.indexer.storage.interface import IndexerStorageInterface
def get_storage():
@@ -23,7 +24,7 @@
app = RPCServerApp(__name__,
- backend_class=IndexerStorage,
+ backend_class=IndexerStorageInterface,
backend_factory=get_storage)
storage = None
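
On the server side the same interface drives route registration, while backend_factory (get_storage here) still returns a concrete storage. Again a sketch of the pattern only, reusing the remote_api_endpoint stand-in from the previous sketch, and not swh.core's RPCServerApp; ToyRPCServer and its dispatch method are hypothetical.

class ToyRPCServer:
    def __init__(self, backend_class, backend_factory):
        self.backend_factory = backend_factory
        # Map each declared endpoint path to the method name serving it.
        self.routes = {}
        for name in dir(backend_class):
            meth = getattr(backend_class, name)
            path = getattr(meth, '_endpoint_path', None)
            if path is not None:
                self.routes[path] = name

    def dispatch(self, path, *args, **kwargs):
        # A real server would deserialize the request body first; here we
        # simply call the matching method on a fresh backend instance.
        backend = self.backend_factory()
        return getattr(backend, self.routes[path])(*args, **kwargs)
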
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -203,168 +203,36 @@
return True
def content_mimetype_missing(self, mimetypes):
- """Generate mimetypes missing from storage.
-
- Args:
- mimetypes (iterable): iterable of dict with keys:
-
- - **id** (bytes): sha1 identifier
- - **indexer_configuration_id** (int): tool used to compute the
- results
-
- Yields:
- tuple (id, indexer_configuration_id): missing id
-
- """
yield from self._mimetypes.missing(mimetypes)
def content_mimetype_get_range(
self, start, end, indexer_configuration_id, limit=1000):
- """Retrieve mimetypes within range [start, end] bound by limit.
-
- Args:
- **start** (bytes): Starting identifier range (expected smaller
- than end)
- **end** (bytes): Ending identifier range (expected larger
- than start)
- **indexer_configuration_id** (int): The tool used to index data
- **limit** (int): Limit result (default to 1000)
-
- Raises:
- ValueError for limit to None
-
- Returns:
- a dict with keys:
- - **ids** [bytes]: iterable of content ids within the range.
- - **next** (Optional[bytes]): The next range of sha1 starts at
- this sha1 if any
-
- """
return self._mimetypes.get_range(
start, end, indexer_configuration_id, limit)
def content_mimetype_add(self, mimetypes, conflict_update=False):
- """Add mimetypes not present in storage.
-
- Args:
- mimetypes (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1 identifier
- - **mimetype** (bytes): raw content's mimetype
- - **encoding** (bytes): raw content's encoding
- - **indexer_configuration_id** (int): tool's id used to
- compute the results
- - **conflict_update** (bool): Flag to determine if we want to
- overwrite (``True``) or skip duplicates (``False``, the
- default)
-
- """
if not all(isinstance(x['id'], bytes) for x in mimetypes):
raise TypeError('identifiers must be bytes.')
self._mimetypes.add(mimetypes, conflict_update)
- def content_mimetype_get(self, ids, db=None, cur=None):
- """Retrieve full content mimetype per ids.
-
- Args:
- ids (iterable): sha1 identifier
-
- Yields:
- mimetypes (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1 identifier
- - **mimetype** (bytes): raw content's mimetype
- - **encoding** (bytes): raw content's encoding
- - **tool** (dict): Tool used to compute the language
-
- """
+ def content_mimetype_get(self, ids):
yield from self._mimetypes.get(ids)
def content_language_missing(self, languages):
- """List languages missing from storage.
-
- Args:
- languages (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1 identifier
- - **indexer_configuration_id** (int): tool used to compute
- the results
-
- Yields:
- an iterable of missing id for the tuple (id,
- indexer_configuration_id)
-
- """
yield from self._languages.missing(languages)
def content_language_get(self, ids):
- """Retrieve full content language per ids.
-
- Args:
- ids (iterable): sha1 identifier
-
- Yields:
- languages (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1 identifier
- - **lang** (bytes): raw content's language
- - **tool** (dict): Tool used to compute the language
-
- """
yield from self._languages.get(ids)
def content_language_add(self, languages, conflict_update=False):
- """Add languages not present in storage.
-
- Args:
- languages (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1
- - **lang** (bytes): language detected
-
- conflict_update (bool): Flag to determine if we want to
- overwrite (true) or skip duplicates (false, the
- default)
-
- """
if not all(isinstance(x['id'], bytes) for x in languages):
raise TypeError('identifiers must be bytes.')
self._languages.add(languages, conflict_update)
def content_ctags_missing(self, ctags):
- """List ctags missing from storage.
-
- Args:
- ctags (iterable): dicts with keys:
-
- - **id** (bytes): sha1 identifier
- - **indexer_configuration_id** (int): tool used to compute
- the results
-
- Yields:
- an iterable of missing id for the tuple (id,
- indexer_configuration_id)
-
- """
yield from self._content_ctags.missing(ctags)
def content_ctags_get(self, ids):
- """Retrieve ctags per id.
-
- Args:
- ids (iterable): sha1 checksums
-
- Yields:
- Dictionaries with keys:
-
- - **id** (bytes): content's identifier
- - **name** (str): symbol's name
- - **kind** (str): symbol's kind
- - **lang** (str): language for that content
- - **tool** (dict): tool used to compute the ctags' info
-
-
- """
for item in self._content_ctags.get(ids):
for item_ctags_item in item['ctags']:
yield {
@@ -374,35 +242,12 @@
}
def content_ctags_add(self, ctags, conflict_update=False):
- """Add ctags not present in storage
-
- Args:
- ctags (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1
- - **ctags** ([list): List of dictionary with keys: name, kind,
- line, lang
- - **indexer_configuration_id**: tool used to compute the
- results
-
- """
if not all(isinstance(x['id'], bytes) for x in ctags):
raise TypeError('identifiers must be bytes.')
self._content_ctags.add_merge(ctags, conflict_update, 'ctags')
def content_ctags_search(self, expression,
- limit=10, last_sha1=None, db=None, cur=None):
- """Search through content's raw ctags symbols.
-
- Args:
- expression (str): Expression to search for
- limit (int): Number of rows to return (default to 10).
- last_sha1 (str): Offset from which retrieving data (default to '').
-
- Yields:
- rows of ctags including id, name, lang, kind, line, etc...
-
- """
+ limit=10, last_sha1=None):
nb_matches = 0
for ((id_, tool_id), item) in \
sorted(self._content_ctags._data.items()):
@@ -421,19 +266,6 @@
return
def content_fossology_license_get(self, ids):
- """Retrieve licenses per id.
-
- Args:
- ids (iterable): sha1 checksums
-
- Yields:
- dict: ``{id: facts}`` where ``facts`` is a dict with the
- following keys:
-
- - **licenses** ([str]): associated licenses for that content
- - **tool** (dict): Tool used to compute the license
-
- """
# Rewrites the output of SubStorage.get from the old format to
# the new one. SubStorage.get should be updated once all other
# *_get methods use the new format.
@@ -445,239 +277,52 @@
yield {id_: facts}
def content_fossology_license_add(self, licenses, conflict_update=False):
- """Add licenses not present in storage.
-
- Args:
- licenses (iterable): dictionaries with keys:
-
- - **id**: sha1
- - **licenses** ([bytes]): List of licenses associated to sha1
- - **tool** (str): nomossa
-
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
- Returns:
- list: content_license entries which failed due to unknown licenses
-
- """
if not all(isinstance(x['id'], bytes) for x in licenses):
raise TypeError('identifiers must be bytes.')
self._licenses.add_merge(licenses, conflict_update, 'licenses')
def content_fossology_license_get_range(
self, start, end, indexer_configuration_id, limit=1000):
- """Retrieve licenses within range [start, end] bound by limit.
-
- Args:
- **start** (bytes): Starting identifier range (expected smaller
- than end)
- **end** (bytes): Ending identifier range (expected larger
- than start)
- **indexer_configuration_id** (int): The tool used to index data
- **limit** (int): Limit result (default to 1000)
-
- Raises:
- ValueError for limit to None
-
- Returns:
- a dict with keys:
- - **ids** [bytes]: iterable of content ids within the range.
- - **next** (Optional[bytes]): The next range of sha1 starts at
- this sha1 if any
-
- """
return self._licenses.get_range(
start, end, indexer_configuration_id, limit)
def content_metadata_missing(self, metadata):
- """List metadata missing from storage.
-
- Args:
- metadata (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1 identifier
- - **indexer_configuration_id** (int): tool used to compute
- the results
-
- Yields:
- missing sha1s
-
- """
yield from self._content_metadata.missing(metadata)
def content_metadata_get(self, ids):
- """Retrieve metadata per id.
-
- Args:
- ids (iterable): sha1 checksums
-
- Yields:
- dictionaries with the following keys:
-
- - **id** (bytes)
- - **metadata** (str): associated metadata
- - **tool** (dict): tool used to compute metadata
-
- """
yield from self._content_metadata.get(ids)
def content_metadata_add(self, metadata, conflict_update=False):
- """Add metadata not present in storage.
-
- Args:
- metadata (iterable): dictionaries with keys:
-
- - **id**: sha1
- - **metadata**: arbitrary dict
- - **indexer_configuration_id**: tool used to compute the
- results
-
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
- """
if not all(isinstance(x['id'], bytes) for x in metadata):
raise TypeError('identifiers must be bytes.')
self._content_metadata.add(metadata, conflict_update)
def revision_intrinsic_metadata_missing(self, metadata):
- """List metadata missing from storage.
-
- Args:
- metadata (iterable): dictionaries with keys:
-
- - **id** (bytes): sha1_git revision identifier
- - **indexer_configuration_id** (int): tool used to compute
- the results
-
- Yields:
- missing ids
-
- """
yield from self._revision_intrinsic_metadata.missing(metadata)
def revision_intrinsic_metadata_get(self, ids):
- """Retrieve revision metadata per id.
-
- Args:
- ids (iterable): sha1 checksums
-
- Yields:
- dictionaries with the following keys:
-
- - **id** (bytes)
- - **metadata** (str): associated metadata
- - **tool** (dict): tool used to compute metadata
- - **mappings** (List[str]): list of mappings used to translate
- these metadata
-
- """
yield from self._revision_intrinsic_metadata.get(ids)
def revision_intrinsic_metadata_add(self, metadata, conflict_update=False):
- """Add metadata not present in storage.
-
- Args:
- metadata (iterable): dictionaries with keys:
-
- - **id**: sha1_git of revision
- - **metadata**: arbitrary dict
- - **indexer_configuration_id**: tool used to compute metadata
- - **mappings** (List[str]): list of mappings used to translate
- these metadata
-
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
- """
if not all(isinstance(x['id'], bytes) for x in metadata):
raise TypeError('identifiers must be bytes.')
self._revision_intrinsic_metadata.add(metadata, conflict_update)
def revision_intrinsic_metadata_delete(self, entries):
- """Remove revision metadata from the storage.
-
- Args:
- entries (dict): dictionaries with the following keys:
- - **revision** (int): origin identifier
- - **id** (int): tool used to compute metadata
- """
self._revision_intrinsic_metadata.delete(entries)
def origin_intrinsic_metadata_get(self, ids):
- """Retrieve origin metadata per id.
-
- Args:
- ids (iterable): origin identifiers
-
- Yields:
- list: dictionaries with the following keys:
-
- - **id** (str): origin url
- - **from_revision** (bytes): which revision this metadata
- was extracted from
- - **metadata** (str): associated metadata
- - **tool** (dict): tool used to compute metadata
- - **mappings** (List[str]): list of mappings used to translate
- these metadata
-
- """
yield from self._origin_intrinsic_metadata.get(ids)
def origin_intrinsic_metadata_add(self, metadata,
conflict_update=False):
- """Add origin metadata not present in storage.
-
- Args:
- metadata (iterable): dictionaries with keys:
-
- - **id**: origin url
- - **from_revision**: sha1 id of the revision used to generate
- these metadata.
- - **metadata**: arbitrary dict
- - **indexer_configuration_id**: tool used to compute metadata
- - **mappings** (List[str]): list of mappings used to translate
- these metadata
-
- conflict_update: Flag to determine if we want to overwrite (true)
- or skip duplicates (false, the default)
-
- """
self._origin_intrinsic_metadata.add(metadata, conflict_update)
def origin_intrinsic_metadata_delete(self, entries):
- """Remove origin metadata from the storage.
-
- Args:
- entries (dict): dictionaries with the following keys:
-
- - **id** (str): origin url
- - **indexer_configuration_id** (int): tool used to compute
- metadata
- """
self._origin_intrinsic_metadata.delete(entries)
def origin_intrinsic_metadata_search_fulltext(
self, conjunction, limit=100):
- """Returns the list of origins whose metadata contain all the terms.
-
- Args:
- conjunction (List[str]): List of terms to be searched for.
- limit (int): The maximum number of results to return
-
- Yields:
- list: dictionaries with the following keys:
-
- - **id** (str): origin url
- - **from_revision** (bytes): which revision this metadata
- was extracted from
- - **metadata** (str): associated metadata
- - **tool** (dict): tool used to compute metadata
- - **mappings** (List[str]): list of mappings used to translate
- these metadata
-
- """
# A very crude fulltext search implementation, but that's enough
# to work on English metadata
tokens_re = re.compile('[a-zA-Z0-9]+')
@@ -711,34 +356,7 @@
def origin_intrinsic_metadata_search_by_producer(
self, page_token='', limit=100, ids_only=False,
- mappings=None, tool_ids=None,
- db=None, cur=None):
- """Returns the list of origins whose metadata contain all the terms.
-
- Args:
- page_token (str): Opaque token used for pagination.
- limit (int): The maximum number of results to return
- ids_only (bool): Determines whether only origin ids are returned
- or the content as well
- mappings (List[str]): Returns origins whose intrinsic metadata
- were generated using at least one of these mappings.
-
- Returns:
- dict: dict with the following keys:
- - **next_page_token** (str, optional): opaque token to be used as
- `page_token` for retrieveing the next page.
- - **origins** (list): list of origin url (str) if `ids_only=True`
- else dictionaries with the following keys:
-
- - **id** (str): origin urls
- - **from_revision**: sha1 id of the revision used to generate
- these metadata.
- - **metadata** (str): associated metadata
- - **tool** (dict): tool used to compute metadata
- - **mappings** (List[str]): list of mappings used to translate
- these metadata
-
- """
+ mappings=None, tool_ids=None):
assert isinstance(page_token, str)
nb_results = 0
if mappings is not None:
@@ -771,20 +389,6 @@
return result
def origin_intrinsic_metadata_stats(self):
- """Returns statistics on stored intrinsic metadata.
-
- Returns:
- dict: dictionary with keys:
-
- - total (int): total number of origins that were indexed
- (possibly yielding an empty metadata dictionary)
- - non_empty (int): total number of origins that we extracted
- a non-empty metadata dictionary from
- - per_mapping (dict): a dictionary with mapping names as
- keys and number of origins whose indexing used this
- mapping. Note that indexing a given origin may use
- 0, 1, or many mappings.
- """
mapping_count = {m: 0 for m in MAPPING_NAMES}
total = non_empty = 0
for data in self._origin_intrinsic_metadata.get_all():
@@ -800,23 +404,6 @@
}
def indexer_configuration_add(self, tools):
- """Add new tools to the storage.
-
- Args:
- tools ([dict]): List of dictionary representing tool to
- insert in the db. Dictionary with the following keys:
-
- - **tool_name** (str): tool's name
- - **tool_version** (str): tool's version
- - **tool_configuration** (dict): tool's configuration
- (free form dict)
-
- Returns:
- list: List of dict inserted in the db (holding the id key as
- well). The order of the list is not guaranteed to match
- the order of the initial list.
-
- """
inserted = []
for tool in tools:
tool = tool.copy()
@@ -827,21 +414,6 @@
return inserted
def indexer_configuration_get(self, tool):
- """Retrieve tool information.
-
- Args:
- tool (dict): Dictionary representing a tool with the
- following keys:
-
- - **tool_name** (str): tool's name
- - **tool_version** (str): tool's version
- - **tool_configuration** (dict): tool's configuration
- (free form dict)
-
- Returns:
- The same dictionary with an `id` key, None otherwise.
-
- """
return self._tools.get(self._tool_key(tool))
def _tool_key(self, tool):
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/interface.py
copy from swh/indexer/storage/__init__.py
copy to swh/indexer/storage/interface.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/interface.py
@@ -1,132 +1,20 @@
-# Copyright (C) 2015-2018 The Software Heritage developers
+# Copyright (C) 2015-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-import json
-import psycopg2
-
-from collections import defaultdict
-
from swh.core.api import remote_api_endpoint
-from swh.storage.common import db_transaction_generator, db_transaction
-from swh.storage.exc import StorageDBError
-from .db import Db
-
-from . import converters
-
-
-INDEXER_CFG_KEY = 'indexer_storage'
-
-
-MAPPING_NAMES = ['codemeta', 'gemspec', 'maven', 'npm', 'pkg-info']
-
-
-def get_indexer_storage(cls, args):
- """Get an indexer storage object of class `storage_class` with
- arguments `storage_args`.
-
- Args:
- cls (str): storage's class, either 'local' or 'remote'
- args (dict): dictionary of arguments passed to the
- storage class constructor
-
- Returns:
- an instance of swh.indexer's storage (either local or remote)
-
- Raises:
- ValueError if passed an unknown storage class.
-
- """
- if cls == 'remote':
- from .api.client import RemoteStorage as IndexerStorage
- elif cls == 'local':
- from . import IndexerStorage
- elif cls == 'memory':
- from .in_memory import IndexerStorage
- else:
- raise ValueError('Unknown indexer storage class `%s`' % cls)
-
- return IndexerStorage(**args)
-
-
-def _check_id_duplicates(data):
- """
- If any two dictionaries in `data` have the same id, raises
- a `ValueError`.
-
- Values associated to the key must be hashable.
-
- Args:
- data (List[dict]): List of dictionaries to be inserted
-
- >>> _check_id_duplicates([
- ... {'id': 'foo', 'data': 'spam'},
- ... {'id': 'bar', 'data': 'egg'},
- ... ])
- >>> _check_id_duplicates([
- ... {'id': 'foo', 'data': 'spam'},
- ... {'id': 'foo', 'data': 'egg'},
- ... ])
- Traceback (most recent call last):
- ...
- ValueError: The same id is present more than once.
- """
- if len({item['id'] for item in data}) < len(data):
- raise ValueError('The same id is present more than once.')
-class IndexerStorage:
- """SWH Indexer Storage
-
- """
- def __init__(self, db, min_pool_conns=1, max_pool_conns=10):
- """
- Args:
- db_conn: either a libpq connection string, or a psycopg2 connection
-
- """
- try:
- if isinstance(db, psycopg2.extensions.connection):
- self._pool = None
- self._db = Db(db)
- else:
- self._pool = psycopg2.pool.ThreadedConnectionPool(
- min_pool_conns, max_pool_conns, db
- )
- self._db = None
- except psycopg2.OperationalError as e:
- raise StorageDBError(e)
-
- def get_db(self):
- if self._db:
- return self._db
- return Db.from_pool(self._pool)
-
- def put_db(self, db):
- if db is not self._db:
- db.put_conn()
-
+class IndexerStorageInterface:
@remote_api_endpoint('check_config')
- @db_transaction()
- def check_config(self, *, check_write, db=None, cur=None):
+ def check_config(self, *, check_write):
"""Check that the storage is configured and ready to go."""
- # Check permissions on one of the tables
- if check_write:
- check = 'INSERT'
- else:
- check = 'SELECT'
-
- cur.execute(
- "select has_table_privilege(current_user, 'content_mimetype', %s)", # noqa
- (check,)
- )
- return cur.fetchone()[0]
+ ...
@remote_api_endpoint('content_mimetype/missing')
- @db_transaction_generator()
- def content_mimetype_missing(self, mimetypes, db=None, cur=None):
+ def content_mimetype_missing(self, mimetypes):
"""Generate mimetypes missing from storage.
Args:
@@ -140,13 +28,11 @@
tuple (id, indexer_configuration_id): missing id
"""
- for obj in db.content_mimetype_missing_from_list(mimetypes, cur):
- yield obj[0]
+ ...
def _content_get_range(self, content_type, start, end,
indexer_configuration_id, limit=1000,
- with_textual_data=False,
- db=None, cur=None):
+ with_textual_data=False):
"""Retrieve ids of type content_type within range [start, end] bound
by limit.
@@ -175,34 +61,11 @@
this sha1 if any
"""
- if limit is None:
- raise ValueError('Development error: limit should not be None')
- if content_type not in db.content_indexer_names:
- err = 'Development error: Wrong type. Should be one of [%s]' % (
- ','.join(db.content_indexer_names))
- raise ValueError(err)
-
- ids = []
- next_id = None
- for counter, obj in enumerate(db.content_get_range(
- content_type, start, end, indexer_configuration_id,
- limit=limit+1, with_textual_data=with_textual_data, cur=cur)):
- _id = obj[0]
- if counter >= limit:
- next_id = _id
- break
-
- ids.append(_id)
-
- return {
- 'ids': ids,
- 'next': next_id
- }
+ ...
@remote_api_endpoint('content_mimetype/range')
- @db_transaction()
def content_mimetype_get_range(self, start, end, indexer_configuration_id,
- limit=1000, db=None, cur=None):
+ limit=1000):
"""Retrieve mimetypes within range [start, end] bound by limit.
Args:
@@ -223,14 +86,10 @@
this sha1 if any
"""
- return self._content_get_range('mimetype', start, end,
- indexer_configuration_id, limit=limit,
- db=db, cur=cur)
+ ...
@remote_api_endpoint('content_mimetype/add')
- @db_transaction()
- def content_mimetype_add(self, mimetypes, conflict_update=False, db=None,
- cur=None):
+ def content_mimetype_add(self, mimetypes, conflict_update=False):
"""Add mimetypes not present in storage.
Args:
@@ -246,17 +105,10 @@
default)
"""
- _check_id_duplicates(mimetypes)
- mimetypes.sort(key=lambda m: m['id'])
- db.mktemp_content_mimetype(cur)
- db.copy_to(mimetypes, 'tmp_content_mimetype',
- ['id', 'mimetype', 'encoding', 'indexer_configuration_id'],
- cur)
- db.content_mimetype_add_from_temp(conflict_update, cur)
+ ...
@remote_api_endpoint('content_mimetype')
- @db_transaction_generator()
- def content_mimetype_get(self, ids, db=None, cur=None):
+ def content_mimetype_get(self, ids):
"""Retrieve full content mimetype per ids.
Args:
@@ -271,13 +123,10 @@
- **tool** (dict): Tool used to compute the language
"""
- for c in db.content_mimetype_get_from_list(ids, cur):
- yield converters.db_to_mimetype(
- dict(zip(db.content_mimetype_cols, c)))
+ ...
@remote_api_endpoint('content_language/missing')
- @db_transaction_generator()
- def content_language_missing(self, languages, db=None, cur=None):
+ def content_language_missing(self, languages):
"""List languages missing from storage.
Args:
@@ -292,12 +141,10 @@
indexer_configuration_id)
"""
- for obj in db.content_language_missing_from_list(languages, cur):
- yield obj[0]
+ ...
@remote_api_endpoint('content_language')
- @db_transaction_generator()
- def content_language_get(self, ids, db=None, cur=None):
+ def content_language_get(self, ids):
"""Retrieve full content language per ids.
Args:
@@ -311,14 +158,10 @@
- **tool** (dict): Tool used to compute the language
"""
- for c in db.content_language_get_from_list(ids, cur):
- yield converters.db_to_language(
- dict(zip(db.content_language_cols, c)))
+ ...
@remote_api_endpoint('content_language/add')
- @db_transaction()
- def content_language_add(self, languages, conflict_update=False, db=None,
- cur=None):
+ def content_language_add(self, languages, conflict_update=False):
"""Add languages not present in storage.
Args:
@@ -332,24 +175,10 @@
default)
"""
- _check_id_duplicates(languages)
- languages.sort(key=lambda m: m['id'])
- db.mktemp_content_language(cur)
- # empty language is mapped to 'unknown'
- db.copy_to(
- ({
- 'id': l['id'],
- 'lang': 'unknown' if not l['lang'] else l['lang'],
- 'indexer_configuration_id': l['indexer_configuration_id'],
- } for l in languages),
- 'tmp_content_language',
- ['id', 'lang', 'indexer_configuration_id'], cur)
-
- db.content_language_add_from_temp(conflict_update, cur)
+ ...
@remote_api_endpoint('content/ctags/missing')
- @db_transaction_generator()
- def content_ctags_missing(self, ctags, db=None, cur=None):
+ def content_ctags_missing(self, ctags):
"""List ctags missing from storage.
Args:
@@ -364,12 +193,10 @@
indexer_configuration_id)
"""
- for obj in db.content_ctags_missing_from_list(ctags, cur):
- yield obj[0]
+ ...
@remote_api_endpoint('content/ctags')
- @db_transaction_generator()
- def content_ctags_get(self, ids, db=None, cur=None):
+ def content_ctags_get(self, ids):
"""Retrieve ctags per id.
Args:
@@ -386,13 +213,10 @@
"""
- for c in db.content_ctags_get_from_list(ids, cur):
- yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, c)))
+ ...
@remote_api_endpoint('content/ctags/add')
- @db_transaction()
- def content_ctags_add(self, ctags, conflict_update=False, db=None,
- cur=None):
+ def content_ctags_add(self, ctags, conflict_update=False):
"""Add ctags not present in storage
Args:
@@ -403,29 +227,11 @@
line, lang
"""
- _check_id_duplicates(ctags)
- ctags.sort(key=lambda m: m['id'])
-
- def _convert_ctags(__ctags):
- """Convert ctags dict to list of ctags.
-
- """
- for ctags in __ctags:
- yield from converters.ctags_to_db(ctags)
-
- db.mktemp_content_ctags(cur)
- db.copy_to(list(_convert_ctags(ctags)),
- tblname='tmp_content_ctags',
- columns=['id', 'name', 'kind', 'line',
- 'lang', 'indexer_configuration_id'],
- cur=cur)
-
- db.content_ctags_add_from_temp(conflict_update, cur)
+ ...
@remote_api_endpoint('content/ctags/search')
- @db_transaction_generator()
def content_ctags_search(self, expression,
- limit=10, last_sha1=None, db=None, cur=None):
+ limit=10, last_sha1=None):
"""Search through content's raw ctags symbols.
Args:
@@ -437,13 +243,10 @@
rows of ctags including id, name, lang, kind, line, etc...
"""
- for obj in db.content_ctags_search(expression, last_sha1, limit,
- cur=cur):
- yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, obj)))
+ ...
@remote_api_endpoint('content/fossology_license')
- @db_transaction_generator()
- def content_fossology_license_get(self, ids, db=None, cur=None):
+ def content_fossology_license_get(self, ids):
"""Retrieve licenses per id.
Args:
@@ -457,20 +260,10 @@
- **tool** (dict): Tool used to compute the license
"""
- d = defaultdict(list)
- for c in db.content_fossology_license_get_from_list(ids, cur):
- license = dict(zip(db.content_fossology_license_cols, c))
-
- id_ = license['id']
- d[id_].append(converters.db_to_fossology_license(license))
-
- for id_, facts in d.items():
- yield {id_: facts}
+ ...
@remote_api_endpoint('content/fossology_license/add')
- @db_transaction()
- def content_fossology_license_add(self, licenses, conflict_update=False,
- db=None, cur=None):
+ def content_fossology_license_add(self, licenses, conflict_update=False):
"""Add licenses not present in storage.
Args:
@@ -487,26 +280,12 @@
list: content_license entries which failed due to unknown licenses
"""
- _check_id_duplicates(licenses)
- licenses.sort(key=lambda m: m['id'])
- db.mktemp_content_fossology_license(cur)
- db.copy_to(
- ({
- 'id': sha1['id'],
- 'indexer_configuration_id': sha1['indexer_configuration_id'],
- 'license': license,
- } for sha1 in licenses
- for license in sha1['licenses']),
- tblname='tmp_content_fossology_license',
- columns=['id', 'license', 'indexer_configuration_id'],
- cur=cur)
- db.content_fossology_license_add_from_temp(conflict_update, cur)
+ ...
@remote_api_endpoint('content/fossology_license/range')
- @db_transaction()
def content_fossology_license_get_range(
self, start, end, indexer_configuration_id,
- limit=1000, db=None, cur=None):
+ limit=1000):
"""Retrieve licenses within range [start, end] bound by limit.
Args:
@@ -527,13 +306,10 @@
this sha1 if any
"""
- return self._content_get_range('fossology_license', start, end,
- indexer_configuration_id, limit=limit,
- with_textual_data=True, db=db, cur=cur)
+ ...
@remote_api_endpoint('content_metadata/missing')
- @db_transaction_generator()
- def content_metadata_missing(self, metadata, db=None, cur=None):
+ def content_metadata_missing(self, metadata):
"""List metadata missing from storage.
Args:
@@ -547,12 +323,10 @@
missing sha1s
"""
- for obj in db.content_metadata_missing_from_list(metadata, cur):
- yield obj[0]
+ ...
@remote_api_endpoint('content_metadata')
- @db_transaction_generator()
- def content_metadata_get(self, ids, db=None, cur=None):
+ def content_metadata_get(self, ids):
"""Retrieve metadata per id.
Args:
@@ -566,14 +340,10 @@
tool (dict): tool used to compute metadata
"""
- for c in db.content_metadata_get_from_list(ids, cur):
- yield converters.db_to_metadata(
- dict(zip(db.content_metadata_cols, c)))
+ ...
@remote_api_endpoint('content_metadata/add')
- @db_transaction()
- def content_metadata_add(self, metadata, conflict_update=False, db=None,
- cur=None):
+ def content_metadata_add(self, metadata, conflict_update=False):
"""Add metadata not present in storage.
Args:
@@ -586,19 +356,10 @@
or skip duplicates (false, the default)
"""
- _check_id_duplicates(metadata)
- metadata.sort(key=lambda m: m['id'])
-
- db.mktemp_content_metadata(cur)
-
- db.copy_to(metadata, 'tmp_content_metadata',
- ['id', 'metadata', 'indexer_configuration_id'],
- cur)
- db.content_metadata_add_from_temp(conflict_update, cur)
+ ...
@remote_api_endpoint('revision_intrinsic_metadata/missing')
- @db_transaction_generator()
- def revision_intrinsic_metadata_missing(self, metadata, db=None, cur=None):
+ def revision_intrinsic_metadata_missing(self, metadata):
"""List metadata missing from storage.
Args:
@@ -612,13 +373,10 @@
missing ids
"""
- for obj in db.revision_intrinsic_metadata_missing_from_list(
- metadata, cur):
- yield obj[0]
+ ...
@remote_api_endpoint('revision_intrinsic_metadata')
- @db_transaction_generator()
- def revision_intrinsic_metadata_get(self, ids, db=None, cur=None):
+ def revision_intrinsic_metadata_get(self, ids):
"""Retrieve revision metadata per id.
Args:
@@ -634,14 +392,10 @@
these metadata
"""
- for c in db.revision_intrinsic_metadata_get_from_list(ids, cur):
- yield converters.db_to_metadata(
- dict(zip(db.revision_intrinsic_metadata_cols, c)))
+ ...
@remote_api_endpoint('revision_intrinsic_metadata/add')
- @db_transaction()
- def revision_intrinsic_metadata_add(self, metadata, conflict_update=False,
- db=None, cur=None):
+ def revision_intrinsic_metadata_add(self, metadata, conflict_update=False):
"""Add metadata not present in storage.
Args:
@@ -657,20 +411,10 @@
or skip duplicates (false, the default)
"""
- _check_id_duplicates(metadata)
- metadata.sort(key=lambda m: m['id'])
-
- db.mktemp_revision_intrinsic_metadata(cur)
-
- db.copy_to(metadata, 'tmp_revision_intrinsic_metadata',
- ['id', 'metadata', 'mappings',
- 'indexer_configuration_id'],
- cur)
- db.revision_intrinsic_metadata_add_from_temp(conflict_update, cur)
+ ...
@remote_api_endpoint('revision_intrinsic_metadata/delete')
- @db_transaction()
- def revision_intrinsic_metadata_delete(self, entries, db=None, cur=None):
+ def revision_intrinsic_metadata_delete(self, entries):
"""Remove revision metadata from the storage.
Args:
@@ -680,11 +424,10 @@
- **indexer_configuration_id** (int): tool used to compute
metadata
"""
- db.revision_intrinsic_metadata_delete(entries, cur)
+ ...
@remote_api_endpoint('origin_intrinsic_metadata')
- @db_transaction_generator()
- def origin_intrinsic_metadata_get(self, ids, db=None, cur=None):
+ def origin_intrinsic_metadata_get(self, ids):
"""Retrieve origin metadata per id.
Args:
@@ -702,15 +445,11 @@
these metadata
"""
- for c in db.origin_intrinsic_metadata_get_from_list(ids, cur):
- yield converters.db_to_metadata(
- dict(zip(db.origin_intrinsic_metadata_cols, c)))
+ ...
@remote_api_endpoint('origin_intrinsic_metadata/add')
- @db_transaction()
def origin_intrinsic_metadata_add(self, metadata,
- conflict_update=False, db=None,
- cur=None):
+ conflict_update=False):
"""Add origin metadata not present in storage.
Args:
@@ -728,22 +467,11 @@
or skip duplicates (false, the default)
"""
- _check_id_duplicates(metadata)
- metadata.sort(key=lambda m: m['id'])
-
- db.mktemp_origin_intrinsic_metadata(cur)
-
- db.copy_to(metadata, 'tmp_origin_intrinsic_metadata',
- ['id', 'metadata',
- 'indexer_configuration_id',
- 'from_revision', 'mappings'],
- cur)
- db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur)
+ ...
@remote_api_endpoint('origin_intrinsic_metadata/delete')
- @db_transaction()
def origin_intrinsic_metadata_delete(
- self, entries, db=None, cur=None):
+ self, entries):
"""Remove origin metadata from the storage.
Args:
@@ -753,12 +481,11 @@
- **indexer_configuration_id** (int): tool used to compute
metadata
"""
- db.origin_intrinsic_metadata_delete(entries, cur)
+ ...
@remote_api_endpoint('origin_intrinsic_metadata/search/fulltext')
- @db_transaction_generator()
def origin_intrinsic_metadata_search_fulltext(
- self, conjunction, limit=100, db=None, cur=None):
+ self, conjunction, limit=100):
"""Returns the list of origins whose metadata contain all the terms.
Args:
@@ -777,17 +504,12 @@
these metadata
"""
- for c in db.origin_intrinsic_metadata_search_fulltext(
- conjunction, limit=limit, cur=cur):
- yield converters.db_to_metadata(
- dict(zip(db.origin_intrinsic_metadata_cols, c)))
+ ...
@remote_api_endpoint('origin_intrinsic_metadata/search/by_producer')
- @db_transaction()
def origin_intrinsic_metadata_search_by_producer(
self, page_token='', limit=100, ids_only=False,
- mappings=None, tool_ids=None,
- db=None, cur=None):
+ mappings=None, tool_ids=None):
"""Returns the list of origins whose metadata contain all the terms.
Args:
@@ -815,29 +537,11 @@
these metadata
"""
- assert isinstance(page_token, str)
- # we go to limit+1 to check whether we should add next_page_token in
- # the response
- res = db.origin_intrinsic_metadata_search_by_producer(
- page_token, limit + 1, ids_only, mappings, tool_ids, cur)
- result = {}
- if ids_only:
- result['origins'] = [origin for (origin,) in res]
- if len(result['origins']) > limit:
- result['origins'][limit:] = []
- result['next_page_token'] = result['origins'][-1]
- else:
- result['origins'] = [converters.db_to_metadata(
- dict(zip(db.origin_intrinsic_metadata_cols, c)))for c in res]
- if len(result['origins']) > limit:
- result['origins'][limit:] = []
- result['next_page_token'] = result['origins'][-1]['id']
- return result
+ ...
@remote_api_endpoint('origin_intrinsic_metadata/stats')
- @db_transaction()
def origin_intrinsic_metadata_stats(
- self, db=None, cur=None):
+ self):
"""Returns counts of indexed metadata per origins, broken down
into metadata types.
@@ -853,36 +557,10 @@
mapping. Note that indexing a given origin may use
0, 1, or many mappings.
"""
- mapping_names = [m for m in MAPPING_NAMES]
- select_parts = []
-
- # Count rows for each mapping
- for mapping_name in mapping_names:
- select_parts.append((
- "sum(case when (mappings @> ARRAY['%s']) "
- " then 1 else 0 end)"
- ) % mapping_name)
-
- # Total
- select_parts.append("sum(1)")
-
- # Rows whose metadata has at least one key that is not '@context'
- select_parts.append(
- "sum(case when ('{}'::jsonb @> (metadata - '@context')) "
- " then 0 else 1 end)")
- cur.execute('select ' + ', '.join(select_parts)
- + ' from origin_intrinsic_metadata')
- results = dict(zip(mapping_names + ['total', 'non_empty'],
- cur.fetchone()))
- return {
- 'total': results.pop('total'),
- 'non_empty': results.pop('non_empty'),
- 'per_mapping': results,
- }
+ ...
@remote_api_endpoint('indexer_configuration/add')
- @db_transaction_generator()
- def indexer_configuration_add(self, tools, db=None, cur=None):
+ def indexer_configuration_add(self, tools):
"""Add new tools to the storage.
Args:
@@ -900,18 +578,10 @@
the order of the initial list.
"""
- db.mktemp_indexer_configuration(cur)
- db.copy_to(tools, 'tmp_indexer_configuration',
- ['tool_name', 'tool_version', 'tool_configuration'],
- cur)
-
- tools = db.indexer_configuration_add_from_temp(cur)
- for line in tools:
- yield dict(zip(db.indexer_configuration_cols, line))
+ ...
@remote_api_endpoint('indexer_configuration/data')
- @db_transaction()
- def indexer_configuration_get(self, tool, db=None, cur=None):
+ def indexer_configuration_get(self, tool):
"""Retrieve tool information.
Args:
@@ -927,12 +597,4 @@
The same dictionary with an `id` key, None otherwise.
"""
- tool_conf = tool['tool_configuration']
- if isinstance(tool_conf, dict):
- tool_conf = json.dumps(tool_conf)
- idx = db.indexer_configuration_get(tool['tool_name'],
- tool['tool_version'],
- tool_conf)
- if not idx:
- return None
- return dict(zip(db.indexer_configuration_cols, idx))
+ ...
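
The net effect of this copy is a pure declaration module: IndexerStorageInterface keeps the remote_api_endpoint paths, the signatures and the docstrings, with every body reduced to `...`, while the concrete backends (PostgreSQL-backed and in-memory) keep only their implementations. Below is a condensed sketch of that split, with made-up class names and a trivial in-memory body, assuming swh.core is installed.

from swh.core.api import remote_api_endpoint

class ToyStorageInterface:
    @remote_api_endpoint('content_mimetype/missing')
    def content_mimetype_missing(self, mimetypes):
        """Generate mimetypes missing from storage."""
        ...

class ToyInMemoryStorage:
    # One possible backend: same method names and signatures, real bodies.
    def __init__(self):
        self._seen = set()

    def content_mimetype_missing(self, mimetypes):
        for m in mimetypes:
            if (m['id'], m['indexer_configuration_id']) not in self._seen:
                yield m['id']
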
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -3,10 +3,15 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import inspect
import threading
+
import pytest
+
from swh.model.hashutil import hash_to_bytes
+from swh.indexer.storage.interface import IndexerStorageInterface
+
def prepare_mimetypes_from(fossology_licenses):
"""Fossology license needs some consistent data in db to run.
@@ -1853,3 +1858,31 @@
storage = swh_indexer_storage
assert storage.check_config(check_write=True)
assert storage.check_config(check_write=False)
+
+ def test_types(self, swh_indexer_storage):
+ """Checks all methods of StorageInterface are implemented by this
+ backend, and that they have the same signature."""
+ # Create an instance of the protocol (which cannot be instantiated
+ # directly, so this creates a subclass, then instantiates it)
+ interface = type('_', (IndexerStorageInterface,), {})()
+
+ assert 'content_mimetype_add' in dir(interface)
+
+ missing_methods = []
+
+ for meth_name in dir(interface):
+ if meth_name.startswith('_'):
+ continue
+ interface_meth = getattr(interface, meth_name)
+ try:
+ concrete_meth = getattr(swh_indexer_storage, meth_name)
+ except AttributeError:
+ missing_methods.append(meth_name)
+ continue
+
+ expected_signature = inspect.signature(interface_meth)
+ actual_signature = inspect.signature(concrete_meth)
+
+ assert expected_signature == actual_signature, meth_name
+
+ assert missing_methods == []
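
The new test_types check relies on inspect.signature comparing parameter names, order and defaults, so a backend that drops or renames a parameter fails even if it would still answer the same calls. A standalone sketch of the same technique, with stand-in classes unrelated to the real storage:

import inspect

class Interface:
    def get(self, ids, limit=10):
        ...

class GoodBackend:
    def get(self, ids, limit=10):
        return list(ids)[:limit]

class BadBackend:
    def get(self, ids):          # missing `limit` -> signatures differ
        return list(ids)

def check(interface_cls, backend_cls):
    # Every public method of the interface must exist on the backend with
    # an identical signature.
    for name, meth in inspect.getmembers(interface_cls, inspect.isfunction):
        if name.startswith('_'):
            continue
        concrete = getattr(backend_cls, name, None)
        assert concrete is not None, 'missing %s' % name
        assert inspect.signature(meth) == inspect.signature(concrete), name

check(Interface, GoodBackend)            # passes silently
try:
    check(Interface, BadBackend)
except AssertionError as exc:
    print('signature mismatch:', exc)    # prints: signature mismatch: get
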
Attached to D2609: Move IndexerStorage documentation and endpoint paths to a new IndexerStorageInterface class.