diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,4 +1,4 @@ -swh.core[db,http] >= 0.0.65 +swh.core[db,http] >= 0.0.87 swh.model >= 0.0.15 swh.objstorage >= 0.0.28 swh.scheduler >= 0.0.47 diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -1,4 +1,4 @@ -# Copyright (C) 2015-2018 The Software Heritage developers +# Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -9,7 +9,6 @@ from collections import defaultdict -from swh.core.api import remote_api_endpoint from swh.storage.common import db_transaction_generator, db_transaction from swh.storage.exc import StorageDBError from .db import Db @@ -108,10 +107,8 @@ if db is not self._db: db.put_conn() - @remote_api_endpoint('check_config') @db_transaction() def check_config(self, *, check_write, db=None, cur=None): - """Check that the storage is configured and ready to go.""" # Check permissions on one of the tables if check_write: check = 'INSERT' @@ -124,22 +121,8 @@ ) return cur.fetchone()[0] - @remote_api_endpoint('content_mimetype/missing') @db_transaction_generator() def content_mimetype_missing(self, mimetypes, db=None, cur=None): - """Generate mimetypes missing from storage. - - Args: - mimetypes (iterable): iterable of dict with keys: - - - **id** (bytes): sha1 identifier - - **indexer_configuration_id** (int): tool used to compute the - results - - Yields: - tuple (id, indexer_configuration_id): missing id - - """ for obj in db.content_mimetype_missing_from_list(mimetypes, cur): yield obj[0] @@ -147,34 +130,6 @@ indexer_configuration_id, limit=1000, with_textual_data=False, db=None, cur=None): - """Retrieve ids of type content_type within range [start, end] bound - by limit. - - Args: - **content_type** (str): content's type (mimetype, language, etc...) - **start** (bytes): Starting identifier range (expected smaller - than end) - **end** (bytes): Ending identifier range (expected larger - than start) - **indexer_configuration_id** (int): The tool used to index data - **limit** (int): Limit result (default to 1000) - **with_textual_data** (bool): Deal with only textual - content (True) or all - content (all contents by - defaults, False) - - Raises: - ValueError for; - - limit to None - - wrong content_type provided - - Returns: - a dict with keys: - - **ids** [bytes]: iterable of content ids within the range. - - **next** (Optional[bytes]): The next range of sha1 starts at - this sha1 if any - - """ if limit is None: raise ValueError('Development error: limit should not be None') if content_type not in db.content_indexer_names: @@ -199,53 +154,16 @@ 'next': next_id } - @remote_api_endpoint('content_mimetype/range') @db_transaction() def content_mimetype_get_range(self, start, end, indexer_configuration_id, limit=1000, db=None, cur=None): - """Retrieve mimetypes within range [start, end] bound by limit. 
- - Args: - **start** (bytes): Starting identifier range (expected smaller - than end) - **end** (bytes): Ending identifier range (expected larger - than start) - **indexer_configuration_id** (int): The tool used to index data - **limit** (int): Limit result (default to 1000) - - Raises: - ValueError for limit to None - - Returns: - a dict with keys: - - **ids** [bytes]: iterable of content ids within the range. - - **next** (Optional[bytes]): The next range of sha1 starts at - this sha1 if any - - """ return self._content_get_range('mimetype', start, end, indexer_configuration_id, limit=limit, db=db, cur=cur) - @remote_api_endpoint('content_mimetype/add') @db_transaction() def content_mimetype_add(self, mimetypes, conflict_update=False, db=None, cur=None): - """Add mimetypes not present in storage. - - Args: - mimetypes (iterable): dictionaries with keys: - - - **id** (bytes): sha1 identifier - - **mimetype** (bytes): raw content's mimetype - - **encoding** (bytes): raw content's encoding - - **indexer_configuration_id** (int): tool's id used to - compute the results - - **conflict_update** (bool): Flag to determine if we want to - overwrite (``True``) or skip duplicates (``False``, the - default) - - """ _check_id_duplicates(mimetypes) mimetypes.sort(key=lambda m: m['id']) db.mktemp_content_mimetype(cur) @@ -254,84 +172,26 @@ cur) db.content_mimetype_add_from_temp(conflict_update, cur) - @remote_api_endpoint('content_mimetype') @db_transaction_generator() def content_mimetype_get(self, ids, db=None, cur=None): - """Retrieve full content mimetype per ids. - - Args: - ids (iterable): sha1 identifier - - Yields: - mimetypes (iterable): dictionaries with keys: - - - **id** (bytes): sha1 identifier - - **mimetype** (bytes): raw content's mimetype - - **encoding** (bytes): raw content's encoding - - **tool** (dict): Tool used to compute the language - - """ for c in db.content_mimetype_get_from_list(ids, cur): yield converters.db_to_mimetype( dict(zip(db.content_mimetype_cols, c))) - @remote_api_endpoint('content_language/missing') @db_transaction_generator() def content_language_missing(self, languages, db=None, cur=None): - """List languages missing from storage. - - Args: - languages (iterable): dictionaries with keys: - - - **id** (bytes): sha1 identifier - - **indexer_configuration_id** (int): tool used to compute - the results - - Yields: - an iterable of missing id for the tuple (id, - indexer_configuration_id) - - """ for obj in db.content_language_missing_from_list(languages, cur): yield obj[0] - @remote_api_endpoint('content_language') @db_transaction_generator() def content_language_get(self, ids, db=None, cur=None): - """Retrieve full content language per ids. - - Args: - ids (iterable): sha1 identifier - - Yields: - languages (iterable): dictionaries with keys: - - - **id** (bytes): sha1 identifier - - **lang** (bytes): raw content's language - - **tool** (dict): Tool used to compute the language - - """ for c in db.content_language_get_from_list(ids, cur): yield converters.db_to_language( dict(zip(db.content_language_cols, c))) - @remote_api_endpoint('content_language/add') @db_transaction() def content_language_add(self, languages, conflict_update=False, db=None, cur=None): - """Add languages not present in storage. 
- - Args: - languages (iterable): dictionaries with keys: - - - **id** (bytes): sha1 - - **lang** (bytes): language detected - - conflict_update (bool): Flag to determine if we want to - overwrite (true) or skip duplicates (false, the - default) - - """ _check_id_duplicates(languages) languages.sort(key=lambda m: m['id']) db.mktemp_content_language(cur) @@ -347,62 +207,19 @@ db.content_language_add_from_temp(conflict_update, cur) - @remote_api_endpoint('content/ctags/missing') @db_transaction_generator() def content_ctags_missing(self, ctags, db=None, cur=None): - """List ctags missing from storage. - - Args: - ctags (iterable): dicts with keys: - - - **id** (bytes): sha1 identifier - - **indexer_configuration_id** (int): tool used to compute - the results - - Yields: - an iterable of missing id for the tuple (id, - indexer_configuration_id) - - """ for obj in db.content_ctags_missing_from_list(ctags, cur): yield obj[0] - @remote_api_endpoint('content/ctags') @db_transaction_generator() def content_ctags_get(self, ids, db=None, cur=None): - """Retrieve ctags per id. - - Args: - ids (iterable): sha1 checksums - - Yields: - Dictionaries with keys: - - - **id** (bytes): content's identifier - - **name** (str): symbol's name - - **kind** (str): symbol's kind - - **lang** (str): language for that content - - **tool** (dict): tool used to compute the ctags' info - - - """ for c in db.content_ctags_get_from_list(ids, cur): yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, c))) - @remote_api_endpoint('content/ctags/add') @db_transaction() def content_ctags_add(self, ctags, conflict_update=False, db=None, cur=None): - """Add ctags not present in storage - - Args: - ctags (iterable): dictionaries with keys: - - - **id** (bytes): sha1 - - **ctags** ([list): List of dictionary with keys: name, kind, - line, lang - - """ _check_id_duplicates(ctags) ctags.sort(key=lambda m: m['id']) @@ -422,41 +239,15 @@ db.content_ctags_add_from_temp(conflict_update, cur) - @remote_api_endpoint('content/ctags/search') @db_transaction_generator() def content_ctags_search(self, expression, limit=10, last_sha1=None, db=None, cur=None): - """Search through content's raw ctags symbols. - - Args: - expression (str): Expression to search for - limit (int): Number of rows to return (default to 10). - last_sha1 (str): Offset from which retrieving data (default to ''). - - Yields: - rows of ctags including id, name, lang, kind, line, etc... - - """ for obj in db.content_ctags_search(expression, last_sha1, limit, cur=cur): yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, obj))) - @remote_api_endpoint('content/fossology_license') @db_transaction_generator() def content_fossology_license_get(self, ids, db=None, cur=None): - """Retrieve licenses per id. - - Args: - ids (iterable): sha1 checksums - - Yields: - dict: ``{id: facts}`` where ``facts`` is a dict with the - following keys: - - - **licenses** ([str]): associated licenses for that content - - **tool** (dict): Tool used to compute the license - - """ d = defaultdict(list) for c in db.content_fossology_license_get_from_list(ids, cur): license = dict(zip(db.content_fossology_license_cols, c)) @@ -467,26 +258,9 @@ for id_, facts in d.items(): yield {id_: facts} - @remote_api_endpoint('content/fossology_license/add') @db_transaction() def content_fossology_license_add(self, licenses, conflict_update=False, db=None, cur=None): - """Add licenses not present in storage. 
- - Args: - licenses (iterable): dictionaries with keys: - - - **id**: sha1 - - **licenses** ([bytes]): List of licenses associated to sha1 - - **tool** (str): nomossa - - conflict_update: Flag to determine if we want to overwrite (true) - or skip duplicates (false, the default) - - Returns: - list: content_license entries which failed due to unknown licenses - - """ _check_id_duplicates(licenses) licenses.sort(key=lambda m: m['id']) db.mktemp_content_fossology_license(cur) @@ -502,90 +276,28 @@ cur=cur) db.content_fossology_license_add_from_temp(conflict_update, cur) - @remote_api_endpoint('content/fossology_license/range') @db_transaction() def content_fossology_license_get_range( self, start, end, indexer_configuration_id, limit=1000, db=None, cur=None): - """Retrieve licenses within range [start, end] bound by limit. - - Args: - **start** (bytes): Starting identifier range (expected smaller - than end) - **end** (bytes): Ending identifier range (expected larger - than start) - **indexer_configuration_id** (int): The tool used to index data - **limit** (int): Limit result (default to 1000) - - Raises: - ValueError for limit to None - - Returns: - a dict with keys: - - **ids** [bytes]: iterable of content ids within the range. - - **next** (Optional[bytes]): The next range of sha1 starts at - this sha1 if any - - """ return self._content_get_range('fossology_license', start, end, indexer_configuration_id, limit=limit, with_textual_data=True, db=db, cur=cur) - @remote_api_endpoint('content_metadata/missing') @db_transaction_generator() def content_metadata_missing(self, metadata, db=None, cur=None): - """List metadata missing from storage. - - Args: - metadata (iterable): dictionaries with keys: - - - **id** (bytes): sha1 identifier - - **indexer_configuration_id** (int): tool used to compute - the results - - Yields: - missing sha1s - - """ for obj in db.content_metadata_missing_from_list(metadata, cur): yield obj[0] - @remote_api_endpoint('content_metadata') @db_transaction_generator() def content_metadata_get(self, ids, db=None, cur=None): - """Retrieve metadata per id. - - Args: - ids (iterable): sha1 checksums - - Yields: - dictionaries with the following keys: - - id (bytes) - metadata (str): associated metadata - tool (dict): tool used to compute metadata - - """ for c in db.content_metadata_get_from_list(ids, cur): yield converters.db_to_metadata( dict(zip(db.content_metadata_cols, c))) - @remote_api_endpoint('content_metadata/add') @db_transaction() def content_metadata_add(self, metadata, conflict_update=False, db=None, cur=None): - """Add metadata not present in storage. - - Args: - metadata (iterable): dictionaries with keys: - - - **id**: sha1 - - **metadata**: arbitrary dict - - conflict_update: Flag to determine if we want to overwrite (true) - or skip duplicates (false, the default) - - """ _check_id_duplicates(metadata) metadata.sort(key=lambda m: m['id']) @@ -596,67 +308,21 @@ cur) db.content_metadata_add_from_temp(conflict_update, cur) - @remote_api_endpoint('revision_intrinsic_metadata/missing') @db_transaction_generator() def revision_intrinsic_metadata_missing(self, metadata, db=None, cur=None): - """List metadata missing from storage. 
- - Args: - metadata (iterable): dictionaries with keys: - - - **id** (bytes): sha1_git revision identifier - - **indexer_configuration_id** (int): tool used to compute - the results - - Yields: - missing ids - - """ for obj in db.revision_intrinsic_metadata_missing_from_list( metadata, cur): yield obj[0] - @remote_api_endpoint('revision_intrinsic_metadata') @db_transaction_generator() def revision_intrinsic_metadata_get(self, ids, db=None, cur=None): - """Retrieve revision metadata per id. - - Args: - ids (iterable): sha1 checksums - - Yields: - : dictionaries with the following keys: - - - **id** (bytes) - - **metadata** (str): associated metadata - - **tool** (dict): tool used to compute metadata - - **mappings** (List[str]): list of mappings used to translate - these metadata - - """ for c in db.revision_intrinsic_metadata_get_from_list(ids, cur): yield converters.db_to_metadata( dict(zip(db.revision_intrinsic_metadata_cols, c))) - @remote_api_endpoint('revision_intrinsic_metadata/add') @db_transaction() def revision_intrinsic_metadata_add(self, metadata, conflict_update=False, db=None, cur=None): - """Add metadata not present in storage. - - Args: - metadata (iterable): dictionaries with keys: - - - **id**: sha1_git of revision - - **metadata**: arbitrary dict - - **indexer_configuration_id**: tool used to compute metadata - - **mappings** (List[str]): list of mappings used to translate - these metadata - - conflict_update: Flag to determine if we want to overwrite (true) - or skip duplicates (false, the default) - - """ _check_id_duplicates(metadata) metadata.sort(key=lambda m: m['id']) @@ -668,66 +334,20 @@ cur) db.revision_intrinsic_metadata_add_from_temp(conflict_update, cur) - @remote_api_endpoint('revision_intrinsic_metadata/delete') @db_transaction() def revision_intrinsic_metadata_delete(self, entries, db=None, cur=None): - """Remove revision metadata from the storage. - - Args: - entries (dict): dictionaries with the following keys: - - - **id** (bytes): revision identifier - - **indexer_configuration_id** (int): tool used to compute - metadata - """ db.revision_intrinsic_metadata_delete(entries, cur) - @remote_api_endpoint('origin_intrinsic_metadata') @db_transaction_generator() def origin_intrinsic_metadata_get(self, ids, db=None, cur=None): - """Retrieve origin metadata per id. - - Args: - ids (iterable): origin identifiers - - Yields: - list: dictionaries with the following keys: - - - **id** (str): origin url - - **from_revision** (bytes): which revision this metadata - was extracted from - - **metadata** (str): associated metadata - - **tool** (dict): tool used to compute metadata - - **mappings** (List[str]): list of mappings used to translate - these metadata - - """ for c in db.origin_intrinsic_metadata_get_from_list(ids, cur): yield converters.db_to_metadata( dict(zip(db.origin_intrinsic_metadata_cols, c))) - @remote_api_endpoint('origin_intrinsic_metadata/add') @db_transaction() def origin_intrinsic_metadata_add(self, metadata, conflict_update=False, db=None, cur=None): - """Add origin metadata not present in storage. - - Args: - metadata (iterable): dictionaries with keys: - - - **id**: origin urls - - **from_revision**: sha1 id of the revision used to generate - these metadata. 
- - **metadata**: arbitrary dict - - **indexer_configuration_id**: tool used to compute metadata - - **mappings** (List[str]): list of mappings used to translate - these metadata - - conflict_update: Flag to determine if we want to overwrite (true) - or skip duplicates (false, the default) - - """ _check_id_duplicates(metadata) metadata.sort(key=lambda m: m['id']) @@ -740,81 +360,24 @@ cur) db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur) - @remote_api_endpoint('origin_intrinsic_metadata/delete') @db_transaction() def origin_intrinsic_metadata_delete( self, entries, db=None, cur=None): - """Remove origin metadata from the storage. - - Args: - entries (dict): dictionaries with the following keys: - - - **id** (str): origin urls - - **indexer_configuration_id** (int): tool used to compute - metadata - """ db.origin_intrinsic_metadata_delete(entries, cur) - @remote_api_endpoint('origin_intrinsic_metadata/search/fulltext') @db_transaction_generator() def origin_intrinsic_metadata_search_fulltext( self, conjunction, limit=100, db=None, cur=None): - """Returns the list of origins whose metadata contain all the terms. - - Args: - conjunction (List[str]): List of terms to be searched for. - limit (int): The maximum number of results to return - - Yields: - list: dictionaries with the following keys: - - - **id** (str): origin urls - - **from_revision**: sha1 id of the revision used to generate - these metadata. - - **metadata** (str): associated metadata - - **tool** (dict): tool used to compute metadata - - **mappings** (List[str]): list of mappings used to translate - these metadata - - """ for c in db.origin_intrinsic_metadata_search_fulltext( conjunction, limit=limit, cur=cur): yield converters.db_to_metadata( dict(zip(db.origin_intrinsic_metadata_cols, c))) - @remote_api_endpoint('origin_intrinsic_metadata/search/by_producer') @db_transaction() def origin_intrinsic_metadata_search_by_producer( self, page_token='', limit=100, ids_only=False, mappings=None, tool_ids=None, db=None, cur=None): - """Returns the list of origins whose metadata contain all the terms. - - Args: - page_token (str): Opaque token used for pagination. - limit (int): The maximum number of results to return - ids_only (bool): Determines whether only origin urls are - returned or the content as well - mappings (List[str]): Returns origins whose intrinsic metadata - were generated using at least one of these mappings. - - Returns: - dict: dict with the following keys: - - **next_page_token** (str, optional): opaque token to be used as - `page_token` for retrieving the next page. If absent, there is - no more pages to gather. - - **origins** (list): list of origin url (str) if `ids_only=True` - else dictionaries with the following keys: - - - **id** (str): origin urls - - **from_revision**: sha1 id of the revision used to generate - these metadata. - - **metadata** (str): associated metadata - - **tool** (dict): tool used to compute metadata - - **mappings** (List[str]): list of mappings used to translate - these metadata - - """ assert isinstance(page_token, str) # we go to limit+1 to check whether we should add next_page_token in # the response @@ -834,25 +397,9 @@ result['next_page_token'] = result['origins'][-1]['id'] return result - @remote_api_endpoint('origin_intrinsic_metadata/stats') @db_transaction() def origin_intrinsic_metadata_stats( self, db=None, cur=None): - """Returns counts of indexed metadata per origins, broken down - into metadata types. 
- - Returns: - dict: dictionary with keys: - - - total (int): total number of origins that were indexed - (possibly yielding an empty metadata dictionary) - - non_empty (int): total number of origins that we extracted - a non-empty metadata dictionary from - - per_mapping (dict): a dictionary with mapping names as - keys and number of origins whose indexing used this - mapping. Note that indexing a given origin may use - 0, 1, or many mappings. - """ mapping_names = [m for m in MAPPING_NAMES] select_parts = [] @@ -880,26 +427,8 @@ 'per_mapping': results, } - @remote_api_endpoint('indexer_configuration/add') @db_transaction_generator() def indexer_configuration_add(self, tools, db=None, cur=None): - """Add new tools to the storage. - - Args: - tools ([dict]): List of dictionary representing tool to - insert in the db. Dictionary with the following keys: - - - **tool_name** (str): tool's name - - **tool_version** (str): tool's version - - **tool_configuration** (dict): tool's configuration - (free form dict) - - Returns: - List of dict inserted in the db (holding the id key as - well). The order of the list is not guaranteed to match - the order of the initial list. - - """ db.mktemp_indexer_configuration(cur) db.copy_to(tools, 'tmp_indexer_configuration', ['tool_name', 'tool_version', 'tool_configuration'], @@ -909,24 +438,8 @@ for line in tools: yield dict(zip(db.indexer_configuration_cols, line)) - @remote_api_endpoint('indexer_configuration/data') @db_transaction() def indexer_configuration_get(self, tool, db=None, cur=None): - """Retrieve tool information. - - Args: - tool (dict): Dictionary representing a tool with the - following keys: - - - **tool_name** (str): tool's name - - **tool_version** (str): tool's version - - **tool_configuration** (dict): tool's configuration - (free form dict) - - Returns: - The same dictionary with an `id` key, None otherwise. - - """ tool_conf = tool['tool_configuration'] if isinstance(tool_conf, dict): tool_conf = json.dumps(tool_conf) diff --git a/swh/indexer/storage/api/client.py b/swh/indexer/storage/api/client.py --- a/swh/indexer/storage/api/client.py +++ b/swh/indexer/storage/api/client.py @@ -7,11 +7,11 @@ from swh.storage.exc import StorageAPIError -from .. import IndexerStorage +from ..interface import IndexerStorageInterface class RemoteStorage(RPCClient): """Proxy to a remote storage API""" - backend_class = IndexerStorage + backend_class = IndexerStorageInterface api_exception = StorageAPIError diff --git a/swh/indexer/storage/api/server.py b/swh/indexer/storage/api/server.py --- a/swh/indexer/storage/api/server.py +++ b/swh/indexer/storage/api/server.py @@ -10,8 +10,9 @@ from swh.core.api import (RPCServerApp, error_handler, encode_data_server as encode_data) from swh.indexer.storage import ( - get_indexer_storage, INDEXER_CFG_KEY, IndexerStorage + get_indexer_storage, INDEXER_CFG_KEY ) +from swh.indexer.storage.interface import IndexerStorageInterface def get_storage(): @@ -23,7 +24,7 @@ app = RPCServerApp(__name__, - backend_class=IndexerStorage, + backend_class=IndexerStorageInterface, backend_factory=get_storage) storage = None diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py --- a/swh/indexer/storage/in_memory.py +++ b/swh/indexer/storage/in_memory.py @@ -203,168 +203,36 @@ return True def content_mimetype_missing(self, mimetypes): - """Generate mimetypes missing from storage. 
- - Args: - mimetypes (iterable): iterable of dict with keys: - - - **id** (bytes): sha1 identifier - - **indexer_configuration_id** (int): tool used to compute the - results - - Yields: - tuple (id, indexer_configuration_id): missing id - - """ yield from self._mimetypes.missing(mimetypes) def content_mimetype_get_range( self, start, end, indexer_configuration_id, limit=1000): - """Retrieve mimetypes within range [start, end] bound by limit. - - Args: - **start** (bytes): Starting identifier range (expected smaller - than end) - **end** (bytes): Ending identifier range (expected larger - than start) - **indexer_configuration_id** (int): The tool used to index data - **limit** (int): Limit result (default to 1000) - - Raises: - ValueError for limit to None - - Returns: - a dict with keys: - - **ids** [bytes]: iterable of content ids within the range. - - **next** (Optional[bytes]): The next range of sha1 starts at - this sha1 if any - - """ return self._mimetypes.get_range( start, end, indexer_configuration_id, limit) def content_mimetype_add(self, mimetypes, conflict_update=False): - """Add mimetypes not present in storage. - - Args: - mimetypes (iterable): dictionaries with keys: - - - **id** (bytes): sha1 identifier - - **mimetype** (bytes): raw content's mimetype - - **encoding** (bytes): raw content's encoding - - **indexer_configuration_id** (int): tool's id used to - compute the results - - **conflict_update** (bool): Flag to determine if we want to - overwrite (``True``) or skip duplicates (``False``, the - default) - - """ if not all(isinstance(x['id'], bytes) for x in mimetypes): raise TypeError('identifiers must be bytes.') self._mimetypes.add(mimetypes, conflict_update) - def content_mimetype_get(self, ids, db=None, cur=None): - """Retrieve full content mimetype per ids. - - Args: - ids (iterable): sha1 identifier - - Yields: - mimetypes (iterable): dictionaries with keys: - - - **id** (bytes): sha1 identifier - - **mimetype** (bytes): raw content's mimetype - - **encoding** (bytes): raw content's encoding - - **tool** (dict): Tool used to compute the language - - """ + def content_mimetype_get(self, ids): yield from self._mimetypes.get(ids) def content_language_missing(self, languages): - """List languages missing from storage. - - Args: - languages (iterable): dictionaries with keys: - - - **id** (bytes): sha1 identifier - - **indexer_configuration_id** (int): tool used to compute - the results - - Yields: - an iterable of missing id for the tuple (id, - indexer_configuration_id) - - """ yield from self._languages.missing(languages) def content_language_get(self, ids): - """Retrieve full content language per ids. - - Args: - ids (iterable): sha1 identifier - - Yields: - languages (iterable): dictionaries with keys: - - - **id** (bytes): sha1 identifier - - **lang** (bytes): raw content's language - - **tool** (dict): Tool used to compute the language - - """ yield from self._languages.get(ids) def content_language_add(self, languages, conflict_update=False): - """Add languages not present in storage. - - Args: - languages (iterable): dictionaries with keys: - - - **id** (bytes): sha1 - - **lang** (bytes): language detected - - conflict_update (bool): Flag to determine if we want to - overwrite (true) or skip duplicates (false, the - default) - - """ if not all(isinstance(x['id'], bytes) for x in languages): raise TypeError('identifiers must be bytes.') self._languages.add(languages, conflict_update) def content_ctags_missing(self, ctags): - """List ctags missing from storage. 
- - Args: - ctags (iterable): dicts with keys: - - - **id** (bytes): sha1 identifier - - **indexer_configuration_id** (int): tool used to compute - the results - - Yields: - an iterable of missing id for the tuple (id, - indexer_configuration_id) - - """ yield from self._content_ctags.missing(ctags) def content_ctags_get(self, ids): - """Retrieve ctags per id. - - Args: - ids (iterable): sha1 checksums - - Yields: - Dictionaries with keys: - - - **id** (bytes): content's identifier - - **name** (str): symbol's name - - **kind** (str): symbol's kind - - **lang** (str): language for that content - - **tool** (dict): tool used to compute the ctags' info - - - """ for item in self._content_ctags.get(ids): for item_ctags_item in item['ctags']: yield { @@ -374,35 +242,12 @@ } def content_ctags_add(self, ctags, conflict_update=False): - """Add ctags not present in storage - - Args: - ctags (iterable): dictionaries with keys: - - - **id** (bytes): sha1 - - **ctags** ([list): List of dictionary with keys: name, kind, - line, lang - - **indexer_configuration_id**: tool used to compute the - results - - """ if not all(isinstance(x['id'], bytes) for x in ctags): raise TypeError('identifiers must be bytes.') self._content_ctags.add_merge(ctags, conflict_update, 'ctags') def content_ctags_search(self, expression, - limit=10, last_sha1=None, db=None, cur=None): - """Search through content's raw ctags symbols. - - Args: - expression (str): Expression to search for - limit (int): Number of rows to return (default to 10). - last_sha1 (str): Offset from which retrieving data (default to ''). - - Yields: - rows of ctags including id, name, lang, kind, line, etc... - - """ + limit=10, last_sha1=None): nb_matches = 0 for ((id_, tool_id), item) in \ sorted(self._content_ctags._data.items()): @@ -421,19 +266,6 @@ return def content_fossology_license_get(self, ids): - """Retrieve licenses per id. - - Args: - ids (iterable): sha1 checksums - - Yields: - dict: ``{id: facts}`` where ``facts`` is a dict with the - following keys: - - - **licenses** ([str]): associated licenses for that content - - **tool** (dict): Tool used to compute the license - - """ # Rewrites the output of SubStorage.get from the old format to # the new one. SubStorage.get should be updated once all other # *_get methods use the new format. @@ -445,239 +277,52 @@ yield {id_: facts} def content_fossology_license_add(self, licenses, conflict_update=False): - """Add licenses not present in storage. - - Args: - licenses (iterable): dictionaries with keys: - - - **id**: sha1 - - **licenses** ([bytes]): List of licenses associated to sha1 - - **tool** (str): nomossa - - conflict_update: Flag to determine if we want to overwrite (true) - or skip duplicates (false, the default) - - Returns: - list: content_license entries which failed due to unknown licenses - - """ if not all(isinstance(x['id'], bytes) for x in licenses): raise TypeError('identifiers must be bytes.') self._licenses.add_merge(licenses, conflict_update, 'licenses') def content_fossology_license_get_range( self, start, end, indexer_configuration_id, limit=1000): - """Retrieve licenses within range [start, end] bound by limit. 
- - Args: - **start** (bytes): Starting identifier range (expected smaller - than end) - **end** (bytes): Ending identifier range (expected larger - than start) - **indexer_configuration_id** (int): The tool used to index data - **limit** (int): Limit result (default to 1000) - - Raises: - ValueError for limit to None - - Returns: - a dict with keys: - - **ids** [bytes]: iterable of content ids within the range. - - **next** (Optional[bytes]): The next range of sha1 starts at - this sha1 if any - - """ return self._licenses.get_range( start, end, indexer_configuration_id, limit) def content_metadata_missing(self, metadata): - """List metadata missing from storage. - - Args: - metadata (iterable): dictionaries with keys: - - - **id** (bytes): sha1 identifier - - **indexer_configuration_id** (int): tool used to compute - the results - - Yields: - missing sha1s - - """ yield from self._content_metadata.missing(metadata) def content_metadata_get(self, ids): - """Retrieve metadata per id. - - Args: - ids (iterable): sha1 checksums - - Yields: - dictionaries with the following keys: - - - **id** (bytes) - - **metadata** (str): associated metadata - - **tool** (dict): tool used to compute metadata - - """ yield from self._content_metadata.get(ids) def content_metadata_add(self, metadata, conflict_update=False): - """Add metadata not present in storage. - - Args: - metadata (iterable): dictionaries with keys: - - - **id**: sha1 - - **metadata**: arbitrary dict - - **indexer_configuration_id**: tool used to compute the - results - - conflict_update: Flag to determine if we want to overwrite (true) - or skip duplicates (false, the default) - - """ if not all(isinstance(x['id'], bytes) for x in metadata): raise TypeError('identifiers must be bytes.') self._content_metadata.add(metadata, conflict_update) def revision_intrinsic_metadata_missing(self, metadata): - """List metadata missing from storage. - - Args: - metadata (iterable): dictionaries with keys: - - - **id** (bytes): sha1_git revision identifier - - **indexer_configuration_id** (int): tool used to compute - the results - - Yields: - missing ids - - """ yield from self._revision_intrinsic_metadata.missing(metadata) def revision_intrinsic_metadata_get(self, ids): - """Retrieve revision metadata per id. - - Args: - ids (iterable): sha1 checksums - - Yields: - dictionaries with the following keys: - - - **id** (bytes) - - **metadata** (str): associated metadata - - **tool** (dict): tool used to compute metadata - - **mappings** (List[str]): list of mappings used to translate - these metadata - - """ yield from self._revision_intrinsic_metadata.get(ids) def revision_intrinsic_metadata_add(self, metadata, conflict_update=False): - """Add metadata not present in storage. - - Args: - metadata (iterable): dictionaries with keys: - - - **id**: sha1_git of revision - - **metadata**: arbitrary dict - - **indexer_configuration_id**: tool used to compute metadata - - **mappings** (List[str]): list of mappings used to translate - these metadata - - conflict_update: Flag to determine if we want to overwrite (true) - or skip duplicates (false, the default) - - """ if not all(isinstance(x['id'], bytes) for x in metadata): raise TypeError('identifiers must be bytes.') self._revision_intrinsic_metadata.add(metadata, conflict_update) def revision_intrinsic_metadata_delete(self, entries): - """Remove revision metadata from the storage. 
- - Args: - entries (dict): dictionaries with the following keys: - - **revision** (int): origin identifier - - **id** (int): tool used to compute metadata - """ self._revision_intrinsic_metadata.delete(entries) def origin_intrinsic_metadata_get(self, ids): - """Retrieve origin metadata per id. - - Args: - ids (iterable): origin identifiers - - Yields: - list: dictionaries with the following keys: - - - **id** (str): origin url - - **from_revision** (bytes): which revision this metadata - was extracted from - - **metadata** (str): associated metadata - - **tool** (dict): tool used to compute metadata - - **mappings** (List[str]): list of mappings used to translate - these metadata - - """ yield from self._origin_intrinsic_metadata.get(ids) def origin_intrinsic_metadata_add(self, metadata, conflict_update=False): - """Add origin metadata not present in storage. - - Args: - metadata (iterable): dictionaries with keys: - - - **id**: origin url - - **from_revision**: sha1 id of the revision used to generate - these metadata. - - **metadata**: arbitrary dict - - **indexer_configuration_id**: tool used to compute metadata - - **mappings** (List[str]): list of mappings used to translate - these metadata - - conflict_update: Flag to determine if we want to overwrite (true) - or skip duplicates (false, the default) - - """ self._origin_intrinsic_metadata.add(metadata, conflict_update) def origin_intrinsic_metadata_delete(self, entries): - """Remove origin metadata from the storage. - - Args: - entries (dict): dictionaries with the following keys: - - - **id** (str): origin url - - **indexer_configuration_id** (int): tool used to compute - metadata - """ self._origin_intrinsic_metadata.delete(entries) def origin_intrinsic_metadata_search_fulltext( self, conjunction, limit=100): - """Returns the list of origins whose metadata contain all the terms. - - Args: - conjunction (List[str]): List of terms to be searched for. - limit (int): The maximum number of results to return - - Yields: - list: dictionaries with the following keys: - - - **id** (str): origin url - - **from_revision** (bytes): which revision this metadata - was extracted from - - **metadata** (str): associated metadata - - **tool** (dict): tool used to compute metadata - - **mappings** (List[str]): list of mappings used to translate - these metadata - - """ # A very crude fulltext search implementation, but that's enough # to work on English metadata tokens_re = re.compile('[a-zA-Z0-9]+') @@ -711,34 +356,7 @@ def origin_intrinsic_metadata_search_by_producer( self, page_token='', limit=100, ids_only=False, - mappings=None, tool_ids=None, - db=None, cur=None): - """Returns the list of origins whose metadata contain all the terms. - - Args: - page_token (str): Opaque token used for pagination. - limit (int): The maximum number of results to return - ids_only (bool): Determines whether only origin ids are returned - or the content as well - mappings (List[str]): Returns origins whose intrinsic metadata - were generated using at least one of these mappings. - - Returns: - dict: dict with the following keys: - - **next_page_token** (str, optional): opaque token to be used as - `page_token` for retrieveing the next page. - - **origins** (list): list of origin url (str) if `ids_only=True` - else dictionaries with the following keys: - - - **id** (str): origin urls - - **from_revision**: sha1 id of the revision used to generate - these metadata. 
- - **metadata** (str): associated metadata - - **tool** (dict): tool used to compute metadata - - **mappings** (List[str]): list of mappings used to translate - these metadata - - """ + mappings=None, tool_ids=None): assert isinstance(page_token, str) nb_results = 0 if mappings is not None: @@ -771,20 +389,6 @@ return result def origin_intrinsic_metadata_stats(self): - """Returns statistics on stored intrinsic metadata. - - Returns: - dict: dictionary with keys: - - - total (int): total number of origins that were indexed - (possibly yielding an empty metadata dictionary) - - non_empty (int): total number of origins that we extracted - a non-empty metadata dictionary from - - per_mapping (dict): a dictionary with mapping names as - keys and number of origins whose indexing used this - mapping. Note that indexing a given origin may use - 0, 1, or many mappings. - """ mapping_count = {m: 0 for m in MAPPING_NAMES} total = non_empty = 0 for data in self._origin_intrinsic_metadata.get_all(): @@ -800,23 +404,6 @@ } def indexer_configuration_add(self, tools): - """Add new tools to the storage. - - Args: - tools ([dict]): List of dictionary representing tool to - insert in the db. Dictionary with the following keys: - - - **tool_name** (str): tool's name - - **tool_version** (str): tool's version - - **tool_configuration** (dict): tool's configuration - (free form dict) - - Returns: - list: List of dict inserted in the db (holding the id key as - well). The order of the list is not guaranteed to match - the order of the initial list. - - """ inserted = [] for tool in tools: tool = tool.copy() @@ -827,21 +414,6 @@ return inserted def indexer_configuration_get(self, tool): - """Retrieve tool information. - - Args: - tool (dict): Dictionary representing a tool with the - following keys: - - - **tool_name** (str): tool's name - - **tool_version** (str): tool's version - - **tool_configuration** (dict): tool's configuration - (free form dict) - - Returns: - The same dictionary with an `id` key, None otherwise. - - """ return self._tools.get(self._tool_key(tool)) def _tool_key(self, tool): diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/interface.py copy from swh/indexer/storage/__init__.py copy to swh/indexer/storage/interface.py --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/interface.py @@ -1,132 +1,20 @@ -# Copyright (C) 2015-2018 The Software Heritage developers +# Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import json -import psycopg2 - -from collections import defaultdict - from swh.core.api import remote_api_endpoint -from swh.storage.common import db_transaction_generator, db_transaction -from swh.storage.exc import StorageDBError -from .db import Db - -from . import converters - - -INDEXER_CFG_KEY = 'indexer_storage' - - -MAPPING_NAMES = ['codemeta', 'gemspec', 'maven', 'npm', 'pkg-info'] - - -def get_indexer_storage(cls, args): - """Get an indexer storage object of class `storage_class` with - arguments `storage_args`. - - Args: - cls (str): storage's class, either 'local' or 'remote' - args (dict): dictionary of arguments passed to the - storage class constructor - - Returns: - an instance of swh.indexer's storage (either local or remote) - - Raises: - ValueError if passed an unknown storage class. 
- - """ - if cls == 'remote': - from .api.client import RemoteStorage as IndexerStorage - elif cls == 'local': - from . import IndexerStorage - elif cls == 'memory': - from .in_memory import IndexerStorage - else: - raise ValueError('Unknown indexer storage class `%s`' % cls) - - return IndexerStorage(**args) - - -def _check_id_duplicates(data): - """ - If any two dictionaries in `data` have the same id, raises - a `ValueError`. - - Values associated to the key must be hashable. - - Args: - data (List[dict]): List of dictionaries to be inserted - - >>> _check_id_duplicates([ - ... {'id': 'foo', 'data': 'spam'}, - ... {'id': 'bar', 'data': 'egg'}, - ... ]) - >>> _check_id_duplicates([ - ... {'id': 'foo', 'data': 'spam'}, - ... {'id': 'foo', 'data': 'egg'}, - ... ]) - Traceback (most recent call last): - ... - ValueError: The same id is present more than once. - """ - if len({item['id'] for item in data}) < len(data): - raise ValueError('The same id is present more than once.') -class IndexerStorage: - """SWH Indexer Storage - - """ - def __init__(self, db, min_pool_conns=1, max_pool_conns=10): - """ - Args: - db_conn: either a libpq connection string, or a psycopg2 connection - - """ - try: - if isinstance(db, psycopg2.extensions.connection): - self._pool = None - self._db = Db(db) - else: - self._pool = psycopg2.pool.ThreadedConnectionPool( - min_pool_conns, max_pool_conns, db - ) - self._db = None - except psycopg2.OperationalError as e: - raise StorageDBError(e) - - def get_db(self): - if self._db: - return self._db - return Db.from_pool(self._pool) - - def put_db(self, db): - if db is not self._db: - db.put_conn() - +class IndexerStorageInterface: @remote_api_endpoint('check_config') - @db_transaction() - def check_config(self, *, check_write, db=None, cur=None): + def check_config(self, *, check_write): """Check that the storage is configured and ready to go.""" - # Check permissions on one of the tables - if check_write: - check = 'INSERT' - else: - check = 'SELECT' - - cur.execute( - "select has_table_privilege(current_user, 'content_mimetype', %s)", # noqa - (check,) - ) - return cur.fetchone()[0] + ... @remote_api_endpoint('content_mimetype/missing') - @db_transaction_generator() - def content_mimetype_missing(self, mimetypes, db=None, cur=None): + def content_mimetype_missing(self, mimetypes): """Generate mimetypes missing from storage. Args: @@ -140,13 +28,11 @@ tuple (id, indexer_configuration_id): missing id """ - for obj in db.content_mimetype_missing_from_list(mimetypes, cur): - yield obj[0] + ... def _content_get_range(self, content_type, start, end, indexer_configuration_id, limit=1000, - with_textual_data=False, - db=None, cur=None): + with_textual_data=False): """Retrieve ids of type content_type within range [start, end] bound by limit. @@ -175,34 +61,11 @@ this sha1 if any """ - if limit is None: - raise ValueError('Development error: limit should not be None') - if content_type not in db.content_indexer_names: - err = 'Development error: Wrong type. Should be one of [%s]' % ( - ','.join(db.content_indexer_names)) - raise ValueError(err) - - ids = [] - next_id = None - for counter, obj in enumerate(db.content_get_range( - content_type, start, end, indexer_configuration_id, - limit=limit+1, with_textual_data=with_textual_data, cur=cur)): - _id = obj[0] - if counter >= limit: - next_id = _id - break - - ids.append(_id) - - return { - 'ids': ids, - 'next': next_id - } + ... 
@remote_api_endpoint('content_mimetype/range') - @db_transaction() def content_mimetype_get_range(self, start, end, indexer_configuration_id, - limit=1000, db=None, cur=None): + limit=1000): """Retrieve mimetypes within range [start, end] bound by limit. Args: @@ -223,14 +86,10 @@ this sha1 if any """ - return self._content_get_range('mimetype', start, end, - indexer_configuration_id, limit=limit, - db=db, cur=cur) + ... @remote_api_endpoint('content_mimetype/add') - @db_transaction() - def content_mimetype_add(self, mimetypes, conflict_update=False, db=None, - cur=None): + def content_mimetype_add(self, mimetypes, conflict_update=False): """Add mimetypes not present in storage. Args: @@ -246,17 +105,10 @@ default) """ - _check_id_duplicates(mimetypes) - mimetypes.sort(key=lambda m: m['id']) - db.mktemp_content_mimetype(cur) - db.copy_to(mimetypes, 'tmp_content_mimetype', - ['id', 'mimetype', 'encoding', 'indexer_configuration_id'], - cur) - db.content_mimetype_add_from_temp(conflict_update, cur) + ... @remote_api_endpoint('content_mimetype') - @db_transaction_generator() - def content_mimetype_get(self, ids, db=None, cur=None): + def content_mimetype_get(self, ids): """Retrieve full content mimetype per ids. Args: @@ -271,13 +123,10 @@ - **tool** (dict): Tool used to compute the language """ - for c in db.content_mimetype_get_from_list(ids, cur): - yield converters.db_to_mimetype( - dict(zip(db.content_mimetype_cols, c))) + ... @remote_api_endpoint('content_language/missing') - @db_transaction_generator() - def content_language_missing(self, languages, db=None, cur=None): + def content_language_missing(self, languages): """List languages missing from storage. Args: @@ -292,12 +141,10 @@ indexer_configuration_id) """ - for obj in db.content_language_missing_from_list(languages, cur): - yield obj[0] + ... @remote_api_endpoint('content_language') - @db_transaction_generator() - def content_language_get(self, ids, db=None, cur=None): + def content_language_get(self, ids): """Retrieve full content language per ids. Args: @@ -311,14 +158,10 @@ - **tool** (dict): Tool used to compute the language """ - for c in db.content_language_get_from_list(ids, cur): - yield converters.db_to_language( - dict(zip(db.content_language_cols, c))) + ... @remote_api_endpoint('content_language/add') - @db_transaction() - def content_language_add(self, languages, conflict_update=False, db=None, - cur=None): + def content_language_add(self, languages, conflict_update=False): """Add languages not present in storage. Args: @@ -332,24 +175,10 @@ default) """ - _check_id_duplicates(languages) - languages.sort(key=lambda m: m['id']) - db.mktemp_content_language(cur) - # empty language is mapped to 'unknown' - db.copy_to( - ({ - 'id': l['id'], - 'lang': 'unknown' if not l['lang'] else l['lang'], - 'indexer_configuration_id': l['indexer_configuration_id'], - } for l in languages), - 'tmp_content_language', - ['id', 'lang', 'indexer_configuration_id'], cur) - - db.content_language_add_from_temp(conflict_update, cur) + ... @remote_api_endpoint('content/ctags/missing') - @db_transaction_generator() - def content_ctags_missing(self, ctags, db=None, cur=None): + def content_ctags_missing(self, ctags): """List ctags missing from storage. Args: @@ -364,12 +193,10 @@ indexer_configuration_id) """ - for obj in db.content_ctags_missing_from_list(ctags, cur): - yield obj[0] + ... 
@remote_api_endpoint('content/ctags') - @db_transaction_generator() - def content_ctags_get(self, ids, db=None, cur=None): + def content_ctags_get(self, ids): """Retrieve ctags per id. Args: @@ -386,13 +213,10 @@ """ - for c in db.content_ctags_get_from_list(ids, cur): - yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, c))) + ... @remote_api_endpoint('content/ctags/add') - @db_transaction() - def content_ctags_add(self, ctags, conflict_update=False, db=None, - cur=None): + def content_ctags_add(self, ctags, conflict_update=False): """Add ctags not present in storage Args: @@ -403,29 +227,11 @@ line, lang """ - _check_id_duplicates(ctags) - ctags.sort(key=lambda m: m['id']) - - def _convert_ctags(__ctags): - """Convert ctags dict to list of ctags. - - """ - for ctags in __ctags: - yield from converters.ctags_to_db(ctags) - - db.mktemp_content_ctags(cur) - db.copy_to(list(_convert_ctags(ctags)), - tblname='tmp_content_ctags', - columns=['id', 'name', 'kind', 'line', - 'lang', 'indexer_configuration_id'], - cur=cur) - - db.content_ctags_add_from_temp(conflict_update, cur) + ... @remote_api_endpoint('content/ctags/search') - @db_transaction_generator() def content_ctags_search(self, expression, - limit=10, last_sha1=None, db=None, cur=None): + limit=10, last_sha1=None): """Search through content's raw ctags symbols. Args: @@ -437,13 +243,10 @@ rows of ctags including id, name, lang, kind, line, etc... """ - for obj in db.content_ctags_search(expression, last_sha1, limit, - cur=cur): - yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, obj))) + ... @remote_api_endpoint('content/fossology_license') - @db_transaction_generator() - def content_fossology_license_get(self, ids, db=None, cur=None): + def content_fossology_license_get(self, ids): """Retrieve licenses per id. Args: @@ -457,20 +260,10 @@ - **tool** (dict): Tool used to compute the license """ - d = defaultdict(list) - for c in db.content_fossology_license_get_from_list(ids, cur): - license = dict(zip(db.content_fossology_license_cols, c)) - - id_ = license['id'] - d[id_].append(converters.db_to_fossology_license(license)) - - for id_, facts in d.items(): - yield {id_: facts} + ... @remote_api_endpoint('content/fossology_license/add') - @db_transaction() - def content_fossology_license_add(self, licenses, conflict_update=False, - db=None, cur=None): + def content_fossology_license_add(self, licenses, conflict_update=False): """Add licenses not present in storage. Args: @@ -487,26 +280,12 @@ list: content_license entries which failed due to unknown licenses """ - _check_id_duplicates(licenses) - licenses.sort(key=lambda m: m['id']) - db.mktemp_content_fossology_license(cur) - db.copy_to( - ({ - 'id': sha1['id'], - 'indexer_configuration_id': sha1['indexer_configuration_id'], - 'license': license, - } for sha1 in licenses - for license in sha1['licenses']), - tblname='tmp_content_fossology_license', - columns=['id', 'license', 'indexer_configuration_id'], - cur=cur) - db.content_fossology_license_add_from_temp(conflict_update, cur) + ... @remote_api_endpoint('content/fossology_license/range') - @db_transaction() def content_fossology_license_get_range( self, start, end, indexer_configuration_id, - limit=1000, db=None, cur=None): + limit=1000): """Retrieve licenses within range [start, end] bound by limit. Args: @@ -527,13 +306,10 @@ this sha1 if any """ - return self._content_get_range('fossology_license', start, end, - indexer_configuration_id, limit=limit, - with_textual_data=True, db=db, cur=cur) + ... 
@remote_api_endpoint('content_metadata/missing') - @db_transaction_generator() - def content_metadata_missing(self, metadata, db=None, cur=None): + def content_metadata_missing(self, metadata): """List metadata missing from storage. Args: @@ -547,12 +323,10 @@ missing sha1s """ - for obj in db.content_metadata_missing_from_list(metadata, cur): - yield obj[0] + ... @remote_api_endpoint('content_metadata') - @db_transaction_generator() - def content_metadata_get(self, ids, db=None, cur=None): + def content_metadata_get(self, ids): """Retrieve metadata per id. Args: @@ -566,14 +340,10 @@ tool (dict): tool used to compute metadata """ - for c in db.content_metadata_get_from_list(ids, cur): - yield converters.db_to_metadata( - dict(zip(db.content_metadata_cols, c))) + ... @remote_api_endpoint('content_metadata/add') - @db_transaction() - def content_metadata_add(self, metadata, conflict_update=False, db=None, - cur=None): + def content_metadata_add(self, metadata, conflict_update=False): """Add metadata not present in storage. Args: @@ -586,19 +356,10 @@ or skip duplicates (false, the default) """ - _check_id_duplicates(metadata) - metadata.sort(key=lambda m: m['id']) - - db.mktemp_content_metadata(cur) - - db.copy_to(metadata, 'tmp_content_metadata', - ['id', 'metadata', 'indexer_configuration_id'], - cur) - db.content_metadata_add_from_temp(conflict_update, cur) + ... @remote_api_endpoint('revision_intrinsic_metadata/missing') - @db_transaction_generator() - def revision_intrinsic_metadata_missing(self, metadata, db=None, cur=None): + def revision_intrinsic_metadata_missing(self, metadata): """List metadata missing from storage. Args: @@ -612,13 +373,10 @@ missing ids """ - for obj in db.revision_intrinsic_metadata_missing_from_list( - metadata, cur): - yield obj[0] + ... @remote_api_endpoint('revision_intrinsic_metadata') - @db_transaction_generator() - def revision_intrinsic_metadata_get(self, ids, db=None, cur=None): + def revision_intrinsic_metadata_get(self, ids): """Retrieve revision metadata per id. Args: @@ -634,14 +392,10 @@ these metadata """ - for c in db.revision_intrinsic_metadata_get_from_list(ids, cur): - yield converters.db_to_metadata( - dict(zip(db.revision_intrinsic_metadata_cols, c))) + ... @remote_api_endpoint('revision_intrinsic_metadata/add') - @db_transaction() - def revision_intrinsic_metadata_add(self, metadata, conflict_update=False, - db=None, cur=None): + def revision_intrinsic_metadata_add(self, metadata, conflict_update=False): """Add metadata not present in storage. Args: @@ -657,20 +411,10 @@ or skip duplicates (false, the default) """ - _check_id_duplicates(metadata) - metadata.sort(key=lambda m: m['id']) - - db.mktemp_revision_intrinsic_metadata(cur) - - db.copy_to(metadata, 'tmp_revision_intrinsic_metadata', - ['id', 'metadata', 'mappings', - 'indexer_configuration_id'], - cur) - db.revision_intrinsic_metadata_add_from_temp(conflict_update, cur) + ... @remote_api_endpoint('revision_intrinsic_metadata/delete') - @db_transaction() - def revision_intrinsic_metadata_delete(self, entries, db=None, cur=None): + def revision_intrinsic_metadata_delete(self, entries): """Remove revision metadata from the storage. Args: @@ -680,11 +424,10 @@ - **indexer_configuration_id** (int): tool used to compute metadata """ - db.revision_intrinsic_metadata_delete(entries, cur) + ... 
@remote_api_endpoint('origin_intrinsic_metadata') - @db_transaction_generator() - def origin_intrinsic_metadata_get(self, ids, db=None, cur=None): + def origin_intrinsic_metadata_get(self, ids): """Retrieve origin metadata per id. Args: @@ -702,15 +445,11 @@ these metadata """ - for c in db.origin_intrinsic_metadata_get_from_list(ids, cur): - yield converters.db_to_metadata( - dict(zip(db.origin_intrinsic_metadata_cols, c))) + ... @remote_api_endpoint('origin_intrinsic_metadata/add') - @db_transaction() def origin_intrinsic_metadata_add(self, metadata, - conflict_update=False, db=None, - cur=None): + conflict_update=False): """Add origin metadata not present in storage. Args: @@ -728,22 +467,11 @@ or skip duplicates (false, the default) """ - _check_id_duplicates(metadata) - metadata.sort(key=lambda m: m['id']) - - db.mktemp_origin_intrinsic_metadata(cur) - - db.copy_to(metadata, 'tmp_origin_intrinsic_metadata', - ['id', 'metadata', - 'indexer_configuration_id', - 'from_revision', 'mappings'], - cur) - db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur) + ... @remote_api_endpoint('origin_intrinsic_metadata/delete') - @db_transaction() def origin_intrinsic_metadata_delete( - self, entries, db=None, cur=None): + self, entries): """Remove origin metadata from the storage. Args: @@ -753,12 +481,11 @@ - **indexer_configuration_id** (int): tool used to compute metadata """ - db.origin_intrinsic_metadata_delete(entries, cur) + ... @remote_api_endpoint('origin_intrinsic_metadata/search/fulltext') - @db_transaction_generator() def origin_intrinsic_metadata_search_fulltext( - self, conjunction, limit=100, db=None, cur=None): + self, conjunction, limit=100): """Returns the list of origins whose metadata contain all the terms. Args: @@ -777,17 +504,12 @@ these metadata """ - for c in db.origin_intrinsic_metadata_search_fulltext( - conjunction, limit=limit, cur=cur): - yield converters.db_to_metadata( - dict(zip(db.origin_intrinsic_metadata_cols, c))) + ... @remote_api_endpoint('origin_intrinsic_metadata/search/by_producer') - @db_transaction() def origin_intrinsic_metadata_search_by_producer( self, page_token='', limit=100, ids_only=False, - mappings=None, tool_ids=None, - db=None, cur=None): + mappings=None, tool_ids=None): """Returns the list of origins whose metadata contain all the terms. Args: @@ -815,29 +537,11 @@ these metadata """ - assert isinstance(page_token, str) - # we go to limit+1 to check whether we should add next_page_token in - # the response - res = db.origin_intrinsic_metadata_search_by_producer( - page_token, limit + 1, ids_only, mappings, tool_ids, cur) - result = {} - if ids_only: - result['origins'] = [origin for (origin,) in res] - if len(result['origins']) > limit: - result['origins'][limit:] = [] - result['next_page_token'] = result['origins'][-1] - else: - result['origins'] = [converters.db_to_metadata( - dict(zip(db.origin_intrinsic_metadata_cols, c)))for c in res] - if len(result['origins']) > limit: - result['origins'][limit:] = [] - result['next_page_token'] = result['origins'][-1]['id'] - return result + ... @remote_api_endpoint('origin_intrinsic_metadata/stats') - @db_transaction() def origin_intrinsic_metadata_stats( - self, db=None, cur=None): + self): """Returns counts of indexed metadata per origins, broken down into metadata types. @@ -853,36 +557,10 @@ mapping. Note that indexing a given origin may use 0, 1, or many mappings. 
""" - mapping_names = [m for m in MAPPING_NAMES] - select_parts = [] - - # Count rows for each mapping - for mapping_name in mapping_names: - select_parts.append(( - "sum(case when (mappings @> ARRAY['%s']) " - " then 1 else 0 end)" - ) % mapping_name) - - # Total - select_parts.append("sum(1)") - - # Rows whose metadata has at least one key that is not '@context' - select_parts.append( - "sum(case when ('{}'::jsonb @> (metadata - '@context')) " - " then 0 else 1 end)") - cur.execute('select ' + ', '.join(select_parts) - + ' from origin_intrinsic_metadata') - results = dict(zip(mapping_names + ['total', 'non_empty'], - cur.fetchone())) - return { - 'total': results.pop('total'), - 'non_empty': results.pop('non_empty'), - 'per_mapping': results, - } + ... @remote_api_endpoint('indexer_configuration/add') - @db_transaction_generator() - def indexer_configuration_add(self, tools, db=None, cur=None): + def indexer_configuration_add(self, tools): """Add new tools to the storage. Args: @@ -900,18 +578,10 @@ the order of the initial list. """ - db.mktemp_indexer_configuration(cur) - db.copy_to(tools, 'tmp_indexer_configuration', - ['tool_name', 'tool_version', 'tool_configuration'], - cur) - - tools = db.indexer_configuration_add_from_temp(cur) - for line in tools: - yield dict(zip(db.indexer_configuration_cols, line)) + ... @remote_api_endpoint('indexer_configuration/data') - @db_transaction() - def indexer_configuration_get(self, tool, db=None, cur=None): + def indexer_configuration_get(self, tool): """Retrieve tool information. Args: @@ -927,12 +597,4 @@ The same dictionary with an `id` key, None otherwise. """ - tool_conf = tool['tool_configuration'] - if isinstance(tool_conf, dict): - tool_conf = json.dumps(tool_conf) - idx = db.indexer_configuration_get(tool['tool_name'], - tool['tool_version'], - tool_conf) - if not idx: - return None - return dict(zip(db.indexer_configuration_cols, idx)) + ... diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -3,10 +3,15 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import inspect import threading + import pytest + from swh.model.hashutil import hash_to_bytes +from swh.indexer.storage.interface import IndexerStorageInterface + def prepare_mimetypes_from(fossology_licenses): """Fossology license needs some consistent data in db to run. 
@@ -1853,3 +1858,31 @@ storage = swh_indexer_storage assert storage.check_config(check_write=True) assert storage.check_config(check_write=False) + + def test_types(self, swh_indexer_storage): + """Checks all methods of StorageInterface are implemented by this + backend, and that they have the same signature.""" + # Create an instance of the protocol (which cannot be instantiated + # directly, so this creates a subclass, then instantiates it) + interface = type('_', (IndexerStorageInterface,), {})() + + assert 'content_mimetype_add' in dir(interface) + + missing_methods = [] + + for meth_name in dir(interface): + if meth_name.startswith('_'): + continue + interface_meth = getattr(interface, meth_name) + try: + concrete_meth = getattr(swh_indexer_storage, meth_name) + except AttributeError: + missing_methods.append(meth_name) + continue + + expected_signature = inspect.signature(interface_meth) + actual_signature = inspect.signature(concrete_meth) + + assert expected_signature == actual_signature, meth_name + + assert missing_methods == []
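
Below the patch, a rough usage sketch (not part of the change itself) of the public API that the new `IndexerStorageInterface` documents and that `test_types` checks each backend against. It assumes the in-memory backend's constructor takes no arguments and uses a made-up tool entry and sha1; those details are illustrative only, not taken from this patch.

```python
from swh.model.hashutil import hash_to_bytes
from swh.indexer.storage import get_indexer_storage

# Pick the 'memory' backend; 'local' and 'remote' expose the same interface.
storage = get_indexer_storage('memory', {})  # assumes no constructor arguments

# Register the tool that produced the results (hypothetical values).
tools = list(storage.indexer_configuration_add([{
    'tool_name': 'file',
    'tool_version': '5.22',
    'tool_configuration': {},
}]))
tool_id = tools[0]['id']

# Store one mimetype row and read it back through the same interface.
content_id = hash_to_bytes('34973274ccef6ab4dfaaf86599792fa9c3fe4689')
storage.content_mimetype_add([{
    'id': content_id,                 # sha1, as bytes
    'mimetype': b'text/plain',        # bytes, per the interface docstring
    'encoding': b'us-ascii',
    'indexer_configuration_id': tool_id,
}])
print(list(storage.content_mimetype_get([content_id])))
```

Because `RemoteStorage` and the RPC server now both declare `IndexerStorageInterface` as their `backend_class`, the same calls should work unchanged against a remote indexer storage.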