diff --git a/requirements-swh.txt b/requirements-swh.txt
index 330716a..c49aa91 100644
--- a/requirements-swh.txt
+++ b/requirements-swh.txt
@@ -1,6 +1,6 @@
-swh.core[db,http] >= 0.0.65
+swh.core[db,http] >= 0.0.87
 swh.model >= 0.0.15
 swh.objstorage >= 0.0.28
 swh.scheduler >= 0.0.47
 swh.storage >= 0.0.156
 swh.journal >= 0.0.17
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
index 4eaa7d5..8c61aff 100644
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -1,938 +1,451 @@
-# Copyright (C) 2015-2018 The Software Heritage developers
+# Copyright (C) 2015-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import json
 import psycopg2

 from collections import defaultdict

-from swh.core.api import remote_api_endpoint
 from swh.storage.common import db_transaction_generator, db_transaction
 from swh.storage.exc import StorageDBError

 from .db import Db

 from . import converters


 INDEXER_CFG_KEY = 'indexer_storage'


 MAPPING_NAMES = ['codemeta', 'gemspec', 'maven', 'npm', 'pkg-info']


 def get_indexer_storage(cls, args):
     """Get an indexer storage object of class `storage_class` with
     arguments `storage_args`.

     Args:
         cls (str): storage's class, either 'local' or 'remote'
         args (dict): dictionary of arguments passed to the
             storage class constructor

     Returns:
         an instance of swh.indexer's storage (either local or remote)

     Raises:
         ValueError if passed an unknown storage class.

     """
     if cls == 'remote':
         from .api.client import RemoteStorage as IndexerStorage
     elif cls == 'local':
         from . import IndexerStorage
     elif cls == 'memory':
         from .in_memory import IndexerStorage
     else:
         raise ValueError('Unknown indexer storage class `%s`' % cls)

     return IndexerStorage(**args)


 def _check_id_duplicates(data):
     """
     If any two dictionaries in `data` have the same id, raises
     a `ValueError`.

     Values associated to the key must be hashable.

     Args:
         data (List[dict]): List of dictionaries to be inserted

     >>> _check_id_duplicates([
     ...     {'id': 'foo', 'data': 'spam'},
     ...     {'id': 'bar', 'data': 'egg'},
     ... ])
     >>> _check_id_duplicates([
     ...     {'id': 'foo', 'data': 'spam'},
     ...     {'id': 'foo', 'data': 'egg'},
     ... ])
     Traceback (most recent call last):
       ...
     ValueError: The same id is present more than once.
""" if len({item['id'] for item in data}) < len(data): raise ValueError('The same id is present more than once.') class IndexerStorage: """SWH Indexer Storage """ def __init__(self, db, min_pool_conns=1, max_pool_conns=10): """ Args: db_conn: either a libpq connection string, or a psycopg2 connection """ try: if isinstance(db, psycopg2.extensions.connection): self._pool = None self._db = Db(db) else: self._pool = psycopg2.pool.ThreadedConnectionPool( min_pool_conns, max_pool_conns, db ) self._db = None except psycopg2.OperationalError as e: raise StorageDBError(e) def get_db(self): if self._db: return self._db return Db.from_pool(self._pool) def put_db(self, db): if db is not self._db: db.put_conn() - @remote_api_endpoint('check_config') @db_transaction() def check_config(self, *, check_write, db=None, cur=None): - """Check that the storage is configured and ready to go.""" # Check permissions on one of the tables if check_write: check = 'INSERT' else: check = 'SELECT' cur.execute( "select has_table_privilege(current_user, 'content_mimetype', %s)", # noqa (check,) ) return cur.fetchone()[0] - @remote_api_endpoint('content_mimetype/missing') @db_transaction_generator() def content_mimetype_missing(self, mimetypes, db=None, cur=None): - """Generate mimetypes missing from storage. - - Args: - mimetypes (iterable): iterable of dict with keys: - - - **id** (bytes): sha1 identifier - - **indexer_configuration_id** (int): tool used to compute the - results - - Yields: - tuple (id, indexer_configuration_id): missing id - - """ for obj in db.content_mimetype_missing_from_list(mimetypes, cur): yield obj[0] def _content_get_range(self, content_type, start, end, indexer_configuration_id, limit=1000, with_textual_data=False, db=None, cur=None): - """Retrieve ids of type content_type within range [start, end] bound - by limit. - - Args: - **content_type** (str): content's type (mimetype, language, etc...) - **start** (bytes): Starting identifier range (expected smaller - than end) - **end** (bytes): Ending identifier range (expected larger - than start) - **indexer_configuration_id** (int): The tool used to index data - **limit** (int): Limit result (default to 1000) - **with_textual_data** (bool): Deal with only textual - content (True) or all - content (all contents by - defaults, False) - - Raises: - ValueError for; - - limit to None - - wrong content_type provided - - Returns: - a dict with keys: - - **ids** [bytes]: iterable of content ids within the range. - - **next** (Optional[bytes]): The next range of sha1 starts at - this sha1 if any - - """ if limit is None: raise ValueError('Development error: limit should not be None') if content_type not in db.content_indexer_names: err = 'Development error: Wrong type. Should be one of [%s]' % ( ','.join(db.content_indexer_names)) raise ValueError(err) ids = [] next_id = None for counter, obj in enumerate(db.content_get_range( content_type, start, end, indexer_configuration_id, limit=limit+1, with_textual_data=with_textual_data, cur=cur)): _id = obj[0] if counter >= limit: next_id = _id break ids.append(_id) return { 'ids': ids, 'next': next_id } - @remote_api_endpoint('content_mimetype/range') @db_transaction() def content_mimetype_get_range(self, start, end, indexer_configuration_id, limit=1000, db=None, cur=None): - """Retrieve mimetypes within range [start, end] bound by limit. 
- - Args: - **start** (bytes): Starting identifier range (expected smaller - than end) - **end** (bytes): Ending identifier range (expected larger - than start) - **indexer_configuration_id** (int): The tool used to index data - **limit** (int): Limit result (default to 1000) - - Raises: - ValueError for limit to None - - Returns: - a dict with keys: - - **ids** [bytes]: iterable of content ids within the range. - - **next** (Optional[bytes]): The next range of sha1 starts at - this sha1 if any - - """ return self._content_get_range('mimetype', start, end, indexer_configuration_id, limit=limit, db=db, cur=cur) - @remote_api_endpoint('content_mimetype/add') @db_transaction() def content_mimetype_add(self, mimetypes, conflict_update=False, db=None, cur=None): - """Add mimetypes not present in storage. - - Args: - mimetypes (iterable): dictionaries with keys: - - - **id** (bytes): sha1 identifier - - **mimetype** (bytes): raw content's mimetype - - **encoding** (bytes): raw content's encoding - - **indexer_configuration_id** (int): tool's id used to - compute the results - - **conflict_update** (bool): Flag to determine if we want to - overwrite (``True``) or skip duplicates (``False``, the - default) - - """ _check_id_duplicates(mimetypes) mimetypes.sort(key=lambda m: m['id']) db.mktemp_content_mimetype(cur) db.copy_to(mimetypes, 'tmp_content_mimetype', ['id', 'mimetype', 'encoding', 'indexer_configuration_id'], cur) db.content_mimetype_add_from_temp(conflict_update, cur) - @remote_api_endpoint('content_mimetype') @db_transaction_generator() def content_mimetype_get(self, ids, db=None, cur=None): - """Retrieve full content mimetype per ids. - - Args: - ids (iterable): sha1 identifier - - Yields: - mimetypes (iterable): dictionaries with keys: - - - **id** (bytes): sha1 identifier - - **mimetype** (bytes): raw content's mimetype - - **encoding** (bytes): raw content's encoding - - **tool** (dict): Tool used to compute the language - - """ for c in db.content_mimetype_get_from_list(ids, cur): yield converters.db_to_mimetype( dict(zip(db.content_mimetype_cols, c))) - @remote_api_endpoint('content_language/missing') @db_transaction_generator() def content_language_missing(self, languages, db=None, cur=None): - """List languages missing from storage. - - Args: - languages (iterable): dictionaries with keys: - - - **id** (bytes): sha1 identifier - - **indexer_configuration_id** (int): tool used to compute - the results - - Yields: - an iterable of missing id for the tuple (id, - indexer_configuration_id) - - """ for obj in db.content_language_missing_from_list(languages, cur): yield obj[0] - @remote_api_endpoint('content_language') @db_transaction_generator() def content_language_get(self, ids, db=None, cur=None): - """Retrieve full content language per ids. - - Args: - ids (iterable): sha1 identifier - - Yields: - languages (iterable): dictionaries with keys: - - - **id** (bytes): sha1 identifier - - **lang** (bytes): raw content's language - - **tool** (dict): Tool used to compute the language - - """ for c in db.content_language_get_from_list(ids, cur): yield converters.db_to_language( dict(zip(db.content_language_cols, c))) - @remote_api_endpoint('content_language/add') @db_transaction() def content_language_add(self, languages, conflict_update=False, db=None, cur=None): - """Add languages not present in storage. 
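# Illustrative sketch of the cursor contract documented for
# content_mimetype_get_range: each call returns at most `limit` ids plus a
# 'next' sha1 to resume from, or None once the range is exhausted.
# `storage` and `tool` are assumed from the previous sketch.
start, end = b'\x00' * 20, b'\xff' * 20
while start is not None:
    page = storage.content_mimetype_get_range(start, end, tool['id'],
                                              limit=1000)
    for content_id in page['ids']:
        pass  # process one indexed content id here
    start = page['next']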
- - Args: - languages (iterable): dictionaries with keys: - - - **id** (bytes): sha1 - - **lang** (bytes): language detected - - conflict_update (bool): Flag to determine if we want to - overwrite (true) or skip duplicates (false, the - default) - - """ _check_id_duplicates(languages) languages.sort(key=lambda m: m['id']) db.mktemp_content_language(cur) # empty language is mapped to 'unknown' db.copy_to( ({ 'id': l['id'], 'lang': 'unknown' if not l['lang'] else l['lang'], 'indexer_configuration_id': l['indexer_configuration_id'], } for l in languages), 'tmp_content_language', ['id', 'lang', 'indexer_configuration_id'], cur) db.content_language_add_from_temp(conflict_update, cur) - @remote_api_endpoint('content/ctags/missing') @db_transaction_generator() def content_ctags_missing(self, ctags, db=None, cur=None): - """List ctags missing from storage. - - Args: - ctags (iterable): dicts with keys: - - - **id** (bytes): sha1 identifier - - **indexer_configuration_id** (int): tool used to compute - the results - - Yields: - an iterable of missing id for the tuple (id, - indexer_configuration_id) - - """ for obj in db.content_ctags_missing_from_list(ctags, cur): yield obj[0] - @remote_api_endpoint('content/ctags') @db_transaction_generator() def content_ctags_get(self, ids, db=None, cur=None): - """Retrieve ctags per id. - - Args: - ids (iterable): sha1 checksums - - Yields: - Dictionaries with keys: - - - **id** (bytes): content's identifier - - **name** (str): symbol's name - - **kind** (str): symbol's kind - - **lang** (str): language for that content - - **tool** (dict): tool used to compute the ctags' info - - - """ for c in db.content_ctags_get_from_list(ids, cur): yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, c))) - @remote_api_endpoint('content/ctags/add') @db_transaction() def content_ctags_add(self, ctags, conflict_update=False, db=None, cur=None): - """Add ctags not present in storage - - Args: - ctags (iterable): dictionaries with keys: - - - **id** (bytes): sha1 - - **ctags** ([list): List of dictionary with keys: name, kind, - line, lang - - """ _check_id_duplicates(ctags) ctags.sort(key=lambda m: m['id']) def _convert_ctags(__ctags): """Convert ctags dict to list of ctags. """ for ctags in __ctags: yield from converters.ctags_to_db(ctags) db.mktemp_content_ctags(cur) db.copy_to(list(_convert_ctags(ctags)), tblname='tmp_content_ctags', columns=['id', 'name', 'kind', 'line', 'lang', 'indexer_configuration_id'], cur=cur) db.content_ctags_add_from_temp(conflict_update, cur) - @remote_api_endpoint('content/ctags/search') @db_transaction_generator() def content_ctags_search(self, expression, limit=10, last_sha1=None, db=None, cur=None): - """Search through content's raw ctags symbols. - - Args: - expression (str): Expression to search for - limit (int): Number of rows to return (default to 10). - last_sha1 (str): Offset from which retrieving data (default to ''). - - Yields: - rows of ctags including id, name, lang, kind, line, etc... - - """ for obj in db.content_ctags_search(expression, last_sha1, limit, cur=cur): yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, obj))) - @remote_api_endpoint('content/fossology_license') @db_transaction_generator() def content_fossology_license_get(self, ids, db=None, cur=None): - """Retrieve licenses per id. 
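# Illustrative sketch for the ctags endpoints: symbols are attached to a
# content sha1 and can be searched back by exact symbol name. The symbols
# are invented; `storage`, `tool` and `sha1` come from the first sketch.
storage.content_ctags_add([{
    'id': sha1,
    'indexer_configuration_id': tool['id'],
    'ctags': [
        {'name': 'main', 'kind': 'function', 'line': 12, 'lang': 'C'},
        {'name': 'usage', 'kind': 'function', 'line': 45, 'lang': 'C'},
    ],
}])
for hit in storage.content_ctags_search('main', limit=10):
    print(hit['id'].hex(), hit['name'], hit['kind'], hit['line'], hit['lang'])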
- - Args: - ids (iterable): sha1 checksums - - Yields: - dict: ``{id: facts}`` where ``facts`` is a dict with the - following keys: - - - **licenses** ([str]): associated licenses for that content - - **tool** (dict): Tool used to compute the license - - """ d = defaultdict(list) for c in db.content_fossology_license_get_from_list(ids, cur): license = dict(zip(db.content_fossology_license_cols, c)) id_ = license['id'] d[id_].append(converters.db_to_fossology_license(license)) for id_, facts in d.items(): yield {id_: facts} - @remote_api_endpoint('content/fossology_license/add') @db_transaction() def content_fossology_license_add(self, licenses, conflict_update=False, db=None, cur=None): - """Add licenses not present in storage. - - Args: - licenses (iterable): dictionaries with keys: - - - **id**: sha1 - - **licenses** ([bytes]): List of licenses associated to sha1 - - **tool** (str): nomossa - - conflict_update: Flag to determine if we want to overwrite (true) - or skip duplicates (false, the default) - - Returns: - list: content_license entries which failed due to unknown licenses - - """ _check_id_duplicates(licenses) licenses.sort(key=lambda m: m['id']) db.mktemp_content_fossology_license(cur) db.copy_to( ({ 'id': sha1['id'], 'indexer_configuration_id': sha1['indexer_configuration_id'], 'license': license, } for sha1 in licenses for license in sha1['licenses']), tblname='tmp_content_fossology_license', columns=['id', 'license', 'indexer_configuration_id'], cur=cur) db.content_fossology_license_add_from_temp(conflict_update, cur) - @remote_api_endpoint('content/fossology_license/range') @db_transaction() def content_fossology_license_get_range( self, start, end, indexer_configuration_id, limit=1000, db=None, cur=None): - """Retrieve licenses within range [start, end] bound by limit. - - Args: - **start** (bytes): Starting identifier range (expected smaller - than end) - **end** (bytes): Ending identifier range (expected larger - than start) - **indexer_configuration_id** (int): The tool used to index data - **limit** (int): Limit result (default to 1000) - - Raises: - ValueError for limit to None - - Returns: - a dict with keys: - - **ids** [bytes]: iterable of content ids within the range. - - **next** (Optional[bytes]): The next range of sha1 starts at - this sha1 if any - - """ return self._content_get_range('fossology_license', start, end, indexer_configuration_id, limit=limit, with_textual_data=True, db=db, cur=cur) - @remote_api_endpoint('content_metadata/missing') @db_transaction_generator() def content_metadata_missing(self, metadata, db=None, cur=None): - """List metadata missing from storage. - - Args: - metadata (iterable): dictionaries with keys: - - - **id** (bytes): sha1 identifier - - **indexer_configuration_id** (int): tool used to compute - the results - - Yields: - missing sha1s - - """ for obj in db.content_metadata_missing_from_list(metadata, cur): yield obj[0] - @remote_api_endpoint('content_metadata') @db_transaction_generator() def content_metadata_get(self, ids, db=None, cur=None): - """Retrieve metadata per id. 
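# Illustrative sketch for the fossology license endpoints: licenses are
# stored as a list per (content, tool) pair and read back grouped by content
# id, as the docstring describes. The license names are placeholders;
# `storage`, `tool` and `sha1` come from the first sketch.
storage.content_fossology_license_add([{
    'id': sha1,
    'licenses': ['GPL-3.0-or-later', 'MIT'],
    'indexer_configuration_id': tool['id'],
}])
for row in storage.content_fossology_license_get([sha1]):
    for content_id, facts in row.items():
        for fact in facts:
            print(content_id.hex(), fact['licenses'], fact['tool']['name'])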
- - Args: - ids (iterable): sha1 checksums - - Yields: - dictionaries with the following keys: - - id (bytes) - metadata (str): associated metadata - tool (dict): tool used to compute metadata - - """ for c in db.content_metadata_get_from_list(ids, cur): yield converters.db_to_metadata( dict(zip(db.content_metadata_cols, c))) - @remote_api_endpoint('content_metadata/add') @db_transaction() def content_metadata_add(self, metadata, conflict_update=False, db=None, cur=None): - """Add metadata not present in storage. - - Args: - metadata (iterable): dictionaries with keys: - - - **id**: sha1 - - **metadata**: arbitrary dict - - conflict_update: Flag to determine if we want to overwrite (true) - or skip duplicates (false, the default) - - """ _check_id_duplicates(metadata) metadata.sort(key=lambda m: m['id']) db.mktemp_content_metadata(cur) db.copy_to(metadata, 'tmp_content_metadata', ['id', 'metadata', 'indexer_configuration_id'], cur) db.content_metadata_add_from_temp(conflict_update, cur) - @remote_api_endpoint('revision_intrinsic_metadata/missing') @db_transaction_generator() def revision_intrinsic_metadata_missing(self, metadata, db=None, cur=None): - """List metadata missing from storage. - - Args: - metadata (iterable): dictionaries with keys: - - - **id** (bytes): sha1_git revision identifier - - **indexer_configuration_id** (int): tool used to compute - the results - - Yields: - missing ids - - """ for obj in db.revision_intrinsic_metadata_missing_from_list( metadata, cur): yield obj[0] - @remote_api_endpoint('revision_intrinsic_metadata') @db_transaction_generator() def revision_intrinsic_metadata_get(self, ids, db=None, cur=None): - """Retrieve revision metadata per id. - - Args: - ids (iterable): sha1 checksums - - Yields: - : dictionaries with the following keys: - - - **id** (bytes) - - **metadata** (str): associated metadata - - **tool** (dict): tool used to compute metadata - - **mappings** (List[str]): list of mappings used to translate - these metadata - - """ for c in db.revision_intrinsic_metadata_get_from_list(ids, cur): yield converters.db_to_metadata( dict(zip(db.revision_intrinsic_metadata_cols, c))) - @remote_api_endpoint('revision_intrinsic_metadata/add') @db_transaction() def revision_intrinsic_metadata_add(self, metadata, conflict_update=False, db=None, cur=None): - """Add metadata not present in storage. - - Args: - metadata (iterable): dictionaries with keys: - - - **id**: sha1_git of revision - - **metadata**: arbitrary dict - - **indexer_configuration_id**: tool used to compute metadata - - **mappings** (List[str]): list of mappings used to translate - these metadata - - conflict_update: Flag to determine if we want to overwrite (true) - or skip duplicates (false, the default) - - """ _check_id_duplicates(metadata) metadata.sort(key=lambda m: m['id']) db.mktemp_revision_intrinsic_metadata(cur) db.copy_to(metadata, 'tmp_revision_intrinsic_metadata', ['id', 'metadata', 'mappings', 'indexer_configuration_id'], cur) db.revision_intrinsic_metadata_add_from_temp(conflict_update, cur) - @remote_api_endpoint('revision_intrinsic_metadata/delete') @db_transaction() def revision_intrinsic_metadata_delete(self, entries, db=None, cur=None): - """Remove revision metadata from the storage. 
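# Illustrative sketch for revision intrinsic metadata: each row records the
# metadata dict together with the mappings that produced it. The revision id
# and metadata values are placeholders; `storage` and `tool` come from the
# first sketch.
rev_id = bytes.fromhex('7026b7c1a2af56521e951c01ed20f255fa054238')
storage.revision_intrinsic_metadata_add([{
    'id': rev_id,
    'metadata': {'@context': 'https://doi.org/10.5063/schema/codemeta-2.0',
                 'name': 'example-project'},
    'mappings': ['npm'],
    'indexer_configuration_id': tool['id'],
}])
print(list(storage.revision_intrinsic_metadata_get([rev_id])))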
- - Args: - entries (dict): dictionaries with the following keys: - - - **id** (bytes): revision identifier - - **indexer_configuration_id** (int): tool used to compute - metadata - """ db.revision_intrinsic_metadata_delete(entries, cur) - @remote_api_endpoint('origin_intrinsic_metadata') @db_transaction_generator() def origin_intrinsic_metadata_get(self, ids, db=None, cur=None): - """Retrieve origin metadata per id. - - Args: - ids (iterable): origin identifiers - - Yields: - list: dictionaries with the following keys: - - - **id** (str): origin url - - **from_revision** (bytes): which revision this metadata - was extracted from - - **metadata** (str): associated metadata - - **tool** (dict): tool used to compute metadata - - **mappings** (List[str]): list of mappings used to translate - these metadata - - """ for c in db.origin_intrinsic_metadata_get_from_list(ids, cur): yield converters.db_to_metadata( dict(zip(db.origin_intrinsic_metadata_cols, c))) - @remote_api_endpoint('origin_intrinsic_metadata/add') @db_transaction() def origin_intrinsic_metadata_add(self, metadata, conflict_update=False, db=None, cur=None): - """Add origin metadata not present in storage. - - Args: - metadata (iterable): dictionaries with keys: - - - **id**: origin urls - - **from_revision**: sha1 id of the revision used to generate - these metadata. - - **metadata**: arbitrary dict - - **indexer_configuration_id**: tool used to compute metadata - - **mappings** (List[str]): list of mappings used to translate - these metadata - - conflict_update: Flag to determine if we want to overwrite (true) - or skip duplicates (false, the default) - - """ _check_id_duplicates(metadata) metadata.sort(key=lambda m: m['id']) db.mktemp_origin_intrinsic_metadata(cur) db.copy_to(metadata, 'tmp_origin_intrinsic_metadata', ['id', 'metadata', 'indexer_configuration_id', 'from_revision', 'mappings'], cur) db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur) - @remote_api_endpoint('origin_intrinsic_metadata/delete') @db_transaction() def origin_intrinsic_metadata_delete( self, entries, db=None, cur=None): - """Remove origin metadata from the storage. - - Args: - entries (dict): dictionaries with the following keys: - - - **id** (str): origin urls - - **indexer_configuration_id** (int): tool used to compute - metadata - """ db.origin_intrinsic_metadata_delete(entries, cur) - @remote_api_endpoint('origin_intrinsic_metadata/search/fulltext') @db_transaction_generator() def origin_intrinsic_metadata_search_fulltext( self, conjunction, limit=100, db=None, cur=None): - """Returns the list of origins whose metadata contain all the terms. - - Args: - conjunction (List[str]): List of terms to be searched for. - limit (int): The maximum number of results to return - - Yields: - list: dictionaries with the following keys: - - - **id** (str): origin urls - - **from_revision**: sha1 id of the revision used to generate - these metadata. 
- - **metadata** (str): associated metadata - - **tool** (dict): tool used to compute metadata - - **mappings** (List[str]): list of mappings used to translate - these metadata - - """ for c in db.origin_intrinsic_metadata_search_fulltext( conjunction, limit=limit, cur=cur): yield converters.db_to_metadata( dict(zip(db.origin_intrinsic_metadata_cols, c))) - @remote_api_endpoint('origin_intrinsic_metadata/search/by_producer') @db_transaction() def origin_intrinsic_metadata_search_by_producer( self, page_token='', limit=100, ids_only=False, mappings=None, tool_ids=None, db=None, cur=None): - """Returns the list of origins whose metadata contain all the terms. - - Args: - page_token (str): Opaque token used for pagination. - limit (int): The maximum number of results to return - ids_only (bool): Determines whether only origin urls are - returned or the content as well - mappings (List[str]): Returns origins whose intrinsic metadata - were generated using at least one of these mappings. - - Returns: - dict: dict with the following keys: - - **next_page_token** (str, optional): opaque token to be used as - `page_token` for retrieving the next page. If absent, there is - no more pages to gather. - - **origins** (list): list of origin url (str) if `ids_only=True` - else dictionaries with the following keys: - - - **id** (str): origin urls - - **from_revision**: sha1 id of the revision used to generate - these metadata. - - **metadata** (str): associated metadata - - **tool** (dict): tool used to compute metadata - - **mappings** (List[str]): list of mappings used to translate - these metadata - - """ assert isinstance(page_token, str) # we go to limit+1 to check whether we should add next_page_token in # the response res = db.origin_intrinsic_metadata_search_by_producer( page_token, limit + 1, ids_only, mappings, tool_ids, cur) result = {} if ids_only: result['origins'] = [origin for (origin,) in res] if len(result['origins']) > limit: result['origins'][limit:] = [] result['next_page_token'] = result['origins'][-1] else: result['origins'] = [converters.db_to_metadata( dict(zip(db.origin_intrinsic_metadata_cols, c)))for c in res] if len(result['origins']) > limit: result['origins'][limit:] = [] result['next_page_token'] = result['origins'][-1]['id'] return result - @remote_api_endpoint('origin_intrinsic_metadata/stats') @db_transaction() def origin_intrinsic_metadata_stats( self, db=None, cur=None): - """Returns counts of indexed metadata per origins, broken down - into metadata types. - - Returns: - dict: dictionary with keys: - - - total (int): total number of origins that were indexed - (possibly yielding an empty metadata dictionary) - - non_empty (int): total number of origins that we extracted - a non-empty metadata dictionary from - - per_mapping (dict): a dictionary with mapping names as - keys and number of origins whose indexing used this - mapping. Note that indexing a given origin may use - 0, 1, or many mappings. 
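# Illustrative sketch for origin intrinsic metadata: one origin is added,
# then origins indexed with the 'npm' mapping are paged through using the
# opaque next_page_token contract described above. The origin url is a
# placeholder; `storage`, `tool` and `rev_id` come from earlier sketches.
storage.origin_intrinsic_metadata_add([{
    'id': 'https://github.com/example/example-project',
    'from_revision': rev_id,
    'metadata': {'name': 'example-project'},
    'mappings': ['npm'],
    'indexer_configuration_id': tool['id'],
}])
page_token = ''
while True:
    page = storage.origin_intrinsic_metadata_search_by_producer(
        page_token=page_token, limit=100, ids_only=True, mappings=['npm'])
    for origin_url in page['origins']:
        print(origin_url)
    page_token = page.get('next_page_token')
    if page_token is None:
        break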
- """ mapping_names = [m for m in MAPPING_NAMES] select_parts = [] # Count rows for each mapping for mapping_name in mapping_names: select_parts.append(( "sum(case when (mappings @> ARRAY['%s']) " " then 1 else 0 end)" ) % mapping_name) # Total select_parts.append("sum(1)") # Rows whose metadata has at least one key that is not '@context' select_parts.append( "sum(case when ('{}'::jsonb @> (metadata - '@context')) " " then 0 else 1 end)") cur.execute('select ' + ', '.join(select_parts) + ' from origin_intrinsic_metadata') results = dict(zip(mapping_names + ['total', 'non_empty'], cur.fetchone())) return { 'total': results.pop('total'), 'non_empty': results.pop('non_empty'), 'per_mapping': results, } - @remote_api_endpoint('indexer_configuration/add') @db_transaction_generator() def indexer_configuration_add(self, tools, db=None, cur=None): - """Add new tools to the storage. - - Args: - tools ([dict]): List of dictionary representing tool to - insert in the db. Dictionary with the following keys: - - - **tool_name** (str): tool's name - - **tool_version** (str): tool's version - - **tool_configuration** (dict): tool's configuration - (free form dict) - - Returns: - List of dict inserted in the db (holding the id key as - well). The order of the list is not guaranteed to match - the order of the initial list. - - """ db.mktemp_indexer_configuration(cur) db.copy_to(tools, 'tmp_indexer_configuration', ['tool_name', 'tool_version', 'tool_configuration'], cur) tools = db.indexer_configuration_add_from_temp(cur) for line in tools: yield dict(zip(db.indexer_configuration_cols, line)) - @remote_api_endpoint('indexer_configuration/data') @db_transaction() def indexer_configuration_get(self, tool, db=None, cur=None): - """Retrieve tool information. - - Args: - tool (dict): Dictionary representing a tool with the - following keys: - - - **tool_name** (str): tool's name - - **tool_version** (str): tool's version - - **tool_configuration** (dict): tool's configuration - (free form dict) - - Returns: - The same dictionary with an `id` key, None otherwise. - - """ tool_conf = tool['tool_configuration'] if isinstance(tool_conf, dict): tool_conf = json.dumps(tool_conf) idx = db.indexer_configuration_get(tool['tool_name'], tool['tool_version'], tool_conf) if not idx: return None return dict(zip(db.indexer_configuration_cols, idx)) diff --git a/swh/indexer/storage/api/client.py b/swh/indexer/storage/api/client.py index ec4c234..0e62adc 100644 --- a/swh/indexer/storage/api/client.py +++ b/swh/indexer/storage/api/client.py @@ -1,17 +1,17 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.core.api import RPCClient from swh.storage.exc import StorageAPIError -from .. 
import IndexerStorage +from ..interface import IndexerStorageInterface class RemoteStorage(RPCClient): """Proxy to a remote storage API""" - backend_class = IndexerStorage + backend_class = IndexerStorageInterface api_exception = StorageAPIError diff --git a/swh/indexer/storage/api/server.py b/swh/indexer/storage/api/server.py index 7edfec6..2b71f2b 100644 --- a/swh/indexer/storage/api/server.py +++ b/swh/indexer/storage/api/server.py @@ -1,106 +1,107 @@ # Copyright (C) 2015-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import logging from swh.core import config from swh.core.api import (RPCServerApp, error_handler, encode_data_server as encode_data) from swh.indexer.storage import ( - get_indexer_storage, INDEXER_CFG_KEY, IndexerStorage + get_indexer_storage, INDEXER_CFG_KEY ) +from swh.indexer.storage.interface import IndexerStorageInterface def get_storage(): global storage if not storage: storage = get_indexer_storage(**app.config[INDEXER_CFG_KEY]) return storage app = RPCServerApp(__name__, - backend_class=IndexerStorage, + backend_class=IndexerStorageInterface, backend_factory=get_storage) storage = None @app.errorhandler(Exception) def my_error_handler(exception): return error_handler(exception, encode_data) @app.route('/') def index(): return 'SWH Indexer Storage API server' api_cfg = None def load_and_check_config(config_file, type='local'): """Check the minimal configuration is set to run the api or raise an error explanation. Args: config_file (str): Path to the configuration file to load type (str): configuration type. For 'local' type, more checks are done. Raises: Error if the setup is not as expected Returns: configuration as a dict """ if not config_file: raise EnvironmentError('Configuration file must be defined') if not os.path.exists(config_file): raise FileNotFoundError('Configuration file %s does not exist' % ( config_file, )) cfg = config.read(config_file) if 'indexer_storage' not in cfg: raise KeyError("Missing '%indexer_storage' configuration") if type == 'local': vcfg = cfg['indexer_storage'] cls = vcfg.get('cls') if cls != 'local': raise ValueError( "The indexer_storage backend can only be started with a " "'local' configuration") args = vcfg['args'] if not args.get('db'): raise ValueError( "Invalid configuration; missing 'db' config entry") return cfg def make_app_from_configfile(): """Run the WSGI app from the webserver, loading the configuration from a configuration file. SWH_CONFIG_FILENAME environment variable defines the configuration path to load. """ global api_cfg if not api_cfg: config_file = os.environ.get('SWH_CONFIG_FILENAME') api_cfg = load_and_check_config(config_file) app.config.update(api_cfg) handler = logging.StreamHandler() app.logger.addHandler(handler) return app if __name__ == '__main__': print('Deprecated. 
Use swh-indexer') diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py index dcb0cea..74a41a5 100644 --- a/swh/indexer/storage/in_memory.py +++ b/swh/indexer/storage/in_memory.py @@ -1,849 +1,421 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import bisect from collections import defaultdict, Counter import itertools import json import operator import math import re from . import MAPPING_NAMES SHA1_DIGEST_SIZE = 160 def _transform_tool(tool): return { 'id': tool['id'], 'name': tool['tool_name'], 'version': tool['tool_version'], 'configuration': tool['tool_configuration'], } class SubStorage: """Implements common missing/get/add logic for each indexer type.""" def __init__(self, tools): self._tools = tools self._sorted_ids = [] self._data = {} # map (id_, tool_id) -> metadata_dict self._tools_per_id = defaultdict(set) # map id_ -> Set[tool_id] def missing(self, ids): """List data missing from storage. Args: data (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: missing sha1s """ for id_ in ids: tool_id = id_['indexer_configuration_id'] id_ = id_['id'] if tool_id not in self._tools_per_id.get(id_, set()): yield id_ def get(self, ids): """Retrieve data per id. Args: ids (iterable): sha1 checksums Yields: dict: dictionaries with the following keys: - **id** (bytes) - **tool** (dict): tool used to compute metadata - arbitrary data (as provided to `add`) """ for id_ in ids: for tool_id in self._tools_per_id.get(id_, set()): key = (id_, tool_id) yield { 'id': id_, 'tool': _transform_tool(self._tools[tool_id]), **self._data[key], } def get_all(self): yield from self.get(self._sorted_ids) def get_range(self, start, end, indexer_configuration_id, limit): """Retrieve data within range [start, end] bound by limit. Args: **start** (bytes): Starting identifier range (expected smaller than end) **end** (bytes): Ending identifier range (expected larger than start) **indexer_configuration_id** (int): The tool used to index data **limit** (int): Limit result Raises: ValueError for limit to None Returns: a dict with keys: - **ids** [bytes]: iterable of content ids within the range. - **next** (Optional[bytes]): The next range of sha1 starts at this sha1 if any """ if limit is None: raise ValueError('Development error: limit should not be None') from_index = bisect.bisect_left(self._sorted_ids, start) to_index = bisect.bisect_right(self._sorted_ids, end, lo=from_index) if to_index - from_index >= limit: return { 'ids': self._sorted_ids[from_index:from_index+limit], 'next': self._sorted_ids[from_index+limit], } else: return { 'ids': self._sorted_ids[from_index:to_index], 'next': None, } def add(self, data, conflict_update): """Add data not present in storage. 
Args: data (iterable): dictionaries with keys: - **id**: sha1 - **indexer_configuration_id**: tool used to compute the results - arbitrary data conflict_update (bool): Flag to determine if we want to overwrite (true) or skip duplicates (false) """ data = list(data) if len({x['id'] for x in data}) < len(data): # For "exception-compatibility" with the pgsql backend raise ValueError('The same id is present more than once.') for item in data: item = item.copy() tool_id = item.pop('indexer_configuration_id') id_ = item.pop('id') data = item if not conflict_update and \ tool_id in self._tools_per_id.get(id_, set()): # Duplicate, should not be updated continue key = (id_, tool_id) self._data[key] = data self._tools_per_id[id_].add(tool_id) if id_ not in self._sorted_ids: bisect.insort(self._sorted_ids, id_) def add_merge(self, new_data, conflict_update, merged_key): for new_item in new_data: id_ = new_item['id'] tool_id = new_item['indexer_configuration_id'] if conflict_update: all_subitems = [] else: existing = list(self.get([id_])) all_subitems = [ old_subitem for existing_item in existing if existing_item['tool']['id'] == tool_id for old_subitem in existing_item[merged_key] ] for new_subitem in new_item[merged_key]: if new_subitem not in all_subitems: all_subitems.append(new_subitem) self.add([ { 'id': id_, 'indexer_configuration_id': tool_id, merged_key: all_subitems, } ], conflict_update=True) if id_ not in self._sorted_ids: bisect.insort(self._sorted_ids, id_) def delete(self, entries): for entry in entries: (id_, tool_id) = (entry['id'], entry['indexer_configuration_id']) key = (id_, tool_id) if tool_id in self._tools_per_id[id_]: self._tools_per_id[id_].remove(tool_id) if key in self._data: del self._data[key] class IndexerStorage: """In-memory SWH indexer storage.""" def __init__(self): self._tools = {} self._mimetypes = SubStorage(self._tools) self._languages = SubStorage(self._tools) self._content_ctags = SubStorage(self._tools) self._licenses = SubStorage(self._tools) self._content_metadata = SubStorage(self._tools) self._revision_intrinsic_metadata = SubStorage(self._tools) self._origin_intrinsic_metadata = SubStorage(self._tools) def check_config(self, *, check_write): return True def content_mimetype_missing(self, mimetypes): - """Generate mimetypes missing from storage. - - Args: - mimetypes (iterable): iterable of dict with keys: - - - **id** (bytes): sha1 identifier - - **indexer_configuration_id** (int): tool used to compute the - results - - Yields: - tuple (id, indexer_configuration_id): missing id - - """ yield from self._mimetypes.missing(mimetypes) def content_mimetype_get_range( self, start, end, indexer_configuration_id, limit=1000): - """Retrieve mimetypes within range [start, end] bound by limit. - - Args: - **start** (bytes): Starting identifier range (expected smaller - than end) - **end** (bytes): Ending identifier range (expected larger - than start) - **indexer_configuration_id** (int): The tool used to index data - **limit** (int): Limit result (default to 1000) - - Raises: - ValueError for limit to None - - Returns: - a dict with keys: - - **ids** [bytes]: iterable of content ids within the range. - - **next** (Optional[bytes]): The next range of sha1 starts at - this sha1 if any - - """ return self._mimetypes.get_range( start, end, indexer_configuration_id, limit) def content_mimetype_add(self, mimetypes, conflict_update=False): - """Add mimetypes not present in storage. 
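# Illustrative sketch of the missing-then-add round trip an indexer typically
# performs against this in-memory backend: only ids not yet indexed by the
# given tool are (re)computed and written. `storage`, `tool` and `sha1` are
# assumed from the first sketch.
candidates = [{'id': sha1, 'indexer_configuration_id': tool['id']}]
todo = set(storage.content_mimetype_missing(candidates))
if sha1 in todo:
    storage.content_mimetype_add([{
        'id': sha1,
        'mimetype': b'text/x-python',
        'encoding': b'us-ascii',
        'indexer_configuration_id': tool['id'],
    }])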
- - Args: - mimetypes (iterable): dictionaries with keys: - - - **id** (bytes): sha1 identifier - - **mimetype** (bytes): raw content's mimetype - - **encoding** (bytes): raw content's encoding - - **indexer_configuration_id** (int): tool's id used to - compute the results - - **conflict_update** (bool): Flag to determine if we want to - overwrite (``True``) or skip duplicates (``False``, the - default) - - """ if not all(isinstance(x['id'], bytes) for x in mimetypes): raise TypeError('identifiers must be bytes.') self._mimetypes.add(mimetypes, conflict_update) - def content_mimetype_get(self, ids, db=None, cur=None): - """Retrieve full content mimetype per ids. - - Args: - ids (iterable): sha1 identifier - - Yields: - mimetypes (iterable): dictionaries with keys: - - - **id** (bytes): sha1 identifier - - **mimetype** (bytes): raw content's mimetype - - **encoding** (bytes): raw content's encoding - - **tool** (dict): Tool used to compute the language - - """ + def content_mimetype_get(self, ids): yield from self._mimetypes.get(ids) def content_language_missing(self, languages): - """List languages missing from storage. - - Args: - languages (iterable): dictionaries with keys: - - - **id** (bytes): sha1 identifier - - **indexer_configuration_id** (int): tool used to compute - the results - - Yields: - an iterable of missing id for the tuple (id, - indexer_configuration_id) - - """ yield from self._languages.missing(languages) def content_language_get(self, ids): - """Retrieve full content language per ids. - - Args: - ids (iterable): sha1 identifier - - Yields: - languages (iterable): dictionaries with keys: - - - **id** (bytes): sha1 identifier - - **lang** (bytes): raw content's language - - **tool** (dict): Tool used to compute the language - - """ yield from self._languages.get(ids) def content_language_add(self, languages, conflict_update=False): - """Add languages not present in storage. - - Args: - languages (iterable): dictionaries with keys: - - - **id** (bytes): sha1 - - **lang** (bytes): language detected - - conflict_update (bool): Flag to determine if we want to - overwrite (true) or skip duplicates (false, the - default) - - """ if not all(isinstance(x['id'], bytes) for x in languages): raise TypeError('identifiers must be bytes.') self._languages.add(languages, conflict_update) def content_ctags_missing(self, ctags): - """List ctags missing from storage. - - Args: - ctags (iterable): dicts with keys: - - - **id** (bytes): sha1 identifier - - **indexer_configuration_id** (int): tool used to compute - the results - - Yields: - an iterable of missing id for the tuple (id, - indexer_configuration_id) - - """ yield from self._content_ctags.missing(ctags) def content_ctags_get(self, ids): - """Retrieve ctags per id. 
- - Args: - ids (iterable): sha1 checksums - - Yields: - Dictionaries with keys: - - - **id** (bytes): content's identifier - - **name** (str): symbol's name - - **kind** (str): symbol's kind - - **lang** (str): language for that content - - **tool** (dict): tool used to compute the ctags' info - - - """ for item in self._content_ctags.get(ids): for item_ctags_item in item['ctags']: yield { 'id': item['id'], 'tool': item['tool'], **item_ctags_item } def content_ctags_add(self, ctags, conflict_update=False): - """Add ctags not present in storage - - Args: - ctags (iterable): dictionaries with keys: - - - **id** (bytes): sha1 - - **ctags** ([list): List of dictionary with keys: name, kind, - line, lang - - **indexer_configuration_id**: tool used to compute the - results - - """ if not all(isinstance(x['id'], bytes) for x in ctags): raise TypeError('identifiers must be bytes.') self._content_ctags.add_merge(ctags, conflict_update, 'ctags') def content_ctags_search(self, expression, - limit=10, last_sha1=None, db=None, cur=None): - """Search through content's raw ctags symbols. - - Args: - expression (str): Expression to search for - limit (int): Number of rows to return (default to 10). - last_sha1 (str): Offset from which retrieving data (default to ''). - - Yields: - rows of ctags including id, name, lang, kind, line, etc... - - """ + limit=10, last_sha1=None): nb_matches = 0 for ((id_, tool_id), item) in \ sorted(self._content_ctags._data.items()): if id_ <= (last_sha1 or bytes(0 for _ in range(SHA1_DIGEST_SIZE))): continue for ctags_item in item['ctags']: if ctags_item['name'] != expression: continue nb_matches += 1 yield { 'id': id_, 'tool': _transform_tool(self._tools[tool_id]), **ctags_item } if nb_matches >= limit: return def content_fossology_license_get(self, ids): - """Retrieve licenses per id. - - Args: - ids (iterable): sha1 checksums - - Yields: - dict: ``{id: facts}`` where ``facts`` is a dict with the - following keys: - - - **licenses** ([str]): associated licenses for that content - - **tool** (dict): Tool used to compute the license - - """ # Rewrites the output of SubStorage.get from the old format to # the new one. SubStorage.get should be updated once all other # *_get methods use the new format. # See: https://forge.softwareheritage.org/T1433 res = {} for d in self._licenses.get(ids): res.setdefault(d.pop('id'), []).append(d) for (id_, facts) in res.items(): yield {id_: facts} def content_fossology_license_add(self, licenses, conflict_update=False): - """Add licenses not present in storage. - - Args: - licenses (iterable): dictionaries with keys: - - - **id**: sha1 - - **licenses** ([bytes]): List of licenses associated to sha1 - - **tool** (str): nomossa - - conflict_update: Flag to determine if we want to overwrite (true) - or skip duplicates (false, the default) - - Returns: - list: content_license entries which failed due to unknown licenses - - """ if not all(isinstance(x['id'], bytes) for x in licenses): raise TypeError('identifiers must be bytes.') self._licenses.add_merge(licenses, conflict_update, 'licenses') def content_fossology_license_get_range( self, start, end, indexer_configuration_id, limit=1000): - """Retrieve licenses within range [start, end] bound by limit. 
- - Args: - **start** (bytes): Starting identifier range (expected smaller - than end) - **end** (bytes): Ending identifier range (expected larger - than start) - **indexer_configuration_id** (int): The tool used to index data - **limit** (int): Limit result (default to 1000) - - Raises: - ValueError for limit to None - - Returns: - a dict with keys: - - **ids** [bytes]: iterable of content ids within the range. - - **next** (Optional[bytes]): The next range of sha1 starts at - this sha1 if any - - """ return self._licenses.get_range( start, end, indexer_configuration_id, limit) def content_metadata_missing(self, metadata): - """List metadata missing from storage. - - Args: - metadata (iterable): dictionaries with keys: - - - **id** (bytes): sha1 identifier - - **indexer_configuration_id** (int): tool used to compute - the results - - Yields: - missing sha1s - - """ yield from self._content_metadata.missing(metadata) def content_metadata_get(self, ids): - """Retrieve metadata per id. - - Args: - ids (iterable): sha1 checksums - - Yields: - dictionaries with the following keys: - - - **id** (bytes) - - **metadata** (str): associated metadata - - **tool** (dict): tool used to compute metadata - - """ yield from self._content_metadata.get(ids) def content_metadata_add(self, metadata, conflict_update=False): - """Add metadata not present in storage. - - Args: - metadata (iterable): dictionaries with keys: - - - **id**: sha1 - - **metadata**: arbitrary dict - - **indexer_configuration_id**: tool used to compute the - results - - conflict_update: Flag to determine if we want to overwrite (true) - or skip duplicates (false, the default) - - """ if not all(isinstance(x['id'], bytes) for x in metadata): raise TypeError('identifiers must be bytes.') self._content_metadata.add(metadata, conflict_update) def revision_intrinsic_metadata_missing(self, metadata): - """List metadata missing from storage. - - Args: - metadata (iterable): dictionaries with keys: - - - **id** (bytes): sha1_git revision identifier - - **indexer_configuration_id** (int): tool used to compute - the results - - Yields: - missing ids - - """ yield from self._revision_intrinsic_metadata.missing(metadata) def revision_intrinsic_metadata_get(self, ids): - """Retrieve revision metadata per id. - - Args: - ids (iterable): sha1 checksums - - Yields: - dictionaries with the following keys: - - - **id** (bytes) - - **metadata** (str): associated metadata - - **tool** (dict): tool used to compute metadata - - **mappings** (List[str]): list of mappings used to translate - these metadata - - """ yield from self._revision_intrinsic_metadata.get(ids) def revision_intrinsic_metadata_add(self, metadata, conflict_update=False): - """Add metadata not present in storage. - - Args: - metadata (iterable): dictionaries with keys: - - - **id**: sha1_git of revision - - **metadata**: arbitrary dict - - **indexer_configuration_id**: tool used to compute metadata - - **mappings** (List[str]): list of mappings used to translate - these metadata - - conflict_update: Flag to determine if we want to overwrite (true) - or skip duplicates (false, the default) - - """ if not all(isinstance(x['id'], bytes) for x in metadata): raise TypeError('identifiers must be bytes.') self._revision_intrinsic_metadata.add(metadata, conflict_update) def revision_intrinsic_metadata_delete(self, entries): - """Remove revision metadata from the storage. 
- - Args: - entries (dict): dictionaries with the following keys: - - **revision** (int): origin identifier - - **id** (int): tool used to compute metadata - """ self._revision_intrinsic_metadata.delete(entries) def origin_intrinsic_metadata_get(self, ids): - """Retrieve origin metadata per id. - - Args: - ids (iterable): origin identifiers - - Yields: - list: dictionaries with the following keys: - - - **id** (str): origin url - - **from_revision** (bytes): which revision this metadata - was extracted from - - **metadata** (str): associated metadata - - **tool** (dict): tool used to compute metadata - - **mappings** (List[str]): list of mappings used to translate - these metadata - - """ yield from self._origin_intrinsic_metadata.get(ids) def origin_intrinsic_metadata_add(self, metadata, conflict_update=False): - """Add origin metadata not present in storage. - - Args: - metadata (iterable): dictionaries with keys: - - - **id**: origin url - - **from_revision**: sha1 id of the revision used to generate - these metadata. - - **metadata**: arbitrary dict - - **indexer_configuration_id**: tool used to compute metadata - - **mappings** (List[str]): list of mappings used to translate - these metadata - - conflict_update: Flag to determine if we want to overwrite (true) - or skip duplicates (false, the default) - - """ self._origin_intrinsic_metadata.add(metadata, conflict_update) def origin_intrinsic_metadata_delete(self, entries): - """Remove origin metadata from the storage. - - Args: - entries (dict): dictionaries with the following keys: - - - **id** (str): origin url - - **indexer_configuration_id** (int): tool used to compute - metadata - """ self._origin_intrinsic_metadata.delete(entries) def origin_intrinsic_metadata_search_fulltext( self, conjunction, limit=100): - """Returns the list of origins whose metadata contain all the terms. - - Args: - conjunction (List[str]): List of terms to be searched for. - limit (int): The maximum number of results to return - - Yields: - list: dictionaries with the following keys: - - - **id** (str): origin url - - **from_revision** (bytes): which revision this metadata - was extracted from - - **metadata** (str): associated metadata - - **tool** (dict): tool used to compute metadata - - **mappings** (List[str]): list of mappings used to translate - these metadata - - """ # A very crude fulltext search implementation, but that's enough # to work on English metadata tokens_re = re.compile('[a-zA-Z0-9]+') search_tokens = list(itertools.chain( *map(tokens_re.findall, conjunction))) def rank(data): # Tokenize the metadata text = json.dumps(data['metadata']) text_tokens = tokens_re.findall(text) text_token_occurences = Counter(text_tokens) # Count the number of occurrences of search tokens in the text score = 0 for search_token in search_tokens: if text_token_occurences[search_token] == 0: # Search token is not in the text. 
return 0 score += text_token_occurences[search_token] # Normalize according to the text's length return score / math.log(len(text_tokens)) results = [(rank(data), data) for data in self._origin_intrinsic_metadata.get_all()] results = [(rank_, data) for (rank_, data) in results if rank_ > 0] results.sort(key=operator.itemgetter(0), # Don't try to order 'data' reverse=True) for (rank_, result) in results[:limit]: yield result def origin_intrinsic_metadata_search_by_producer( self, page_token='', limit=100, ids_only=False, - mappings=None, tool_ids=None, - db=None, cur=None): - """Returns the list of origins whose metadata contain all the terms. - - Args: - page_token (str): Opaque token used for pagination. - limit (int): The maximum number of results to return - ids_only (bool): Determines whether only origin ids are returned - or the content as well - mappings (List[str]): Returns origins whose intrinsic metadata - were generated using at least one of these mappings. - - Returns: - dict: dict with the following keys: - - **next_page_token** (str, optional): opaque token to be used as - `page_token` for retrieveing the next page. - - **origins** (list): list of origin url (str) if `ids_only=True` - else dictionaries with the following keys: - - - **id** (str): origin urls - - **from_revision**: sha1 id of the revision used to generate - these metadata. - - **metadata** (str): associated metadata - - **tool** (dict): tool used to compute metadata - - **mappings** (List[str]): list of mappings used to translate - these metadata - - """ + mappings=None, tool_ids=None): assert isinstance(page_token, str) nb_results = 0 if mappings is not None: mappings = frozenset(mappings) if tool_ids is not None: tool_ids = frozenset(tool_ids) origins = [] # we go to limit+1 to check whether we should add next_page_token in # the response for entry in self._origin_intrinsic_metadata.get_all(): if entry['id'] <= page_token: continue if nb_results >= (limit + 1): break if mappings is not None and mappings.isdisjoint(entry['mappings']): continue if tool_ids is not None and entry['tool']['id'] not in tool_ids: continue origins.append(entry) nb_results += 1 result = {} if len(origins) > limit: origins = origins[:limit] result['next_page_token'] = origins[-1]['id'] if ids_only: origins = [origin['id'] for origin in origins] result['origins'] = origins return result def origin_intrinsic_metadata_stats(self): - """Returns statistics on stored intrinsic metadata. - - Returns: - dict: dictionary with keys: - - - total (int): total number of origins that were indexed - (possibly yielding an empty metadata dictionary) - - non_empty (int): total number of origins that we extracted - a non-empty metadata dictionary from - - per_mapping (dict): a dictionary with mapping names as - keys and number of origins whose indexing used this - mapping. Note that indexing a given origin may use - 0, 1, or many mappings. - """ mapping_count = {m: 0 for m in MAPPING_NAMES} total = non_empty = 0 for data in self._origin_intrinsic_metadata.get_all(): total += 1 if set(data['metadata']) - {'@context'}: non_empty += 1 for mapping in data['mappings']: mapping_count[mapping] += 1 return { 'per_mapping': mapping_count, 'total': total, 'non_empty': non_empty } def indexer_configuration_add(self, tools): - """Add new tools to the storage. - - Args: - tools ([dict]): List of dictionary representing tool to - insert in the db. 
Dictionary with the following keys: - - - **tool_name** (str): tool's name - - **tool_version** (str): tool's version - - **tool_configuration** (dict): tool's configuration - (free form dict) - - Returns: - list: List of dict inserted in the db (holding the id key as - well). The order of the list is not guaranteed to match - the order of the initial list. - - """ inserted = [] for tool in tools: tool = tool.copy() id_ = self._tool_key(tool) tool['id'] = id_ self._tools[id_] = tool inserted.append(tool) return inserted def indexer_configuration_get(self, tool): - """Retrieve tool information. - - Args: - tool (dict): Dictionary representing a tool with the - following keys: - - - **tool_name** (str): tool's name - - **tool_version** (str): tool's version - - **tool_configuration** (dict): tool's configuration - (free form dict) - - Returns: - The same dictionary with an `id` key, None otherwise. - - """ return self._tools.get(self._tool_key(tool)) def _tool_key(self, tool): return hash((tool['tool_name'], tool['tool_version'], json.dumps(tool['tool_configuration'], sort_keys=True))) diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/interface.py similarity index 54% copy from swh/indexer/storage/__init__.py copy to swh/indexer/storage/interface.py index 4eaa7d5..a884f1c 100644 --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/interface.py @@ -1,938 +1,600 @@ -# Copyright (C) 2015-2018 The Software Heritage developers +# Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import json -import psycopg2 - -from collections import defaultdict - from swh.core.api import remote_api_endpoint -from swh.storage.common import db_transaction_generator, db_transaction -from swh.storage.exc import StorageDBError -from .db import Db - -from . import converters - - -INDEXER_CFG_KEY = 'indexer_storage' - - -MAPPING_NAMES = ['codemeta', 'gemspec', 'maven', 'npm', 'pkg-info'] - - -def get_indexer_storage(cls, args): - """Get an indexer storage object of class `storage_class` with - arguments `storage_args`. - - Args: - cls (str): storage's class, either 'local' or 'remote' - args (dict): dictionary of arguments passed to the - storage class constructor - - Returns: - an instance of swh.indexer's storage (either local or remote) - - Raises: - ValueError if passed an unknown storage class. - - """ - if cls == 'remote': - from .api.client import RemoteStorage as IndexerStorage - elif cls == 'local': - from . import IndexerStorage - elif cls == 'memory': - from .in_memory import IndexerStorage - else: - raise ValueError('Unknown indexer storage class `%s`' % cls) - - return IndexerStorage(**args) - - -def _check_id_duplicates(data): - """ - If any two dictionaries in `data` have the same id, raises - a `ValueError`. - - Values associated to the key must be hashable. - - Args: - data (List[dict]): List of dictionaries to be inserted - - >>> _check_id_duplicates([ - ... {'id': 'foo', 'data': 'spam'}, - ... {'id': 'bar', 'data': 'egg'}, - ... ]) - >>> _check_id_duplicates([ - ... {'id': 'foo', 'data': 'spam'}, - ... {'id': 'foo', 'data': 'egg'}, - ... ]) - Traceback (most recent call last): - ... - ValueError: The same id is present more than once. 
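# Illustrative sketch of tool deduplication: indexer_configuration_get only
# returns a registered tool (with its 'id') for an exact match on name,
# version and configuration, which is what _tool_key above encodes.
# `storage` and `tool` are assumed from the first sketch.
same_tool = storage.indexer_configuration_get({
    'tool_name': 'file',
    'tool_version': '5.22',
    'tool_configuration': {'command_line': 'file --mime <filepath>'},
})
assert same_tool is not None and same_tool['id'] == tool['id']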
- """ - if len({item['id'] for item in data}) < len(data): - raise ValueError('The same id is present more than once.') -class IndexerStorage: - """SWH Indexer Storage - - """ - def __init__(self, db, min_pool_conns=1, max_pool_conns=10): - """ - Args: - db_conn: either a libpq connection string, or a psycopg2 connection - - """ - try: - if isinstance(db, psycopg2.extensions.connection): - self._pool = None - self._db = Db(db) - else: - self._pool = psycopg2.pool.ThreadedConnectionPool( - min_pool_conns, max_pool_conns, db - ) - self._db = None - except psycopg2.OperationalError as e: - raise StorageDBError(e) - - def get_db(self): - if self._db: - return self._db - return Db.from_pool(self._pool) - - def put_db(self, db): - if db is not self._db: - db.put_conn() - +class IndexerStorageInterface: @remote_api_endpoint('check_config') - @db_transaction() - def check_config(self, *, check_write, db=None, cur=None): + def check_config(self, *, check_write): """Check that the storage is configured and ready to go.""" - # Check permissions on one of the tables - if check_write: - check = 'INSERT' - else: - check = 'SELECT' - - cur.execute( - "select has_table_privilege(current_user, 'content_mimetype', %s)", # noqa - (check,) - ) - return cur.fetchone()[0] + ... @remote_api_endpoint('content_mimetype/missing') - @db_transaction_generator() - def content_mimetype_missing(self, mimetypes, db=None, cur=None): + def content_mimetype_missing(self, mimetypes): """Generate mimetypes missing from storage. Args: mimetypes (iterable): iterable of dict with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: tuple (id, indexer_configuration_id): missing id """ - for obj in db.content_mimetype_missing_from_list(mimetypes, cur): - yield obj[0] + ... def _content_get_range(self, content_type, start, end, indexer_configuration_id, limit=1000, - with_textual_data=False, - db=None, cur=None): + with_textual_data=False): """Retrieve ids of type content_type within range [start, end] bound by limit. Args: **content_type** (str): content's type (mimetype, language, etc...) **start** (bytes): Starting identifier range (expected smaller than end) **end** (bytes): Ending identifier range (expected larger than start) **indexer_configuration_id** (int): The tool used to index data **limit** (int): Limit result (default to 1000) **with_textual_data** (bool): Deal with only textual content (True) or all content (all contents by defaults, False) Raises: ValueError for; - limit to None - wrong content_type provided Returns: a dict with keys: - **ids** [bytes]: iterable of content ids within the range. - **next** (Optional[bytes]): The next range of sha1 starts at this sha1 if any """ - if limit is None: - raise ValueError('Development error: limit should not be None') - if content_type not in db.content_indexer_names: - err = 'Development error: Wrong type. Should be one of [%s]' % ( - ','.join(db.content_indexer_names)) - raise ValueError(err) - - ids = [] - next_id = None - for counter, obj in enumerate(db.content_get_range( - content_type, start, end, indexer_configuration_id, - limit=limit+1, with_textual_data=with_textual_data, cur=cur)): - _id = obj[0] - if counter >= limit: - next_id = _id - break - - ids.append(_id) - - return { - 'ids': ids, - 'next': next_id - } + ... 
@remote_api_endpoint('content_mimetype/range') - @db_transaction() def content_mimetype_get_range(self, start, end, indexer_configuration_id, - limit=1000, db=None, cur=None): + limit=1000): """Retrieve mimetypes within range [start, end] bound by limit. Args: **start** (bytes): Starting identifier range (expected smaller than end) **end** (bytes): Ending identifier range (expected larger than start) **indexer_configuration_id** (int): The tool used to index data **limit** (int): Limit result (default to 1000) Raises: ValueError for limit to None Returns: a dict with keys: - **ids** [bytes]: iterable of content ids within the range. - **next** (Optional[bytes]): The next range of sha1 starts at this sha1 if any """ - return self._content_get_range('mimetype', start, end, - indexer_configuration_id, limit=limit, - db=db, cur=cur) + ... @remote_api_endpoint('content_mimetype/add') - @db_transaction() - def content_mimetype_add(self, mimetypes, conflict_update=False, db=None, - cur=None): + def content_mimetype_add(self, mimetypes, conflict_update=False): """Add mimetypes not present in storage. Args: mimetypes (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **mimetype** (bytes): raw content's mimetype - **encoding** (bytes): raw content's encoding - **indexer_configuration_id** (int): tool's id used to compute the results - **conflict_update** (bool): Flag to determine if we want to overwrite (``True``) or skip duplicates (``False``, the default) """ - _check_id_duplicates(mimetypes) - mimetypes.sort(key=lambda m: m['id']) - db.mktemp_content_mimetype(cur) - db.copy_to(mimetypes, 'tmp_content_mimetype', - ['id', 'mimetype', 'encoding', 'indexer_configuration_id'], - cur) - db.content_mimetype_add_from_temp(conflict_update, cur) + ... @remote_api_endpoint('content_mimetype') - @db_transaction_generator() - def content_mimetype_get(self, ids, db=None, cur=None): + def content_mimetype_get(self, ids): """Retrieve full content mimetype per ids. Args: ids (iterable): sha1 identifier Yields: mimetypes (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **mimetype** (bytes): raw content's mimetype - **encoding** (bytes): raw content's encoding - **tool** (dict): Tool used to compute the language """ - for c in db.content_mimetype_get_from_list(ids, cur): - yield converters.db_to_mimetype( - dict(zip(db.content_mimetype_cols, c))) + ... @remote_api_endpoint('content_language/missing') - @db_transaction_generator() - def content_language_missing(self, languages, db=None, cur=None): + def content_language_missing(self, languages): """List languages missing from storage. Args: languages (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: an iterable of missing id for the tuple (id, indexer_configuration_id) """ - for obj in db.content_language_missing_from_list(languages, cur): - yield obj[0] + ... @remote_api_endpoint('content_language') - @db_transaction_generator() - def content_language_get(self, ids, db=None, cur=None): + def content_language_get(self, ids): """Retrieve full content language per ids. Args: ids (iterable): sha1 identifier Yields: languages (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **lang** (bytes): raw content's language - **tool** (dict): Tool used to compute the language """ - for c in db.content_language_get_from_list(ids, cur): - yield converters.db_to_language( - dict(zip(db.content_language_cols, c))) + ... 
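# Sketch of a caller walking content_mimetype_get_range with the documented
# {'ids', 'next'} contract: 'next' is the sha1 the following page starts at,
# or None once the range is exhausted. storage, start, end and tool_id are
# assumed to be supplied by the caller.
def iter_mimetype_ids(storage, start, end, tool_id, batch_size=1000):
    while start is not None:
        page = storage.content_mimetype_get_range(
            start, end, indexer_configuration_id=tool_id, limit=batch_size)
        yield from page['ids']
        start = page['next']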
@remote_api_endpoint('content_language/add') - @db_transaction() - def content_language_add(self, languages, conflict_update=False, db=None, - cur=None): + def content_language_add(self, languages, conflict_update=False): """Add languages not present in storage. Args: languages (iterable): dictionaries with keys: - **id** (bytes): sha1 - **lang** (bytes): language detected conflict_update (bool): Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ - _check_id_duplicates(languages) - languages.sort(key=lambda m: m['id']) - db.mktemp_content_language(cur) - # empty language is mapped to 'unknown' - db.copy_to( - ({ - 'id': l['id'], - 'lang': 'unknown' if not l['lang'] else l['lang'], - 'indexer_configuration_id': l['indexer_configuration_id'], - } for l in languages), - 'tmp_content_language', - ['id', 'lang', 'indexer_configuration_id'], cur) - - db.content_language_add_from_temp(conflict_update, cur) + ... @remote_api_endpoint('content/ctags/missing') - @db_transaction_generator() - def content_ctags_missing(self, ctags, db=None, cur=None): + def content_ctags_missing(self, ctags): """List ctags missing from storage. Args: ctags (iterable): dicts with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: an iterable of missing id for the tuple (id, indexer_configuration_id) """ - for obj in db.content_ctags_missing_from_list(ctags, cur): - yield obj[0] + ... @remote_api_endpoint('content/ctags') - @db_transaction_generator() - def content_ctags_get(self, ids, db=None, cur=None): + def content_ctags_get(self, ids): """Retrieve ctags per id. Args: ids (iterable): sha1 checksums Yields: Dictionaries with keys: - **id** (bytes): content's identifier - **name** (str): symbol's name - **kind** (str): symbol's kind - **lang** (str): language for that content - **tool** (dict): tool used to compute the ctags' info """ - for c in db.content_ctags_get_from_list(ids, cur): - yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, c))) + ... @remote_api_endpoint('content/ctags/add') - @db_transaction() - def content_ctags_add(self, ctags, conflict_update=False, db=None, - cur=None): + def content_ctags_add(self, ctags, conflict_update=False): """Add ctags not present in storage Args: ctags (iterable): dictionaries with keys: - **id** (bytes): sha1 - **ctags** ([list): List of dictionary with keys: name, kind, line, lang """ - _check_id_duplicates(ctags) - ctags.sort(key=lambda m: m['id']) - - def _convert_ctags(__ctags): - """Convert ctags dict to list of ctags. - - """ - for ctags in __ctags: - yield from converters.ctags_to_db(ctags) - - db.mktemp_content_ctags(cur) - db.copy_to(list(_convert_ctags(ctags)), - tblname='tmp_content_ctags', - columns=['id', 'name', 'kind', 'line', - 'lang', 'indexer_configuration_id'], - cur=cur) - - db.content_ctags_add_from_temp(conflict_update, cur) + ... @remote_api_endpoint('content/ctags/search') - @db_transaction_generator() def content_ctags_search(self, expression, - limit=10, last_sha1=None, db=None, cur=None): + limit=10, last_sha1=None): """Search through content's raw ctags symbols. Args: expression (str): Expression to search for limit (int): Number of rows to return (default to 10). last_sha1 (str): Offset from which retrieving data (default to ''). Yields: rows of ctags including id, name, lang, kind, line, etc... 
""" - for obj in db.content_ctags_search(expression, last_sha1, limit, - cur=cur): - yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, obj))) + ... @remote_api_endpoint('content/fossology_license') - @db_transaction_generator() - def content_fossology_license_get(self, ids, db=None, cur=None): + def content_fossology_license_get(self, ids): """Retrieve licenses per id. Args: ids (iterable): sha1 checksums Yields: dict: ``{id: facts}`` where ``facts`` is a dict with the following keys: - **licenses** ([str]): associated licenses for that content - **tool** (dict): Tool used to compute the license """ - d = defaultdict(list) - for c in db.content_fossology_license_get_from_list(ids, cur): - license = dict(zip(db.content_fossology_license_cols, c)) - - id_ = license['id'] - d[id_].append(converters.db_to_fossology_license(license)) - - for id_, facts in d.items(): - yield {id_: facts} + ... @remote_api_endpoint('content/fossology_license/add') - @db_transaction() - def content_fossology_license_add(self, licenses, conflict_update=False, - db=None, cur=None): + def content_fossology_license_add(self, licenses, conflict_update=False): """Add licenses not present in storage. Args: licenses (iterable): dictionaries with keys: - **id**: sha1 - **licenses** ([bytes]): List of licenses associated to sha1 - **tool** (str): nomossa conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) Returns: list: content_license entries which failed due to unknown licenses """ - _check_id_duplicates(licenses) - licenses.sort(key=lambda m: m['id']) - db.mktemp_content_fossology_license(cur) - db.copy_to( - ({ - 'id': sha1['id'], - 'indexer_configuration_id': sha1['indexer_configuration_id'], - 'license': license, - } for sha1 in licenses - for license in sha1['licenses']), - tblname='tmp_content_fossology_license', - columns=['id', 'license', 'indexer_configuration_id'], - cur=cur) - db.content_fossology_license_add_from_temp(conflict_update, cur) + ... @remote_api_endpoint('content/fossology_license/range') - @db_transaction() def content_fossology_license_get_range( self, start, end, indexer_configuration_id, - limit=1000, db=None, cur=None): + limit=1000): """Retrieve licenses within range [start, end] bound by limit. Args: **start** (bytes): Starting identifier range (expected smaller than end) **end** (bytes): Ending identifier range (expected larger than start) **indexer_configuration_id** (int): The tool used to index data **limit** (int): Limit result (default to 1000) Raises: ValueError for limit to None Returns: a dict with keys: - **ids** [bytes]: iterable of content ids within the range. - **next** (Optional[bytes]): The next range of sha1 starts at this sha1 if any """ - return self._content_get_range('fossology_license', start, end, - indexer_configuration_id, limit=limit, - with_textual_data=True, db=db, cur=cur) + ... @remote_api_endpoint('content_metadata/missing') - @db_transaction_generator() - def content_metadata_missing(self, metadata, db=None, cur=None): + def content_metadata_missing(self, metadata): """List metadata missing from storage. Args: metadata (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: missing sha1s """ - for obj in db.content_metadata_missing_from_list(metadata, cur): - yield obj[0] + ... 
@remote_api_endpoint('content_metadata') - @db_transaction_generator() - def content_metadata_get(self, ids, db=None, cur=None): + def content_metadata_get(self, ids): """Retrieve metadata per id. Args: ids (iterable): sha1 checksums Yields: dictionaries with the following keys: id (bytes) metadata (str): associated metadata tool (dict): tool used to compute metadata """ - for c in db.content_metadata_get_from_list(ids, cur): - yield converters.db_to_metadata( - dict(zip(db.content_metadata_cols, c))) + ... @remote_api_endpoint('content_metadata/add') - @db_transaction() - def content_metadata_add(self, metadata, conflict_update=False, db=None, - cur=None): + def content_metadata_add(self, metadata, conflict_update=False): """Add metadata not present in storage. Args: metadata (iterable): dictionaries with keys: - **id**: sha1 - **metadata**: arbitrary dict conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ - _check_id_duplicates(metadata) - metadata.sort(key=lambda m: m['id']) - - db.mktemp_content_metadata(cur) - - db.copy_to(metadata, 'tmp_content_metadata', - ['id', 'metadata', 'indexer_configuration_id'], - cur) - db.content_metadata_add_from_temp(conflict_update, cur) + ... @remote_api_endpoint('revision_intrinsic_metadata/missing') - @db_transaction_generator() - def revision_intrinsic_metadata_missing(self, metadata, db=None, cur=None): + def revision_intrinsic_metadata_missing(self, metadata): """List metadata missing from storage. Args: metadata (iterable): dictionaries with keys: - **id** (bytes): sha1_git revision identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: missing ids """ - for obj in db.revision_intrinsic_metadata_missing_from_list( - metadata, cur): - yield obj[0] + ... @remote_api_endpoint('revision_intrinsic_metadata') - @db_transaction_generator() - def revision_intrinsic_metadata_get(self, ids, db=None, cur=None): + def revision_intrinsic_metadata_get(self, ids): """Retrieve revision metadata per id. Args: ids (iterable): sha1 checksums Yields: : dictionaries with the following keys: - **id** (bytes) - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate these metadata """ - for c in db.revision_intrinsic_metadata_get_from_list(ids, cur): - yield converters.db_to_metadata( - dict(zip(db.revision_intrinsic_metadata_cols, c))) + ... @remote_api_endpoint('revision_intrinsic_metadata/add') - @db_transaction() - def revision_intrinsic_metadata_add(self, metadata, conflict_update=False, - db=None, cur=None): + def revision_intrinsic_metadata_add(self, metadata, conflict_update=False): """Add metadata not present in storage. Args: metadata (iterable): dictionaries with keys: - **id**: sha1_git of revision - **metadata**: arbitrary dict - **indexer_configuration_id**: tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate these metadata conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ - _check_id_duplicates(metadata) - metadata.sort(key=lambda m: m['id']) - - db.mktemp_revision_intrinsic_metadata(cur) - - db.copy_to(metadata, 'tmp_revision_intrinsic_metadata', - ['id', 'metadata', 'mappings', - 'indexer_configuration_id'], - cur) - db.revision_intrinsic_metadata_add_from_temp(conflict_update, cur) + ... 
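# Sketch of the conflict_update flag shared by the *_add endpoints, following
# the docstring above: an existing entry for the same id is kept by default
# and only overwritten when conflict_update=True. storage, tool_id and rev_id
# are assumed to exist already.
entry_v1 = {'id': rev_id, 'metadata': {'name': 'v1'}, 'mappings': [],
            'indexer_configuration_id': tool_id}
entry_v2 = dict(entry_v1, metadata={'name': 'v2'})

storage.revision_intrinsic_metadata_add([entry_v1])
storage.revision_intrinsic_metadata_add([entry_v2])   # duplicate id: skipped
storage.revision_intrinsic_metadata_add([entry_v2], conflict_update=True)  # overwrites v1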
@remote_api_endpoint('revision_intrinsic_metadata/delete') - @db_transaction() - def revision_intrinsic_metadata_delete(self, entries, db=None, cur=None): + def revision_intrinsic_metadata_delete(self, entries): """Remove revision metadata from the storage. Args: entries (dict): dictionaries with the following keys: - **id** (bytes): revision identifier - **indexer_configuration_id** (int): tool used to compute metadata """ - db.revision_intrinsic_metadata_delete(entries, cur) + ... @remote_api_endpoint('origin_intrinsic_metadata') - @db_transaction_generator() - def origin_intrinsic_metadata_get(self, ids, db=None, cur=None): + def origin_intrinsic_metadata_get(self, ids): """Retrieve origin metadata per id. Args: ids (iterable): origin identifiers Yields: list: dictionaries with the following keys: - **id** (str): origin url - **from_revision** (bytes): which revision this metadata was extracted from - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate these metadata """ - for c in db.origin_intrinsic_metadata_get_from_list(ids, cur): - yield converters.db_to_metadata( - dict(zip(db.origin_intrinsic_metadata_cols, c))) + ... @remote_api_endpoint('origin_intrinsic_metadata/add') - @db_transaction() def origin_intrinsic_metadata_add(self, metadata, - conflict_update=False, db=None, - cur=None): + conflict_update=False): """Add origin metadata not present in storage. Args: metadata (iterable): dictionaries with keys: - **id**: origin urls - **from_revision**: sha1 id of the revision used to generate these metadata. - **metadata**: arbitrary dict - **indexer_configuration_id**: tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate these metadata conflict_update: Flag to determine if we want to overwrite (true) or skip duplicates (false, the default) """ - _check_id_duplicates(metadata) - metadata.sort(key=lambda m: m['id']) - - db.mktemp_origin_intrinsic_metadata(cur) - - db.copy_to(metadata, 'tmp_origin_intrinsic_metadata', - ['id', 'metadata', - 'indexer_configuration_id', - 'from_revision', 'mappings'], - cur) - db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur) + ... @remote_api_endpoint('origin_intrinsic_metadata/delete') - @db_transaction() def origin_intrinsic_metadata_delete( - self, entries, db=None, cur=None): + self, entries): """Remove origin metadata from the storage. Args: entries (dict): dictionaries with the following keys: - **id** (str): origin urls - **indexer_configuration_id** (int): tool used to compute metadata """ - db.origin_intrinsic_metadata_delete(entries, cur) + ... @remote_api_endpoint('origin_intrinsic_metadata/search/fulltext') - @db_transaction_generator() def origin_intrinsic_metadata_search_fulltext( - self, conjunction, limit=100, db=None, cur=None): + self, conjunction, limit=100): """Returns the list of origins whose metadata contain all the terms. Args: conjunction (List[str]): List of terms to be searched for. limit (int): The maximum number of results to return Yields: list: dictionaries with the following keys: - **id** (str): origin urls - **from_revision**: sha1 id of the revision used to generate these metadata. 
- **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate these metadata """ - for c in db.origin_intrinsic_metadata_search_fulltext( - conjunction, limit=limit, cur=cur): - yield converters.db_to_metadata( - dict(zip(db.origin_intrinsic_metadata_cols, c))) + ... @remote_api_endpoint('origin_intrinsic_metadata/search/by_producer') - @db_transaction() def origin_intrinsic_metadata_search_by_producer( self, page_token='', limit=100, ids_only=False, - mappings=None, tool_ids=None, - db=None, cur=None): + mappings=None, tool_ids=None): """Returns the list of origins whose metadata contain all the terms. Args: page_token (str): Opaque token used for pagination. limit (int): The maximum number of results to return ids_only (bool): Determines whether only origin urls are returned or the content as well mappings (List[str]): Returns origins whose intrinsic metadata were generated using at least one of these mappings. Returns: dict: dict with the following keys: - **next_page_token** (str, optional): opaque token to be used as `page_token` for retrieving the next page. If absent, there is no more pages to gather. - **origins** (list): list of origin url (str) if `ids_only=True` else dictionaries with the following keys: - **id** (str): origin urls - **from_revision**: sha1 id of the revision used to generate these metadata. - **metadata** (str): associated metadata - **tool** (dict): tool used to compute metadata - **mappings** (List[str]): list of mappings used to translate these metadata """ - assert isinstance(page_token, str) - # we go to limit+1 to check whether we should add next_page_token in - # the response - res = db.origin_intrinsic_metadata_search_by_producer( - page_token, limit + 1, ids_only, mappings, tool_ids, cur) - result = {} - if ids_only: - result['origins'] = [origin for (origin,) in res] - if len(result['origins']) > limit: - result['origins'][limit:] = [] - result['next_page_token'] = result['origins'][-1] - else: - result['origins'] = [converters.db_to_metadata( - dict(zip(db.origin_intrinsic_metadata_cols, c)))for c in res] - if len(result['origins']) > limit: - result['origins'][limit:] = [] - result['next_page_token'] = result['origins'][-1]['id'] - return result + ... @remote_api_endpoint('origin_intrinsic_metadata/stats') - @db_transaction() def origin_intrinsic_metadata_stats( - self, db=None, cur=None): + self): """Returns counts of indexed metadata per origins, broken down into metadata types. Returns: dict: dictionary with keys: - total (int): total number of origins that were indexed (possibly yielding an empty metadata dictionary) - non_empty (int): total number of origins that we extracted a non-empty metadata dictionary from - per_mapping (dict): a dictionary with mapping names as keys and number of origins whose indexing used this mapping. Note that indexing a given origin may use 0, 1, or many mappings. 
""" - mapping_names = [m for m in MAPPING_NAMES] - select_parts = [] - - # Count rows for each mapping - for mapping_name in mapping_names: - select_parts.append(( - "sum(case when (mappings @> ARRAY['%s']) " - " then 1 else 0 end)" - ) % mapping_name) - - # Total - select_parts.append("sum(1)") - - # Rows whose metadata has at least one key that is not '@context' - select_parts.append( - "sum(case when ('{}'::jsonb @> (metadata - '@context')) " - " then 0 else 1 end)") - cur.execute('select ' + ', '.join(select_parts) - + ' from origin_intrinsic_metadata') - results = dict(zip(mapping_names + ['total', 'non_empty'], - cur.fetchone())) - return { - 'total': results.pop('total'), - 'non_empty': results.pop('non_empty'), - 'per_mapping': results, - } + ... @remote_api_endpoint('indexer_configuration/add') - @db_transaction_generator() - def indexer_configuration_add(self, tools, db=None, cur=None): + def indexer_configuration_add(self, tools): """Add new tools to the storage. Args: tools ([dict]): List of dictionary representing tool to insert in the db. Dictionary with the following keys: - **tool_name** (str): tool's name - **tool_version** (str): tool's version - **tool_configuration** (dict): tool's configuration (free form dict) Returns: List of dict inserted in the db (holding the id key as well). The order of the list is not guaranteed to match the order of the initial list. """ - db.mktemp_indexer_configuration(cur) - db.copy_to(tools, 'tmp_indexer_configuration', - ['tool_name', 'tool_version', 'tool_configuration'], - cur) - - tools = db.indexer_configuration_add_from_temp(cur) - for line in tools: - yield dict(zip(db.indexer_configuration_cols, line)) + ... @remote_api_endpoint('indexer_configuration/data') - @db_transaction() - def indexer_configuration_get(self, tool, db=None, cur=None): + def indexer_configuration_get(self, tool): """Retrieve tool information. Args: tool (dict): Dictionary representing a tool with the following keys: - **tool_name** (str): tool's name - **tool_version** (str): tool's version - **tool_configuration** (dict): tool's configuration (free form dict) Returns: The same dictionary with an `id` key, None otherwise. """ - tool_conf = tool['tool_configuration'] - if isinstance(tool_conf, dict): - tool_conf = json.dumps(tool_conf) - idx = db.indexer_configuration_get(tool['tool_name'], - tool['tool_version'], - tool_conf) - if not idx: - return None - return dict(zip(db.indexer_configuration_cols, idx)) + ... diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py index c6dbc94..3218dc2 100644 --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -1,1855 +1,1888 @@ # Copyright (C) 2015-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import inspect import threading + import pytest + from swh.model.hashutil import hash_to_bytes +from swh.indexer.storage.interface import IndexerStorageInterface + def prepare_mimetypes_from(fossology_licenses): """Fossology license needs some consistent data in db to run. 
""" mimetypes = [] for c in fossology_licenses: mimetypes.append({ 'id': c['id'], 'mimetype': 'text/plain', 'encoding': 'utf-8', 'indexer_configuration_id': c['indexer_configuration_id'], }) return mimetypes def endpoint(storage, endpoint_type, endpoint_name): return getattr(storage, endpoint_type + '_' + endpoint_name) class StorageETypeTester: """Base class for testing a series of common behaviour between a bunch of endpoint types supported by an IndexerStorage. This is supposed to be inherited with the following class attributes: - endpoint_type - tool_name - example_data See below for example usage. """ def test_missing(self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data etype = self.endpoint_type tool_id = data.tools[self.tool_name]['id'] # given 2 (hopefully) unknown objects query = [ { 'id': data.sha1_1, 'indexer_configuration_id': tool_id, }, { 'id': data.sha1_2, 'indexer_configuration_id': tool_id, }] # we expect these are both returned by the xxx_missing endpoint actual_missing = endpoint(storage, etype, 'missing')(query) assert list(actual_missing) == [ data.sha1_1, data.sha1_2, ] # now, when we add one of them endpoint(storage, etype, 'add')([{ 'id': data.sha1_2, **self.example_data[0], 'indexer_configuration_id': tool_id, }]) # we expect only the other one returned actual_missing = endpoint(storage, etype, 'missing')(query) assert list(actual_missing) == [data.sha1_1] def test_add__drop_duplicate(self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data etype = self.endpoint_type tool_id = data.tools[self.tool_name]['id'] # add the first object data_v1 = { 'id': data.sha1_2, **self.example_data[0], 'indexer_configuration_id': tool_id, } endpoint(storage, etype, 'add')([data_v1]) # should be able to retrieve it actual_data = list(endpoint(storage, etype, 'get')([data.sha1_2])) expected_data_v1 = [{ 'id': data.sha1_2, **self.example_data[0], 'tool': data.tools[self.tool_name], }] assert actual_data == expected_data_v1 # now if we add a modified version of the same object (same id) data_v2 = data_v1.copy() data_v2.update(self.example_data[1]) endpoint(storage, etype, 'add')([data_v2]) # we expect to retrieve the original data, not the modified one actual_data = list(endpoint(storage, etype, 'get')([data.sha1_2])) assert actual_data == expected_data_v1 def test_add__update_in_place_duplicate( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data etype = self.endpoint_type tool = data.tools[self.tool_name] data_v1 = { 'id': data.sha1_2, **self.example_data[0], 'indexer_configuration_id': tool['id'], } # given endpoint(storage, etype, 'add')([data_v1]) # when actual_data = list(endpoint(storage, etype, 'get')([data.sha1_2])) expected_data_v1 = [{ 'id': data.sha1_2, **self.example_data[0], 'tool': tool, }] # then assert actual_data == expected_data_v1 # given data_v2 = data_v1.copy() data_v2.update(self.example_data[1]) endpoint(storage, etype, 'add')([data_v2], conflict_update=True) actual_data = list(endpoint(storage, etype, 'get')([data.sha1_2])) expected_data_v2 = [{ 'id': data.sha1_2, **self.example_data[1], 'tool': tool, }] # data did change as the v2 was used to overwrite v1 assert actual_data == expected_data_v2 def test_add__update_in_place_deadlock( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data etype = self.endpoint_type tool = data.tools[self.tool_name] hashes = [ hash_to_bytes( '34973274ccef6ab4dfaaf86599792fa9c3fe4{:03d}'.format(i)) for i in 
range(1000)] data_v1 = [ { 'id': hash_, **self.example_data[0], 'indexer_configuration_id': tool['id'], } for hash_ in hashes ] data_v2 = [ { 'id': hash_, **self.example_data[1], 'indexer_configuration_id': tool['id'], } for hash_ in hashes ] # Remove one item from each, so that both queries have to succeed for # all items to be in the DB. data_v2a = data_v2[1:] data_v2b = list(reversed(data_v2[0:-1])) # given endpoint(storage, etype, 'add')(data_v1) # when actual_data = list(endpoint(storage, etype, 'get')(hashes)) expected_data_v1 = [ { 'id': hash_, **self.example_data[0], 'tool': tool, } for hash_ in hashes ] # then assert actual_data == expected_data_v1 # given def f1(): endpoint(storage, etype, 'add')(data_v2a, conflict_update=True) def f2(): endpoint(storage, etype, 'add')(data_v2b, conflict_update=True) t1 = threading.Thread(target=f1) t2 = threading.Thread(target=f2) t2.start() t1.start() t1.join() t2.join() actual_data = sorted(endpoint(storage, etype, 'get')(hashes), key=lambda x: x['id']) expected_data_v2 = [ { 'id': hash_, **self.example_data[1], 'tool': tool, } for hash_ in hashes ] assert actual_data == expected_data_v2 def test_add__duplicate_twice(self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data etype = self.endpoint_type tool = data.tools[self.tool_name] data_rev1 = { 'id': data.revision_id_2, **self.example_data[0], 'indexer_configuration_id': tool['id'] } data_rev2 = { 'id': data.revision_id_2, **self.example_data[1], 'indexer_configuration_id': tool['id'] } # when endpoint(storage, etype, 'add')([data_rev1]) with pytest.raises(ValueError): endpoint(storage, etype, 'add')( [data_rev2, data_rev2], conflict_update=True) # then actual_data = list(endpoint(storage, etype, 'get')( [data.revision_id_2, data.revision_id_1])) expected_data = [{ 'id': data.revision_id_2, **self.example_data[0], 'tool': tool, }] assert actual_data == expected_data def test_get(self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data etype = self.endpoint_type tool = data.tools[self.tool_name] query = [data.sha1_2, data.sha1_1] data1 = { 'id': data.sha1_2, **self.example_data[0], 'indexer_configuration_id': tool['id'], } # when endpoint(storage, etype, 'add')([data1]) # then actual_data = list(endpoint(storage, etype, 'get')(query)) # then expected_data = [{ 'id': data.sha1_2, **self.example_data[0], 'tool': tool, }] assert actual_data == expected_data class TestIndexerStorageContentMimetypes(StorageETypeTester): """Test Indexer Storage content_mimetype related methods """ endpoint_type = 'content_mimetype' tool_name = 'file' example_data = [ { 'mimetype': 'text/plain', 'encoding': 'utf-8', }, { 'mimetype': 'text/html', 'encoding': 'us-ascii', }, ] def test_generate_content_mimetype_get_range_limit_none( self, swh_indexer_storage): """mimetype_get_range call with wrong limit input should fail""" storage = swh_indexer_storage with pytest.raises(ValueError) as e: storage.content_mimetype_get_range( start=None, end=None, indexer_configuration_id=None, limit=None) assert e.value.args == ( 'Development error: limit should not be None',) def test_generate_content_mimetype_get_range_no_limit( self, swh_indexer_storage_with_data): """mimetype_get_range returns mimetypes within range provided""" storage, data = swh_indexer_storage_with_data mimetypes = data.mimetypes # All ids from the db content_ids = sorted([c['id'] for c in mimetypes]) start = content_ids[0] end = content_ids[-1] # retrieve mimetypes tool_id = 
mimetypes[0]['indexer_configuration_id'] actual_result = storage.content_mimetype_get_range( start, end, indexer_configuration_id=tool_id) actual_ids = actual_result['ids'] actual_next = actual_result['next'] assert len(mimetypes) == len(actual_ids) assert actual_next is None assert content_ids == actual_ids def test_generate_content_mimetype_get_range_limit( self, swh_indexer_storage_with_data): """mimetype_get_range paginates results if limit exceeded""" storage, data = swh_indexer_storage_with_data indexer_configuration_id = data.tools['file']['id'] # input the list of sha1s we want from storage content_ids = sorted( [c['id'] for c in data.mimetypes]) mimetypes = list(storage.content_mimetype_get(content_ids)) assert len(mimetypes) == len(data.mimetypes) start = content_ids[0] end = content_ids[-1] # retrieve mimetypes limited to 10 results actual_result = storage.content_mimetype_get_range( start, end, indexer_configuration_id=indexer_configuration_id, limit=10) assert actual_result assert set(actual_result.keys()) == {'ids', 'next'} actual_ids = actual_result['ids'] actual_next = actual_result['next'] assert len(actual_ids) == 10 assert actual_next is not None assert actual_next == content_ids[10] expected_mimetypes = content_ids[:10] assert expected_mimetypes == actual_ids # retrieve next part actual_result = storage.content_mimetype_get_range( start=end, end=end, indexer_configuration_id=indexer_configuration_id) assert set(actual_result.keys()) == {'ids', 'next'} actual_ids = actual_result['ids'] actual_next = actual_result['next'] assert actual_next is None expected_mimetypes = [content_ids[-1]] assert expected_mimetypes == actual_ids class TestIndexerStorageContentLanguage(StorageETypeTester): """Test Indexer Storage content_language related methods """ endpoint_type = 'content_language' tool_name = 'pygments' example_data = [ { 'lang': 'haskell', }, { 'lang': 'common-lisp', }, ] class TestIndexerStorageContentCTags(StorageETypeTester): """Test Indexer Storage content_ctags related methods """ endpoint_type = 'content_ctags' tool_name = 'universal-ctags' example_data = [ { 'ctags': [{ 'name': 'done', 'kind': 'variable', 'line': 119, 'lang': 'OCaml', }] }, { 'ctags': [ { 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Python', }, { 'name': 'main', 'kind': 'function', 'line': 119, 'lang': 'Python', }] }, ] # the following tests are disabled because CTAGS behaves differently @pytest.mark.skip def test_add__drop_duplicate(self): pass @pytest.mark.skip def test_add__update_in_place_duplicate(self): pass @pytest.mark.skip def test_add__update_in_place_deadlock(self): pass @pytest.mark.skip def test_add__duplicate_twice(self): pass @pytest.mark.skip def test_get(self): pass def test_content_ctags_search(self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data # 1. given tool = data.tools['universal-ctags'] tool_id = tool['id'] ctag1 = { 'id': data.sha1_1, 'indexer_configuration_id': tool_id, 'ctags': [ { 'name': 'hello', 'kind': 'function', 'line': 133, 'lang': 'Python', }, { 'name': 'counter', 'kind': 'variable', 'line': 119, 'lang': 'Python', }, { 'name': 'hello', 'kind': 'variable', 'line': 210, 'lang': 'Python', }, ] } ctag2 = { 'id': data.sha1_2, 'indexer_configuration_id': tool_id, 'ctags': [ { 'name': 'hello', 'kind': 'variable', 'line': 100, 'lang': 'C', }, { 'name': 'result', 'kind': 'variable', 'line': 120, 'lang': 'C', }, ] } storage.content_ctags_add([ctag1, ctag2]) # 1. 
when actual_ctags = list(storage.content_ctags_search('hello', limit=1)) # 1. then assert actual_ctags == [ { 'id': ctag1['id'], 'tool': tool, 'name': 'hello', 'kind': 'function', 'line': 133, 'lang': 'Python', } ] # 2. when actual_ctags = list(storage.content_ctags_search( 'hello', limit=1, last_sha1=ctag1['id'])) # 2. then assert actual_ctags == [ { 'id': ctag2['id'], 'tool': tool, 'name': 'hello', 'kind': 'variable', 'line': 100, 'lang': 'C', } ] # 3. when actual_ctags = list(storage.content_ctags_search('hello')) # 3. then assert actual_ctags == [ { 'id': ctag1['id'], 'tool': tool, 'name': 'hello', 'kind': 'function', 'line': 133, 'lang': 'Python', }, { 'id': ctag1['id'], 'tool': tool, 'name': 'hello', 'kind': 'variable', 'line': 210, 'lang': 'Python', }, { 'id': ctag2['id'], 'tool': tool, 'name': 'hello', 'kind': 'variable', 'line': 100, 'lang': 'C', }, ] # 4. when actual_ctags = list(storage.content_ctags_search('counter')) # then assert actual_ctags == [{ 'id': ctag1['id'], 'tool': tool, 'name': 'counter', 'kind': 'variable', 'line': 119, 'lang': 'Python', }] # 5. when actual_ctags = list(storage.content_ctags_search('result', limit=1)) # then assert actual_ctags == [{ 'id': ctag2['id'], 'tool': tool, 'name': 'result', 'kind': 'variable', 'line': 120, 'lang': 'C', }] def test_content_ctags_search_no_result(self, swh_indexer_storage): storage = swh_indexer_storage actual_ctags = list(storage.content_ctags_search('counter')) assert not actual_ctags def test_content_ctags_add__add_new_ctags_added( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data # given tool = data.tools['universal-ctags'] tool_id = tool['id'] ctag_v1 = { 'id': data.sha1_2, 'indexer_configuration_id': tool_id, 'ctags': [{ 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', }] } # given storage.content_ctags_add([ctag_v1]) storage.content_ctags_add([ctag_v1]) # conflict does nothing # when actual_ctags = list(storage.content_ctags_get([data.sha1_2])) # then expected_ctags = [{ 'id': data.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool, }] assert actual_ctags == expected_ctags # given ctag_v2 = ctag_v1.copy() ctag_v2.update({ 'ctags': [ { 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', } ] }) storage.content_ctags_add([ctag_v2]) expected_ctags = [ { 'id': data.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool, }, { 'id': data.sha1_2, 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', 'tool': tool, } ] actual_ctags = list(storage.content_ctags_get( [data.sha1_2])) assert actual_ctags == expected_ctags def test_content_ctags_add__update_in_place( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data # given tool = data.tools['universal-ctags'] tool_id = tool['id'] ctag_v1 = { 'id': data.sha1_2, 'indexer_configuration_id': tool_id, 'ctags': [{ 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', }] } # given storage.content_ctags_add([ctag_v1]) # when actual_ctags = list(storage.content_ctags_get( [data.sha1_2])) # then expected_ctags = [ { 'id': data.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool } ] assert actual_ctags == expected_ctags # given ctag_v2 = ctag_v1.copy() ctag_v2.update({ 'ctags': [ { 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', }, { 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', } ] }) storage.content_ctags_add([ctag_v2], 
conflict_update=True) actual_ctags = list(storage.content_ctags_get( [data.sha1_2])) # ctag did change as the v2 was used to overwrite v1 expected_ctags = [ { 'id': data.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool, }, { 'id': data.sha1_2, 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', 'tool': tool, } ] assert actual_ctags == expected_ctags class TestIndexerStorageContentMetadata(StorageETypeTester): """Test Indexer Storage content_metadata related methods """ tool_name = 'swh-metadata-detector' endpoint_type = 'content_metadata' example_data = [ { 'metadata': { 'other': {}, 'codeRepository': { 'type': 'git', 'url': 'https://github.com/moranegg/metadata_test' }, 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, }, { 'metadata': { 'other': {}, 'name': 'test_metadata', 'version': '0.0.1' }, }, ] class TestIndexerStorageRevisionIntrinsicMetadata(StorageETypeTester): """Test Indexer Storage revision_intrinsic_metadata related methods """ tool_name = 'swh-metadata-detector' endpoint_type = 'revision_intrinsic_metadata' example_data = [ { 'metadata': { 'other': {}, 'codeRepository': { 'type': 'git', 'url': 'https://github.com/moranegg/metadata_test' }, 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, 'mappings': ['mapping1'], }, { 'metadata': { 'other': {}, 'name': 'test_metadata', 'version': '0.0.1' }, 'mappings': ['mapping2'], }, ] def test_revision_intrinsic_metadata_delete( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data etype = self.endpoint_type tool = data.tools[self.tool_name] query = [data.sha1_2, data.sha1_1] data1 = { 'id': data.sha1_2, **self.example_data[0], 'indexer_configuration_id': tool['id'], } # when endpoint(storage, etype, 'add')([data1]) endpoint(storage, etype, 'delete')([ { 'id': data.sha1_2, 'indexer_configuration_id': tool['id'], } ]) # then actual_data = list(endpoint(storage, etype, 'get')(query)) # then assert not actual_data def test_revision_intrinsic_metadata_delete_nonexisting( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data etype = self.endpoint_type tool = data.tools[self.tool_name] endpoint(storage, etype, 'delete')([ { 'id': data.sha1_2, 'indexer_configuration_id': tool['id'], } ]) class TestIndexerStorageContentFossologyLicence: def test_content_fossology_license_add__new_license_added( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data # given tool = data.tools['nomos'] tool_id = tool['id'] license_v1 = { 'id': data.sha1_1, 'licenses': ['Apache-2.0'], 'indexer_configuration_id': tool_id, } # given storage.content_fossology_license_add([license_v1]) # conflict does nothing storage.content_fossology_license_add([license_v1]) # when actual_licenses = list(storage.content_fossology_license_get( [data.sha1_1])) # then expected_license = { data.sha1_1: [{ 'licenses': ['Apache-2.0'], 'tool': tool, }] } assert actual_licenses == [expected_license] # given license_v2 = license_v1.copy() license_v2.update({ 'licenses': ['BSD-2-Clause'], }) storage.content_fossology_license_add([license_v2]) actual_licenses = list(storage.content_fossology_license_get( [data.sha1_1])) expected_license = { data.sha1_1: [{ 'licenses': ['Apache-2.0', 'BSD-2-Clause'], 'tool': tool }] } # license did not change as the v2 was dropped. 
assert actual_licenses == [expected_license] def test_generate_content_fossology_license_get_range_limit_none( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data """license_get_range call with wrong limit input should fail""" with pytest.raises(ValueError) as e: storage.content_fossology_license_get_range( start=None, end=None, indexer_configuration_id=None, limit=None) assert e.value.args == ( 'Development error: limit should not be None',) def test_generate_content_fossology_license_get_range_no_limit( self, swh_indexer_storage_with_data): """license_get_range returns licenses within range provided""" storage, data = swh_indexer_storage_with_data # craft some consistent mimetypes fossology_licenses = data.fossology_licenses mimetypes = prepare_mimetypes_from(fossology_licenses) storage.content_mimetype_add(mimetypes, conflict_update=True) # add fossology_licenses to storage storage.content_fossology_license_add(fossology_licenses) # All ids from the db content_ids = sorted([c['id'] for c in fossology_licenses]) start = content_ids[0] end = content_ids[-1] # retrieve fossology_licenses tool_id = fossology_licenses[0]['indexer_configuration_id'] actual_result = storage.content_fossology_license_get_range( start, end, indexer_configuration_id=tool_id) actual_ids = actual_result['ids'] actual_next = actual_result['next'] assert len(fossology_licenses) == len(actual_ids) assert actual_next is None assert content_ids == actual_ids def test_generate_content_fossology_license_get_range_no_limit_with_filter( self, swh_indexer_storage_with_data): """This filters non textual, then returns results within range""" storage, data = swh_indexer_storage_with_data fossology_licenses = data.fossology_licenses mimetypes = data.mimetypes # craft some consistent mimetypes _mimetypes = prepare_mimetypes_from(fossology_licenses) # add binary mimetypes which will get filtered out in results for m in mimetypes: _mimetypes.append({ 'mimetype': 'binary', **m, }) storage.content_mimetype_add(_mimetypes, conflict_update=True) # add fossology_licenses to storage storage.content_fossology_license_add(fossology_licenses) # All ids from the db content_ids = sorted([c['id'] for c in fossology_licenses]) start = content_ids[0] end = content_ids[-1] # retrieve fossology_licenses tool_id = fossology_licenses[0]['indexer_configuration_id'] actual_result = storage.content_fossology_license_get_range( start, end, indexer_configuration_id=tool_id) actual_ids = actual_result['ids'] actual_next = actual_result['next'] assert len(fossology_licenses) == len(actual_ids) assert actual_next is None assert content_ids == actual_ids def test_generate_fossology_license_get_range_limit( self, swh_indexer_storage_with_data): """fossology_license_get_range paginates results if limit exceeded""" storage, data = swh_indexer_storage_with_data fossology_licenses = data.fossology_licenses # craft some consistent mimetypes mimetypes = prepare_mimetypes_from(fossology_licenses) # add fossology_licenses to storage storage.content_mimetype_add(mimetypes, conflict_update=True) storage.content_fossology_license_add(fossology_licenses) # input the list of sha1s we want from storage content_ids = sorted([c['id'] for c in fossology_licenses]) start = content_ids[0] end = content_ids[-1] # retrieve fossology_licenses limited to 3 results limited_results = len(fossology_licenses) - 1 tool_id = fossology_licenses[0]['indexer_configuration_id'] actual_result = storage.content_fossology_license_get_range( start, end, 
indexer_configuration_id=tool_id, limit=limited_results) actual_ids = actual_result['ids'] actual_next = actual_result['next'] assert limited_results == len(actual_ids) assert actual_next is not None assert actual_next == content_ids[-1] expected_fossology_licenses = content_ids[:-1] assert expected_fossology_licenses == actual_ids # retrieve next part actual_results2 = storage.content_fossology_license_get_range( start=end, end=end, indexer_configuration_id=tool_id) actual_ids2 = actual_results2['ids'] actual_next2 = actual_results2['next'] assert actual_next2 is None expected_fossology_licenses2 = [content_ids[-1]] assert expected_fossology_licenses2 == actual_ids2 class TestIndexerStorageOriginIntrinsicMetadata: def test_origin_intrinsic_metadata_get( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data # given tool_id = data.tools['swh-metadata-detector']['id'] metadata = { 'version': None, 'name': None, } metadata_rev = { 'id': data.revision_id_2, 'metadata': metadata, 'mappings': ['mapping1'], 'indexer_configuration_id': tool_id, } metadata_origin = { 'id': data.origin_url_1, 'metadata': metadata, 'indexer_configuration_id': tool_id, 'mappings': ['mapping1'], 'from_revision': data.revision_id_2, } # when storage.revision_intrinsic_metadata_add([metadata_rev]) storage.origin_intrinsic_metadata_add([metadata_origin]) # then actual_metadata = list(storage.origin_intrinsic_metadata_get( [data.origin_url_1, 'no://where'])) expected_metadata = [{ 'id': data.origin_url_1, 'metadata': metadata, 'tool': data.tools['swh-metadata-detector'], 'from_revision': data.revision_id_2, 'mappings': ['mapping1'], }] assert actual_metadata == expected_metadata def test_origin_intrinsic_metadata_delete( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data # given tool_id = data.tools['swh-metadata-detector']['id'] metadata = { 'version': None, 'name': None, } metadata_rev = { 'id': data.revision_id_2, 'metadata': metadata, 'mappings': ['mapping1'], 'indexer_configuration_id': tool_id, } metadata_origin = { 'id': data.origin_url_1, 'metadata': metadata, 'indexer_configuration_id': tool_id, 'mappings': ['mapping1'], 'from_revision': data.revision_id_2, } metadata_origin2 = metadata_origin.copy() metadata_origin2['id'] = data.origin_url_2 # when storage.revision_intrinsic_metadata_add([metadata_rev]) storage.origin_intrinsic_metadata_add([ metadata_origin, metadata_origin2]) storage.origin_intrinsic_metadata_delete([ { 'id': data.origin_url_1, 'indexer_configuration_id': tool_id } ]) # then actual_metadata = list(storage.origin_intrinsic_metadata_get( [data.origin_url_1, data.origin_url_2, 'no://where'])) for item in actual_metadata: item['indexer_configuration_id'] = item.pop('tool')['id'] assert actual_metadata == [metadata_origin2] def test_origin_intrinsic_metadata_delete_nonexisting( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data tool_id = data.tools['swh-metadata-detector']['id'] storage.origin_intrinsic_metadata_delete([ { 'id': data.origin_url_1, 'indexer_configuration_id': tool_id } ]) def test_origin_intrinsic_metadata_add_drop_duplicate( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data # given tool_id = data.tools['swh-metadata-detector']['id'] metadata_v1 = { 'version': None, 'name': None, } metadata_rev_v1 = { 'id': data.revision_id_1, 'metadata': metadata_v1.copy(), 'mappings': [], 'indexer_configuration_id': tool_id, } metadata_origin_v1 = { 'id': 
data.origin_url_1, 'metadata': metadata_v1.copy(), 'indexer_configuration_id': tool_id, 'mappings': [], 'from_revision': data.revision_id_1, } # given storage.revision_intrinsic_metadata_add([metadata_rev_v1]) storage.origin_intrinsic_metadata_add([metadata_origin_v1]) # when actual_metadata = list(storage.origin_intrinsic_metadata_get( [data.origin_url_1, 'no://where'])) expected_metadata_v1 = [{ 'id': data.origin_url_1, 'metadata': metadata_v1, 'tool': data.tools['swh-metadata-detector'], 'from_revision': data.revision_id_1, 'mappings': [], }] assert actual_metadata == expected_metadata_v1 # given metadata_v2 = metadata_v1.copy() metadata_v2.update({ 'name': 'test_metadata', 'author': 'MG', }) metadata_rev_v2 = metadata_rev_v1.copy() metadata_origin_v2 = metadata_origin_v1.copy() metadata_rev_v2['metadata'] = metadata_v2 metadata_origin_v2['metadata'] = metadata_v2 storage.revision_intrinsic_metadata_add([metadata_rev_v2]) storage.origin_intrinsic_metadata_add([metadata_origin_v2]) # then actual_metadata = list(storage.origin_intrinsic_metadata_get( [data.origin_url_1])) # metadata did not change as the v2 was dropped. assert actual_metadata == expected_metadata_v1 def test_origin_intrinsic_metadata_add_update_in_place_duplicate( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data # given tool_id = data.tools['swh-metadata-detector']['id'] metadata_v1 = { 'version': None, 'name': None, } metadata_rev_v1 = { 'id': data.revision_id_2, 'metadata': metadata_v1, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata_origin_v1 = { 'id': data.origin_url_1, 'metadata': metadata_v1.copy(), 'indexer_configuration_id': tool_id, 'mappings': [], 'from_revision': data.revision_id_2, } # given storage.revision_intrinsic_metadata_add([metadata_rev_v1]) storage.origin_intrinsic_metadata_add([metadata_origin_v1]) # when actual_metadata = list(storage.origin_intrinsic_metadata_get( [data.origin_url_1])) # then expected_metadata_v1 = [{ 'id': data.origin_url_1, 'metadata': metadata_v1, 'tool': data.tools['swh-metadata-detector'], 'from_revision': data.revision_id_2, 'mappings': [], }] assert actual_metadata == expected_metadata_v1 # given metadata_v2 = metadata_v1.copy() metadata_v2.update({ 'name': 'test_update_duplicated_metadata', 'author': 'MG', }) metadata_rev_v2 = metadata_rev_v1.copy() metadata_origin_v2 = metadata_origin_v1.copy() metadata_rev_v2['metadata'] = metadata_v2 metadata_origin_v2 = { 'id': data.origin_url_1, 'metadata': metadata_v2.copy(), 'indexer_configuration_id': tool_id, 'mappings': ['npm'], 'from_revision': data.revision_id_1, } storage.revision_intrinsic_metadata_add( [metadata_rev_v2], conflict_update=True) storage.origin_intrinsic_metadata_add( [metadata_origin_v2], conflict_update=True) actual_metadata = list(storage.origin_intrinsic_metadata_get( [data.origin_url_1])) expected_metadata_v2 = [{ 'id': data.origin_url_1, 'metadata': metadata_v2, 'tool': data.tools['swh-metadata-detector'], 'from_revision': data.revision_id_1, 'mappings': ['npm'], }] # metadata did change as the v2 was used to overwrite v1 assert actual_metadata == expected_metadata_v2 def test_origin_intrinsic_metadata_add__update_in_place_deadlock( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data # given tool_id = data.tools['swh-metadata-detector']['id'] ids = list(range(10)) example_data1 = { 'metadata': { 'version': None, 'name': None, }, 'mappings': [], } example_data2 = { 'metadata': { 'version': 'v1.1.1', 'name': 'foo', }, 
'mappings': [], } metadata_rev_v1 = { 'id': data.revision_id_2, 'metadata': { 'version': None, 'name': None, }, 'mappings': [], 'indexer_configuration_id': tool_id, } data_v1 = [ { 'id': 'file:///tmp/origin%d' % id_, 'from_revision': data.revision_id_2, **example_data1, 'indexer_configuration_id': tool_id, } for id_ in ids ] data_v2 = [ { 'id': 'file:///tmp/origin%d' % id_, 'from_revision': data.revision_id_2, **example_data2, 'indexer_configuration_id': tool_id, } for id_ in ids ] # Remove one item from each, so that both queries have to succeed for # all items to be in the DB. data_v2a = data_v2[1:] data_v2b = list(reversed(data_v2[0:-1])) # given storage.revision_intrinsic_metadata_add([metadata_rev_v1]) storage.origin_intrinsic_metadata_add(data_v1) # when origins = ['file:///tmp/origin%d' % i for i in ids] actual_data = list(storage.origin_intrinsic_metadata_get(origins)) expected_data_v1 = [ { 'id': 'file:///tmp/origin%d' % id_, 'from_revision': data.revision_id_2, **example_data1, 'tool': data.tools['swh-metadata-detector'], } for id_ in ids ] # then assert actual_data == expected_data_v1 # given def f1(): storage.origin_intrinsic_metadata_add( data_v2a, conflict_update=True) def f2(): storage.origin_intrinsic_metadata_add( data_v2b, conflict_update=True) t1 = threading.Thread(target=f1) t2 = threading.Thread(target=f2) t2.start() t1.start() t1.join() t2.join() actual_data = list(storage.origin_intrinsic_metadata_get(origins)) expected_data_v2 = [ { 'id': 'file:///tmp/origin%d' % id_, 'from_revision': data.revision_id_2, **example_data2, 'tool': data.tools['swh-metadata-detector'], } for id_ in ids ] assert len(actual_data) == len(expected_data_v2) assert sorted(actual_data, key=lambda x: x['id']) == expected_data_v2 def test_origin_intrinsic_metadata_add__duplicate_twice( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data # given tool_id = data.tools['swh-metadata-detector']['id'] metadata = { 'developmentStatus': None, 'name': None, } metadata_rev = { 'id': data.revision_id_2, 'metadata': metadata, 'mappings': ['mapping1'], 'indexer_configuration_id': tool_id, } metadata_origin = { 'id': data.origin_url_1, 'metadata': metadata, 'indexer_configuration_id': tool_id, 'mappings': ['mapping1'], 'from_revision': data.revision_id_2, } # when storage.revision_intrinsic_metadata_add([metadata_rev]) with pytest.raises(ValueError): storage.origin_intrinsic_metadata_add([ metadata_origin, metadata_origin]) def test_origin_intrinsic_metadata_search_fulltext( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data # given tool_id = data.tools['swh-metadata-detector']['id'] metadata1 = { 'author': 'John Doe', } metadata1_rev = { 'id': data.revision_id_1, 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata1_origin = { 'id': data.origin_url_1, 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': data.revision_id_1, } metadata2 = { 'author': 'Jane Doe', } metadata2_rev = { 'id': data.revision_id_2, 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata2_origin = { 'id': data.origin_url_2, 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': data.revision_id_2, } # when storage.revision_intrinsic_metadata_add([metadata1_rev]) storage.origin_intrinsic_metadata_add([metadata1_origin]) storage.revision_intrinsic_metadata_add([metadata2_rev]) storage.origin_intrinsic_metadata_add([metadata2_origin]) 
# then search = storage.origin_intrinsic_metadata_search_fulltext assert set([res['id'] for res in search(['Doe'])]) \ == set([data.origin_url_1, data.origin_url_2]) assert [res['id'] for res in search(['John', 'Doe'])] \ == [data.origin_url_1] assert [res['id'] for res in search(['John'])] \ == [data.origin_url_1] assert not list(search(['John', 'Jane'])) def test_origin_intrinsic_metadata_search_fulltext_rank( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data # given tool_id = data.tools['swh-metadata-detector']['id'] # The following authors have "Random Person" to add some more content # to the JSON data, to work around normalization quirks when there # are few words (rank/(1+ln(nb_words)) is very sensitive to nb_words # for small values of nb_words). metadata1 = { 'author': [ 'Random Person', 'John Doe', 'Jane Doe', ] } metadata1_rev = { 'id': data.revision_id_1, 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata1_origin = { 'id': data.origin_url_1, 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': data.revision_id_1, } metadata2 = { 'author': [ 'Random Person', 'Jane Doe', ] } metadata2_rev = { 'id': data.revision_id_2, 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata2_origin = { 'id': data.origin_url_2, 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': data.revision_id_2, } # when storage.revision_intrinsic_metadata_add([metadata1_rev]) storage.origin_intrinsic_metadata_add([metadata1_origin]) storage.revision_intrinsic_metadata_add([metadata2_rev]) storage.origin_intrinsic_metadata_add([metadata2_origin]) # then search = storage.origin_intrinsic_metadata_search_fulltext assert [res['id'] for res in search(['Doe'])] \ == [data.origin_url_1, data.origin_url_2] assert [res['id'] for res in search(['Doe'], limit=1)] \ == [data.origin_url_1] assert [res['id'] for res in search(['John'])] \ == [data.origin_url_1] assert [res['id'] for res in search(['Jane'])] \ == [data.origin_url_2, data.origin_url_1] assert [res['id'] for res in search(['John', 'Jane'])] \ == [data.origin_url_1] def _fill_origin_intrinsic_metadata( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data tool1_id = data.tools['swh-metadata-detector']['id'] tool2_id = data.tools['swh-metadata-detector2']['id'] metadata1 = { '@context': 'foo', 'author': 'John Doe', } metadata1_rev = { 'id': data.revision_id_1, 'metadata': metadata1, 'mappings': ['npm'], 'indexer_configuration_id': tool1_id, } metadata1_origin = { 'id': data.origin_url_1, 'metadata': metadata1, 'mappings': ['npm'], 'indexer_configuration_id': tool1_id, 'from_revision': data.revision_id_1, } metadata2 = { '@context': 'foo', 'author': 'Jane Doe', } metadata2_rev = { 'id': data.revision_id_2, 'metadata': metadata2, 'mappings': ['npm', 'gemspec'], 'indexer_configuration_id': tool2_id, } metadata2_origin = { 'id': data.origin_url_2, 'metadata': metadata2, 'mappings': ['npm', 'gemspec'], 'indexer_configuration_id': tool2_id, 'from_revision': data.revision_id_2, } metadata3 = { '@context': 'foo', } metadata3_rev = { 'id': data.revision_id_3, 'metadata': metadata3, 'mappings': ['npm', 'gemspec'], 'indexer_configuration_id': tool2_id, } metadata3_origin = { 'id': data.origin_url_3, 'metadata': metadata3, 'mappings': ['pkg-info'], 'indexer_configuration_id': tool2_id, 'from_revision': data.revision_id_3, } 
        storage.revision_intrinsic_metadata_add([metadata1_rev])
        storage.origin_intrinsic_metadata_add([metadata1_origin])
        storage.revision_intrinsic_metadata_add([metadata2_rev])
        storage.origin_intrinsic_metadata_add([metadata2_origin])
        storage.revision_intrinsic_metadata_add([metadata3_rev])
        storage.origin_intrinsic_metadata_add([metadata3_origin])

    def test_origin_intrinsic_metadata_search_by_producer(
            self, swh_indexer_storage_with_data):
        storage, data = swh_indexer_storage_with_data
        self._fill_origin_intrinsic_metadata(
            swh_indexer_storage_with_data)
        tool1 = data.tools['swh-metadata-detector']
        tool2 = data.tools['swh-metadata-detector2']
        endpoint = storage.origin_intrinsic_metadata_search_by_producer

        # test pagination
        # no 'page_token' param, return all origins
        result = endpoint(ids_only=True)
        assert result['origins'] \
            == [data.origin_url_1, data.origin_url_2, data.origin_url_3]
        assert 'next_page_token' not in result

        # 'page_token' is < origin_1, return everything
        result = endpoint(page_token=data.origin_url_1[:-1], ids_only=True)
        assert result['origins'] \
            == [data.origin_url_1, data.origin_url_2, data.origin_url_3]
        assert 'next_page_token' not in result

        # 'page_token' is origin_3, return nothing
        result = endpoint(page_token=data.origin_url_3, ids_only=True)
        assert not result['origins']
        assert 'next_page_token' not in result

        # test limit argument
        result = endpoint(page_token=data.origin_url_1[:-1],
                          limit=2, ids_only=True)
        assert result['origins'] == [data.origin_url_1, data.origin_url_2]
        assert result['next_page_token'] == result['origins'][-1]

        result = endpoint(page_token=data.origin_url_1,
                          limit=2, ids_only=True)
        assert result['origins'] == [data.origin_url_2, data.origin_url_3]
        assert 'next_page_token' not in result

        result = endpoint(page_token=data.origin_url_2,
                          limit=2, ids_only=True)
        assert result['origins'] == [data.origin_url_3]
        assert 'next_page_token' not in result

        # test mappings filtering
        result = endpoint(mappings=['npm'], ids_only=True)
        assert result['origins'] == [data.origin_url_1, data.origin_url_2]
        assert 'next_page_token' not in result

        result = endpoint(mappings=['npm', 'gemspec'], ids_only=True)
        assert result['origins'] == [data.origin_url_1, data.origin_url_2]
        assert 'next_page_token' not in result

        result = endpoint(mappings=['gemspec'], ids_only=True)
        assert result['origins'] == [data.origin_url_2]
        assert 'next_page_token' not in result

        result = endpoint(mappings=['pkg-info'], ids_only=True)
        assert result['origins'] == [data.origin_url_3]
        assert 'next_page_token' not in result

        result = endpoint(mappings=['foobar'], ids_only=True)
        assert not result['origins']
        assert 'next_page_token' not in result

        # test pagination + mappings
        result = endpoint(mappings=['npm'], limit=1, ids_only=True)
        assert result['origins'] == [data.origin_url_1]
        assert result['next_page_token'] == result['origins'][-1]

        # test tool filtering
        result = endpoint(tool_ids=[tool1['id']], ids_only=True)
        assert result['origins'] == [data.origin_url_1]
        assert 'next_page_token' not in result

        result = endpoint(tool_ids=[tool2['id']], ids_only=True)
        assert sorted(result['origins']) \
            == [data.origin_url_2, data.origin_url_3]
        assert 'next_page_token' not in result

        result = endpoint(tool_ids=[tool1['id'], tool2['id']], ids_only=True)
        assert sorted(result['origins']) \
            == [data.origin_url_1, data.origin_url_2, data.origin_url_3]
        assert 'next_page_token' not in result

        # test ids_only=False
        assert endpoint(mappings=['gemspec'])['origins'] \
            == [{
                'id': data.origin_url_2,
                'metadata': {
                    '@context': 'foo',
                    'author': 'Jane Doe',
                },
                'mappings': ['npm', 'gemspec'],
                'tool': tool2,
                'from_revision': data.revision_id_2,
            }]

    def test_origin_intrinsic_metadata_stats(
            self, swh_indexer_storage_with_data):
        storage, data = swh_indexer_storage_with_data
        self._fill_origin_intrinsic_metadata(
            swh_indexer_storage_with_data)

        result = storage.origin_intrinsic_metadata_stats()

        assert result == {
            'per_mapping': {
                'gemspec': 1,
                'npm': 2,
                'pkg-info': 1,
                'codemeta': 0,
                'maven': 0,
            },
            'total': 3,
            'non_empty': 2,
        }


class TestIndexerStorageIndexerConfiguration:
    def test_indexer_configuration_add(
            self, swh_indexer_storage_with_data):
        storage, data = swh_indexer_storage_with_data
        tool = {
            'tool_name': 'some-unknown-tool',
            'tool_version': 'some-version',
            'tool_configuration': {"debian-package": "some-package"},
        }
        actual_tool = storage.indexer_configuration_get(tool)
        assert actual_tool is None  # does not exist

        # add it
        actual_tools = list(storage.indexer_configuration_add([tool]))

        assert len(actual_tools) == 1
        actual_tool = actual_tools[0]
        assert actual_tool is not None  # now it exists
        new_id = actual_tool.pop('id')
        assert actual_tool == tool

        actual_tools2 = list(storage.indexer_configuration_add([tool]))
        actual_tool2 = actual_tools2[0]
        assert actual_tool2 is not None  # now it exists
        new_id2 = actual_tool2.pop('id')

        assert new_id == new_id2
        assert actual_tool == actual_tool2

    def test_indexer_configuration_add_multiple(
            self, swh_indexer_storage_with_data):
        storage, data = swh_indexer_storage_with_data
        tool = {
            'tool_name': 'some-unknown-tool',
            'tool_version': 'some-version',
            'tool_configuration': {"debian-package": "some-package"},
        }
        actual_tools = list(storage.indexer_configuration_add([tool]))
        assert len(actual_tools) == 1

        new_tools = [tool, {
            'tool_name': 'yet-another-tool',
            'tool_version': 'version',
            'tool_configuration': {},
        }]

        actual_tools = list(storage.indexer_configuration_add(new_tools))
        assert len(actual_tools) == 2

        # order not guaranteed, so we iterate over results to check
        for tool in actual_tools:
            _id = tool.pop('id')
            assert _id is not None
            assert tool in new_tools

    def test_indexer_configuration_get_missing(
            self, swh_indexer_storage_with_data):
        storage, data = swh_indexer_storage_with_data
        tool = {
            'tool_name': 'unknown-tool',
            'tool_version': '3.1.0rc2-31-ga2cbb8c',
            'tool_configuration': {"command_line": "nomossa "},
        }
        actual_tool = storage.indexer_configuration_get(tool)
        assert actual_tool is None

    def test_indexer_configuration_get(
            self, swh_indexer_storage_with_data):
        storage, data = swh_indexer_storage_with_data
        tool = {
            'tool_name': 'nomos',
            'tool_version': '3.1.0rc2-31-ga2cbb8c',
            'tool_configuration': {"command_line": "nomossa "},
        }
        actual_tool = storage.indexer_configuration_get(tool)
        assert actual_tool

        expected_tool = tool.copy()
        del actual_tool['id']

        assert expected_tool == actual_tool

    def test_indexer_configuration_metadata_get_missing_context(
            self, swh_indexer_storage_with_data):
        storage, data = swh_indexer_storage_with_data
        tool = {
            'tool_name': 'swh-metadata-translator',
            'tool_version': '0.0.1',
            'tool_configuration': {"context": "unknown-context"},
        }
        actual_tool = storage.indexer_configuration_get(tool)
        assert actual_tool is None

    def test_indexer_configuration_metadata_get(
            self, swh_indexer_storage_with_data):
        storage, data = swh_indexer_storage_with_data
        tool = {
            'tool_name': 'swh-metadata-translator',
            'tool_version': '0.0.1',
            'tool_configuration': {"type": "local", "context": "NpmMapping"},
        }
        storage.indexer_configuration_add([tool])
        actual_tool = storage.indexer_configuration_get(tool)
        assert actual_tool

        expected_tool = tool.copy()
        expected_tool['id'] = actual_tool['id']

        assert expected_tool == actual_tool


class TestIndexerStorageMisc:
    """Misc endpoints tests for the IndexerStorage.

    """

    def test_check_config(self, swh_indexer_storage):
        storage = swh_indexer_storage
        assert storage.check_config(check_write=True)
        assert storage.check_config(check_write=False)
+
+    def test_types(self, swh_indexer_storage):
+        """Checks all methods of IndexerStorageInterface are implemented by
+        this backend, and that they have the same signature."""
+        # Create an instance of the protocol (which cannot be instantiated
+        # directly, so this creates a subclass, then instantiates it)
+        interface = type('_', (IndexerStorageInterface,), {})()
+
+        assert 'content_mimetype_add' in dir(interface)
+
+        missing_methods = []
+
+        for meth_name in dir(interface):
+            if meth_name.startswith('_'):
+                continue
+            interface_meth = getattr(interface, meth_name)
+            try:
+                concrete_meth = getattr(swh_indexer_storage, meth_name)
+            except AttributeError:
+                missing_methods.append(meth_name)
+                continue
+
+            expected_signature = inspect.signature(interface_meth)
+            actual_signature = inspect.signature(concrete_meth)
+
+            assert expected_signature == actual_signature, meth_name
+
+        assert missing_methods == []