diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py index 4987333..d853f17 100644 --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -1,452 +1,455 @@ # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import json import psycopg2 import psycopg2.pool -from collections import defaultdict +from collections import defaultdict, Counter from swh.storage.common import db_transaction_generator, db_transaction from swh.storage.exc import StorageDBError -from .db import Db from . import converters +from .db import Db +from .exc import IndexerStorageArgumentException, DuplicateId INDEXER_CFG_KEY = 'indexer_storage' MAPPING_NAMES = ['codemeta', 'gemspec', 'maven', 'npm', 'pkg-info'] def get_indexer_storage(cls, args): """Get an indexer storage object of class `storage_class` with arguments `storage_args`. Args: cls (str): storage's class, either 'local' or 'remote' args (dict): dictionary of arguments passed to the storage class constructor Returns: an instance of swh.indexer's storage (either local or remote) Raises: ValueError if passed an unknown storage class. """ if cls == 'remote': from .api.client import RemoteStorage as IndexerStorage elif cls == 'local': from . import IndexerStorage elif cls == 'memory': from .in_memory import IndexerStorage else: raise ValueError('Unknown indexer storage class `%s`' % cls) return IndexerStorage(**args) -def _check_id_duplicates(data): +def check_id_duplicates(data): """ If any two dictionaries in `data` have the same id, raises a `ValueError`. Values associated to the key must be hashable. Args: data (List[dict]): List of dictionaries to be inserted - >>> _check_id_duplicates([ + >>> check_id_duplicates([ ... {'id': 'foo', 'data': 'spam'}, ... {'id': 'bar', 'data': 'egg'}, ... ]) - >>> _check_id_duplicates([ + >>> check_id_duplicates([ ... {'id': 'foo', 'data': 'spam'}, ... {'id': 'foo', 'data': 'egg'}, ... ]) Traceback (most recent call last): ... - ValueError: The same id is present more than once. + swh.indexer.storage.exc.DuplicateId: ['foo'] """ - if len({item['id'] for item in data}) < len(data): - raise ValueError('The same id is present more than once.') + counter = Counter(item['id'] for item in data) + duplicates = [id_ for (id_, count) in counter.items() if count >= 2] + if duplicates: + raise DuplicateId(duplicates) class IndexerStorage: """SWH Indexer Storage """ def __init__(self, db, min_pool_conns=1, max_pool_conns=10): """ Args: db_conn: either a libpq connection string, or a psycopg2 connection """ try: if isinstance(db, psycopg2.extensions.connection): self._pool = None self._db = Db(db) else: self._pool = psycopg2.pool.ThreadedConnectionPool( min_pool_conns, max_pool_conns, db ) self._db = None except psycopg2.OperationalError as e: raise StorageDBError(e) def get_db(self): if self._db: return self._db return Db.from_pool(self._pool) def put_db(self, db): if db is not self._db: db.put_conn() @db_transaction() def check_config(self, *, check_write, db=None, cur=None): # Check permissions on one of the tables if check_write: check = 'INSERT' else: check = 'SELECT' cur.execute( "select has_table_privilege(current_user, 'content_mimetype', %s)", # noqa (check,) ) return cur.fetchone()[0] @db_transaction_generator() def content_mimetype_missing(self, mimetypes, db=None, cur=None): for obj in db.content_mimetype_missing_from_list(mimetypes, cur): yield obj[0] def _content_get_range(self, content_type, start, end, indexer_configuration_id, limit=1000, with_textual_data=False, db=None, cur=None): if limit is None: - raise ValueError('Development error: limit should not be None') + raise IndexerStorageArgumentException('limit should not be None') if content_type not in db.content_indexer_names: - err = 'Development error: Wrong type. Should be one of [%s]' % ( + err = 'Wrong type. Should be one of [%s]' % ( ','.join(db.content_indexer_names)) - raise ValueError(err) + raise IndexerStorageArgumentException(err) ids = [] next_id = None for counter, obj in enumerate(db.content_get_range( content_type, start, end, indexer_configuration_id, limit=limit+1, with_textual_data=with_textual_data, cur=cur)): _id = obj[0] if counter >= limit: next_id = _id break ids.append(_id) return { 'ids': ids, 'next': next_id } @db_transaction() def content_mimetype_get_range(self, start, end, indexer_configuration_id, limit=1000, db=None, cur=None): return self._content_get_range('mimetype', start, end, indexer_configuration_id, limit=limit, db=db, cur=cur) @db_transaction() def content_mimetype_add(self, mimetypes, conflict_update=False, db=None, cur=None): - _check_id_duplicates(mimetypes) + check_id_duplicates(mimetypes) mimetypes.sort(key=lambda m: m['id']) db.mktemp_content_mimetype(cur) db.copy_to(mimetypes, 'tmp_content_mimetype', ['id', 'mimetype', 'encoding', 'indexer_configuration_id'], cur) db.content_mimetype_add_from_temp(conflict_update, cur) @db_transaction_generator() def content_mimetype_get(self, ids, db=None, cur=None): for c in db.content_mimetype_get_from_list(ids, cur): yield converters.db_to_mimetype( dict(zip(db.content_mimetype_cols, c))) @db_transaction_generator() def content_language_missing(self, languages, db=None, cur=None): for obj in db.content_language_missing_from_list(languages, cur): yield obj[0] @db_transaction_generator() def content_language_get(self, ids, db=None, cur=None): for c in db.content_language_get_from_list(ids, cur): yield converters.db_to_language( dict(zip(db.content_language_cols, c))) @db_transaction() def content_language_add(self, languages, conflict_update=False, db=None, cur=None): - _check_id_duplicates(languages) + check_id_duplicates(languages) languages.sort(key=lambda m: m['id']) db.mktemp_content_language(cur) # empty language is mapped to 'unknown' db.copy_to( ({ 'id': l['id'], 'lang': 'unknown' if not l['lang'] else l['lang'], 'indexer_configuration_id': l['indexer_configuration_id'], } for l in languages), 'tmp_content_language', ['id', 'lang', 'indexer_configuration_id'], cur) db.content_language_add_from_temp(conflict_update, cur) @db_transaction_generator() def content_ctags_missing(self, ctags, db=None, cur=None): for obj in db.content_ctags_missing_from_list(ctags, cur): yield obj[0] @db_transaction_generator() def content_ctags_get(self, ids, db=None, cur=None): for c in db.content_ctags_get_from_list(ids, cur): yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, c))) @db_transaction() def content_ctags_add(self, ctags, conflict_update=False, db=None, cur=None): - _check_id_duplicates(ctags) + check_id_duplicates(ctags) ctags.sort(key=lambda m: m['id']) def _convert_ctags(__ctags): """Convert ctags dict to list of ctags. """ for ctags in __ctags: yield from converters.ctags_to_db(ctags) db.mktemp_content_ctags(cur) db.copy_to(list(_convert_ctags(ctags)), tblname='tmp_content_ctags', columns=['id', 'name', 'kind', 'line', 'lang', 'indexer_configuration_id'], cur=cur) db.content_ctags_add_from_temp(conflict_update, cur) @db_transaction_generator() def content_ctags_search(self, expression, limit=10, last_sha1=None, db=None, cur=None): for obj in db.content_ctags_search(expression, last_sha1, limit, cur=cur): yield converters.db_to_ctags(dict(zip(db.content_ctags_cols, obj))) @db_transaction_generator() def content_fossology_license_get(self, ids, db=None, cur=None): d = defaultdict(list) for c in db.content_fossology_license_get_from_list(ids, cur): license = dict(zip(db.content_fossology_license_cols, c)) id_ = license['id'] d[id_].append(converters.db_to_fossology_license(license)) for id_, facts in d.items(): yield {id_: facts} @db_transaction() def content_fossology_license_add(self, licenses, conflict_update=False, db=None, cur=None): - _check_id_duplicates(licenses) + check_id_duplicates(licenses) licenses.sort(key=lambda m: m['id']) db.mktemp_content_fossology_license(cur) db.copy_to( ({ 'id': sha1['id'], 'indexer_configuration_id': sha1['indexer_configuration_id'], 'license': license, } for sha1 in licenses for license in sha1['licenses']), tblname='tmp_content_fossology_license', columns=['id', 'license', 'indexer_configuration_id'], cur=cur) db.content_fossology_license_add_from_temp(conflict_update, cur) @db_transaction() def content_fossology_license_get_range( self, start, end, indexer_configuration_id, limit=1000, db=None, cur=None): return self._content_get_range('fossology_license', start, end, indexer_configuration_id, limit=limit, with_textual_data=True, db=db, cur=cur) @db_transaction_generator() def content_metadata_missing(self, metadata, db=None, cur=None): for obj in db.content_metadata_missing_from_list(metadata, cur): yield obj[0] @db_transaction_generator() def content_metadata_get(self, ids, db=None, cur=None): for c in db.content_metadata_get_from_list(ids, cur): yield converters.db_to_metadata( dict(zip(db.content_metadata_cols, c))) @db_transaction() def content_metadata_add(self, metadata, conflict_update=False, db=None, cur=None): - _check_id_duplicates(metadata) + check_id_duplicates(metadata) metadata.sort(key=lambda m: m['id']) db.mktemp_content_metadata(cur) db.copy_to(metadata, 'tmp_content_metadata', ['id', 'metadata', 'indexer_configuration_id'], cur) db.content_metadata_add_from_temp(conflict_update, cur) @db_transaction_generator() def revision_intrinsic_metadata_missing(self, metadata, db=None, cur=None): for obj in db.revision_intrinsic_metadata_missing_from_list( metadata, cur): yield obj[0] @db_transaction_generator() def revision_intrinsic_metadata_get(self, ids, db=None, cur=None): for c in db.revision_intrinsic_metadata_get_from_list(ids, cur): yield converters.db_to_metadata( dict(zip(db.revision_intrinsic_metadata_cols, c))) @db_transaction() def revision_intrinsic_metadata_add(self, metadata, conflict_update=False, db=None, cur=None): - _check_id_duplicates(metadata) + check_id_duplicates(metadata) metadata.sort(key=lambda m: m['id']) db.mktemp_revision_intrinsic_metadata(cur) db.copy_to(metadata, 'tmp_revision_intrinsic_metadata', ['id', 'metadata', 'mappings', 'indexer_configuration_id'], cur) db.revision_intrinsic_metadata_add_from_temp(conflict_update, cur) @db_transaction() def revision_intrinsic_metadata_delete(self, entries, db=None, cur=None): db.revision_intrinsic_metadata_delete(entries, cur) @db_transaction_generator() def origin_intrinsic_metadata_get(self, ids, db=None, cur=None): for c in db.origin_intrinsic_metadata_get_from_list(ids, cur): yield converters.db_to_metadata( dict(zip(db.origin_intrinsic_metadata_cols, c))) @db_transaction() def origin_intrinsic_metadata_add(self, metadata, conflict_update=False, db=None, cur=None): - _check_id_duplicates(metadata) + check_id_duplicates(metadata) metadata.sort(key=lambda m: m['id']) db.mktemp_origin_intrinsic_metadata(cur) db.copy_to(metadata, 'tmp_origin_intrinsic_metadata', ['id', 'metadata', 'indexer_configuration_id', 'from_revision', 'mappings'], cur) db.origin_intrinsic_metadata_add_from_temp(conflict_update, cur) @db_transaction() def origin_intrinsic_metadata_delete( self, entries, db=None, cur=None): db.origin_intrinsic_metadata_delete(entries, cur) @db_transaction_generator() def origin_intrinsic_metadata_search_fulltext( self, conjunction, limit=100, db=None, cur=None): for c in db.origin_intrinsic_metadata_search_fulltext( conjunction, limit=limit, cur=cur): yield converters.db_to_metadata( dict(zip(db.origin_intrinsic_metadata_cols, c))) @db_transaction() def origin_intrinsic_metadata_search_by_producer( self, page_token='', limit=100, ids_only=False, mappings=None, tool_ids=None, db=None, cur=None): assert isinstance(page_token, str) # we go to limit+1 to check whether we should add next_page_token in # the response res = db.origin_intrinsic_metadata_search_by_producer( page_token, limit + 1, ids_only, mappings, tool_ids, cur) result = {} if ids_only: result['origins'] = [origin for (origin,) in res] if len(result['origins']) > limit: result['origins'][limit:] = [] result['next_page_token'] = result['origins'][-1] else: result['origins'] = [converters.db_to_metadata( dict(zip(db.origin_intrinsic_metadata_cols, c)))for c in res] if len(result['origins']) > limit: result['origins'][limit:] = [] result['next_page_token'] = result['origins'][-1]['id'] return result @db_transaction() def origin_intrinsic_metadata_stats( self, db=None, cur=None): mapping_names = [m for m in MAPPING_NAMES] select_parts = [] # Count rows for each mapping for mapping_name in mapping_names: select_parts.append(( "sum(case when (mappings @> ARRAY['%s']) " " then 1 else 0 end)" ) % mapping_name) # Total select_parts.append("sum(1)") # Rows whose metadata has at least one key that is not '@context' select_parts.append( "sum(case when ('{}'::jsonb @> (metadata - '@context')) " " then 0 else 1 end)") cur.execute('select ' + ', '.join(select_parts) + ' from origin_intrinsic_metadata') results = dict(zip(mapping_names + ['total', 'non_empty'], cur.fetchone())) return { 'total': results.pop('total'), 'non_empty': results.pop('non_empty'), 'per_mapping': results, } @db_transaction_generator() def indexer_configuration_add(self, tools, db=None, cur=None): db.mktemp_indexer_configuration(cur) db.copy_to(tools, 'tmp_indexer_configuration', ['tool_name', 'tool_version', 'tool_configuration'], cur) tools = db.indexer_configuration_add_from_temp(cur) for line in tools: yield dict(zip(db.indexer_configuration_cols, line)) @db_transaction() def indexer_configuration_get(self, tool, db=None, cur=None): tool_conf = tool['tool_configuration'] if isinstance(tool_conf, dict): tool_conf = json.dumps(tool_conf) idx = db.indexer_configuration_get(tool['tool_name'], tool['tool_version'], tool_conf) if not idx: return None return dict(zip(db.indexer_configuration_cols, idx)) diff --git a/swh/indexer/storage/api/client.py b/swh/indexer/storage/api/client.py index 0e62adc..f1768f9 100644 --- a/swh/indexer/storage/api/client.py +++ b/swh/indexer/storage/api/client.py @@ -1,17 +1,23 @@ # Copyright (C) 2015-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.core.api import RPCClient -from swh.storage.exc import StorageAPIError +from swh.indexer.storage.exc import ( + IndexerStorageAPIError, IndexerStorageArgumentException, + DuplicateId, +) from ..interface import IndexerStorageInterface class RemoteStorage(RPCClient): """Proxy to a remote storage API""" backend_class = IndexerStorageInterface - api_exception = StorageAPIError + api_exception = IndexerStorageAPIError + reraise_exceptions = [ + IndexerStorageArgumentException, DuplicateId + ] diff --git a/swh/indexer/storage/api/server.py b/swh/indexer/storage/api/server.py index 2b71f2b..0266a55 100644 --- a/swh/indexer/storage/api/server.py +++ b/swh/indexer/storage/api/server.py @@ -1,107 +1,113 @@ # Copyright (C) 2015-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import logging from swh.core import config from swh.core.api import (RPCServerApp, error_handler, encode_data_server as encode_data) from swh.indexer.storage import ( get_indexer_storage, INDEXER_CFG_KEY ) +from swh.indexer.storage.exc import IndexerStorageArgumentException from swh.indexer.storage.interface import IndexerStorageInterface def get_storage(): global storage if not storage: storage = get_indexer_storage(**app.config[INDEXER_CFG_KEY]) return storage app = RPCServerApp(__name__, backend_class=IndexerStorageInterface, backend_factory=get_storage) storage = None @app.errorhandler(Exception) def my_error_handler(exception): return error_handler(exception, encode_data) +@app.errorhandler(IndexerStorageArgumentException) +def argument_error_handler(exception): + return error_handler(exception, encode_data, status_code=400) + + @app.route('/') def index(): return 'SWH Indexer Storage API server' api_cfg = None def load_and_check_config(config_file, type='local'): """Check the minimal configuration is set to run the api or raise an error explanation. Args: config_file (str): Path to the configuration file to load type (str): configuration type. For 'local' type, more checks are done. Raises: Error if the setup is not as expected Returns: configuration as a dict """ if not config_file: raise EnvironmentError('Configuration file must be defined') if not os.path.exists(config_file): raise FileNotFoundError('Configuration file %s does not exist' % ( config_file, )) cfg = config.read(config_file) if 'indexer_storage' not in cfg: raise KeyError("Missing '%indexer_storage' configuration") if type == 'local': vcfg = cfg['indexer_storage'] cls = vcfg.get('cls') if cls != 'local': raise ValueError( "The indexer_storage backend can only be started with a " "'local' configuration") args = vcfg['args'] if not args.get('db'): raise ValueError( "Invalid configuration; missing 'db' config entry") return cfg def make_app_from_configfile(): """Run the WSGI app from the webserver, loading the configuration from a configuration file. SWH_CONFIG_FILENAME environment variable defines the configuration path to load. """ global api_cfg if not api_cfg: config_file = os.environ.get('SWH_CONFIG_FILENAME') api_cfg = load_and_check_config(config_file) app.config.update(api_cfg) handler = logging.StreamHandler() app.logger.addHandler(handler) return app if __name__ == '__main__': print('Deprecated. Use swh-indexer') diff --git a/swh/indexer/storage/exc.py b/swh/indexer/storage/exc.py new file mode 100644 index 0000000..841ee0c --- /dev/null +++ b/swh/indexer/storage/exc.py @@ -0,0 +1,19 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +class IndexerStorageAPIError(Exception): + """Generic error of the indexer storage.""" + pass + + +class IndexerStorageArgumentException(Exception): + """Argument passed to an IndexerStorage endpoint is invalid.""" + pass + + +class DuplicateId(IndexerStorageArgumentException): + """The same identifier is present more than once.""" + pass diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py index 74a41a5..46e1516 100644 --- a/swh/indexer/storage/in_memory.py +++ b/swh/indexer/storage/in_memory.py @@ -1,421 +1,421 @@ # Copyright (C) 2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import bisect from collections import defaultdict, Counter import itertools import json import operator import math import re +from typing import Any, Dict, List -from . import MAPPING_NAMES +from . import MAPPING_NAMES, check_id_duplicates +from .exc import IndexerStorageArgumentException SHA1_DIGEST_SIZE = 160 def _transform_tool(tool): return { 'id': tool['id'], 'name': tool['tool_name'], 'version': tool['tool_version'], 'configuration': tool['tool_configuration'], } +def check_id_types(data: List[Dict[str, Any]]): + """Checks all elements of the list have an 'id' whose type is 'bytes'.""" + if not all(isinstance(item.get('id'), bytes) for item in data): + raise IndexerStorageArgumentException('identifiers must be bytes.') + + class SubStorage: """Implements common missing/get/add logic for each indexer type.""" def __init__(self, tools): self._tools = tools self._sorted_ids = [] self._data = {} # map (id_, tool_id) -> metadata_dict self._tools_per_id = defaultdict(set) # map id_ -> Set[tool_id] def missing(self, ids): """List data missing from storage. Args: data (iterable): dictionaries with keys: - **id** (bytes): sha1 identifier - **indexer_configuration_id** (int): tool used to compute the results Yields: missing sha1s """ for id_ in ids: tool_id = id_['indexer_configuration_id'] id_ = id_['id'] if tool_id not in self._tools_per_id.get(id_, set()): yield id_ def get(self, ids): """Retrieve data per id. Args: ids (iterable): sha1 checksums Yields: dict: dictionaries with the following keys: - **id** (bytes) - **tool** (dict): tool used to compute metadata - arbitrary data (as provided to `add`) """ for id_ in ids: for tool_id in self._tools_per_id.get(id_, set()): key = (id_, tool_id) yield { 'id': id_, 'tool': _transform_tool(self._tools[tool_id]), **self._data[key], } def get_all(self): yield from self.get(self._sorted_ids) def get_range(self, start, end, indexer_configuration_id, limit): """Retrieve data within range [start, end] bound by limit. Args: **start** (bytes): Starting identifier range (expected smaller than end) **end** (bytes): Ending identifier range (expected larger than start) **indexer_configuration_id** (int): The tool used to index data **limit** (int): Limit result Raises: - ValueError for limit to None + IndexerStorageArgumentException for limit to None Returns: a dict with keys: - **ids** [bytes]: iterable of content ids within the range. - **next** (Optional[bytes]): The next range of sha1 starts at this sha1 if any """ if limit is None: - raise ValueError('Development error: limit should not be None') + raise IndexerStorageArgumentException('limit should not be None') from_index = bisect.bisect_left(self._sorted_ids, start) to_index = bisect.bisect_right(self._sorted_ids, end, lo=from_index) if to_index - from_index >= limit: return { 'ids': self._sorted_ids[from_index:from_index+limit], 'next': self._sorted_ids[from_index+limit], } else: return { 'ids': self._sorted_ids[from_index:to_index], 'next': None, } def add(self, data, conflict_update): """Add data not present in storage. Args: data (iterable): dictionaries with keys: - **id**: sha1 - **indexer_configuration_id**: tool used to compute the results - arbitrary data conflict_update (bool): Flag to determine if we want to overwrite (true) or skip duplicates (false) """ data = list(data) - if len({x['id'] for x in data}) < len(data): - # For "exception-compatibility" with the pgsql backend - raise ValueError('The same id is present more than once.') + check_id_duplicates(data) for item in data: item = item.copy() tool_id = item.pop('indexer_configuration_id') id_ = item.pop('id') data = item if not conflict_update and \ tool_id in self._tools_per_id.get(id_, set()): # Duplicate, should not be updated continue key = (id_, tool_id) self._data[key] = data self._tools_per_id[id_].add(tool_id) if id_ not in self._sorted_ids: bisect.insort(self._sorted_ids, id_) def add_merge(self, new_data, conflict_update, merged_key): for new_item in new_data: id_ = new_item['id'] tool_id = new_item['indexer_configuration_id'] if conflict_update: all_subitems = [] else: existing = list(self.get([id_])) all_subitems = [ old_subitem for existing_item in existing if existing_item['tool']['id'] == tool_id for old_subitem in existing_item[merged_key] ] for new_subitem in new_item[merged_key]: if new_subitem not in all_subitems: all_subitems.append(new_subitem) self.add([ { 'id': id_, 'indexer_configuration_id': tool_id, merged_key: all_subitems, } ], conflict_update=True) if id_ not in self._sorted_ids: bisect.insort(self._sorted_ids, id_) def delete(self, entries): for entry in entries: (id_, tool_id) = (entry['id'], entry['indexer_configuration_id']) key = (id_, tool_id) if tool_id in self._tools_per_id[id_]: self._tools_per_id[id_].remove(tool_id) if key in self._data: del self._data[key] class IndexerStorage: """In-memory SWH indexer storage.""" def __init__(self): self._tools = {} self._mimetypes = SubStorage(self._tools) self._languages = SubStorage(self._tools) self._content_ctags = SubStorage(self._tools) self._licenses = SubStorage(self._tools) self._content_metadata = SubStorage(self._tools) self._revision_intrinsic_metadata = SubStorage(self._tools) self._origin_intrinsic_metadata = SubStorage(self._tools) def check_config(self, *, check_write): return True def content_mimetype_missing(self, mimetypes): yield from self._mimetypes.missing(mimetypes) def content_mimetype_get_range( self, start, end, indexer_configuration_id, limit=1000): return self._mimetypes.get_range( start, end, indexer_configuration_id, limit) def content_mimetype_add(self, mimetypes, conflict_update=False): - if not all(isinstance(x['id'], bytes) for x in mimetypes): - raise TypeError('identifiers must be bytes.') + check_id_types(mimetypes) self._mimetypes.add(mimetypes, conflict_update) def content_mimetype_get(self, ids): yield from self._mimetypes.get(ids) def content_language_missing(self, languages): yield from self._languages.missing(languages) def content_language_get(self, ids): yield from self._languages.get(ids) def content_language_add(self, languages, conflict_update=False): - if not all(isinstance(x['id'], bytes) for x in languages): - raise TypeError('identifiers must be bytes.') + check_id_types(languages) self._languages.add(languages, conflict_update) def content_ctags_missing(self, ctags): yield from self._content_ctags.missing(ctags) def content_ctags_get(self, ids): for item in self._content_ctags.get(ids): for item_ctags_item in item['ctags']: yield { 'id': item['id'], 'tool': item['tool'], **item_ctags_item } def content_ctags_add(self, ctags, conflict_update=False): - if not all(isinstance(x['id'], bytes) for x in ctags): - raise TypeError('identifiers must be bytes.') + check_id_types(ctags) self._content_ctags.add_merge(ctags, conflict_update, 'ctags') def content_ctags_search(self, expression, limit=10, last_sha1=None): nb_matches = 0 for ((id_, tool_id), item) in \ sorted(self._content_ctags._data.items()): if id_ <= (last_sha1 or bytes(0 for _ in range(SHA1_DIGEST_SIZE))): continue for ctags_item in item['ctags']: if ctags_item['name'] != expression: continue nb_matches += 1 yield { 'id': id_, 'tool': _transform_tool(self._tools[tool_id]), **ctags_item } if nb_matches >= limit: return def content_fossology_license_get(self, ids): # Rewrites the output of SubStorage.get from the old format to # the new one. SubStorage.get should be updated once all other # *_get methods use the new format. # See: https://forge.softwareheritage.org/T1433 res = {} for d in self._licenses.get(ids): res.setdefault(d.pop('id'), []).append(d) for (id_, facts) in res.items(): yield {id_: facts} def content_fossology_license_add(self, licenses, conflict_update=False): - if not all(isinstance(x['id'], bytes) for x in licenses): - raise TypeError('identifiers must be bytes.') + check_id_types(licenses) self._licenses.add_merge(licenses, conflict_update, 'licenses') def content_fossology_license_get_range( self, start, end, indexer_configuration_id, limit=1000): return self._licenses.get_range( start, end, indexer_configuration_id, limit) def content_metadata_missing(self, metadata): yield from self._content_metadata.missing(metadata) def content_metadata_get(self, ids): yield from self._content_metadata.get(ids) def content_metadata_add(self, metadata, conflict_update=False): - if not all(isinstance(x['id'], bytes) for x in metadata): - raise TypeError('identifiers must be bytes.') + check_id_types(metadata) self._content_metadata.add(metadata, conflict_update) def revision_intrinsic_metadata_missing(self, metadata): yield from self._revision_intrinsic_metadata.missing(metadata) def revision_intrinsic_metadata_get(self, ids): yield from self._revision_intrinsic_metadata.get(ids) def revision_intrinsic_metadata_add(self, metadata, conflict_update=False): - if not all(isinstance(x['id'], bytes) for x in metadata): - raise TypeError('identifiers must be bytes.') + check_id_types(metadata) self._revision_intrinsic_metadata.add(metadata, conflict_update) def revision_intrinsic_metadata_delete(self, entries): self._revision_intrinsic_metadata.delete(entries) def origin_intrinsic_metadata_get(self, ids): yield from self._origin_intrinsic_metadata.get(ids) def origin_intrinsic_metadata_add(self, metadata, conflict_update=False): self._origin_intrinsic_metadata.add(metadata, conflict_update) def origin_intrinsic_metadata_delete(self, entries): self._origin_intrinsic_metadata.delete(entries) def origin_intrinsic_metadata_search_fulltext( self, conjunction, limit=100): # A very crude fulltext search implementation, but that's enough # to work on English metadata tokens_re = re.compile('[a-zA-Z0-9]+') search_tokens = list(itertools.chain( *map(tokens_re.findall, conjunction))) def rank(data): # Tokenize the metadata text = json.dumps(data['metadata']) text_tokens = tokens_re.findall(text) text_token_occurences = Counter(text_tokens) # Count the number of occurrences of search tokens in the text score = 0 for search_token in search_tokens: if text_token_occurences[search_token] == 0: # Search token is not in the text. return 0 score += text_token_occurences[search_token] # Normalize according to the text's length return score / math.log(len(text_tokens)) results = [(rank(data), data) for data in self._origin_intrinsic_metadata.get_all()] results = [(rank_, data) for (rank_, data) in results if rank_ > 0] results.sort(key=operator.itemgetter(0), # Don't try to order 'data' reverse=True) for (rank_, result) in results[:limit]: yield result def origin_intrinsic_metadata_search_by_producer( self, page_token='', limit=100, ids_only=False, mappings=None, tool_ids=None): assert isinstance(page_token, str) nb_results = 0 if mappings is not None: mappings = frozenset(mappings) if tool_ids is not None: tool_ids = frozenset(tool_ids) origins = [] # we go to limit+1 to check whether we should add next_page_token in # the response for entry in self._origin_intrinsic_metadata.get_all(): if entry['id'] <= page_token: continue if nb_results >= (limit + 1): break if mappings is not None and mappings.isdisjoint(entry['mappings']): continue if tool_ids is not None and entry['tool']['id'] not in tool_ids: continue origins.append(entry) nb_results += 1 result = {} if len(origins) > limit: origins = origins[:limit] result['next_page_token'] = origins[-1]['id'] if ids_only: origins = [origin['id'] for origin in origins] result['origins'] = origins return result def origin_intrinsic_metadata_stats(self): mapping_count = {m: 0 for m in MAPPING_NAMES} total = non_empty = 0 for data in self._origin_intrinsic_metadata.get_all(): total += 1 if set(data['metadata']) - {'@context'}: non_empty += 1 for mapping in data['mappings']: mapping_count[mapping] += 1 return { 'per_mapping': mapping_count, 'total': total, 'non_empty': non_empty } def indexer_configuration_add(self, tools): inserted = [] for tool in tools: tool = tool.copy() id_ = self._tool_key(tool) tool['id'] = id_ self._tools[id_] = tool inserted.append(tool) return inserted def indexer_configuration_get(self, tool): return self._tools.get(self._tool_key(tool)) def _tool_key(self, tool): return hash((tool['tool_name'], tool['tool_version'], json.dumps(tool['tool_configuration'], sort_keys=True))) diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py index 3218dc2..8f4c2fb 100644 --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -1,1888 +1,1889 @@ # Copyright (C) 2015-2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import inspect import threading import pytest from swh.model.hashutil import hash_to_bytes +from swh.indexer.storage.exc import ( + IndexerStorageArgumentException, DuplicateId, +) from swh.indexer.storage.interface import IndexerStorageInterface def prepare_mimetypes_from(fossology_licenses): """Fossology license needs some consistent data in db to run. """ mimetypes = [] for c in fossology_licenses: mimetypes.append({ 'id': c['id'], 'mimetype': 'text/plain', 'encoding': 'utf-8', 'indexer_configuration_id': c['indexer_configuration_id'], }) return mimetypes def endpoint(storage, endpoint_type, endpoint_name): return getattr(storage, endpoint_type + '_' + endpoint_name) class StorageETypeTester: """Base class for testing a series of common behaviour between a bunch of endpoint types supported by an IndexerStorage. This is supposed to be inherited with the following class attributes: - endpoint_type - tool_name - example_data See below for example usage. """ def test_missing(self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data etype = self.endpoint_type tool_id = data.tools[self.tool_name]['id'] # given 2 (hopefully) unknown objects query = [ { 'id': data.sha1_1, 'indexer_configuration_id': tool_id, }, { 'id': data.sha1_2, 'indexer_configuration_id': tool_id, }] # we expect these are both returned by the xxx_missing endpoint actual_missing = endpoint(storage, etype, 'missing')(query) assert list(actual_missing) == [ data.sha1_1, data.sha1_2, ] # now, when we add one of them endpoint(storage, etype, 'add')([{ 'id': data.sha1_2, **self.example_data[0], 'indexer_configuration_id': tool_id, }]) # we expect only the other one returned actual_missing = endpoint(storage, etype, 'missing')(query) assert list(actual_missing) == [data.sha1_1] def test_add__drop_duplicate(self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data etype = self.endpoint_type tool_id = data.tools[self.tool_name]['id'] # add the first object data_v1 = { 'id': data.sha1_2, **self.example_data[0], 'indexer_configuration_id': tool_id, } endpoint(storage, etype, 'add')([data_v1]) # should be able to retrieve it actual_data = list(endpoint(storage, etype, 'get')([data.sha1_2])) expected_data_v1 = [{ 'id': data.sha1_2, **self.example_data[0], 'tool': data.tools[self.tool_name], }] assert actual_data == expected_data_v1 # now if we add a modified version of the same object (same id) data_v2 = data_v1.copy() data_v2.update(self.example_data[1]) endpoint(storage, etype, 'add')([data_v2]) # we expect to retrieve the original data, not the modified one actual_data = list(endpoint(storage, etype, 'get')([data.sha1_2])) assert actual_data == expected_data_v1 def test_add__update_in_place_duplicate( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data etype = self.endpoint_type tool = data.tools[self.tool_name] data_v1 = { 'id': data.sha1_2, **self.example_data[0], 'indexer_configuration_id': tool['id'], } # given endpoint(storage, etype, 'add')([data_v1]) # when actual_data = list(endpoint(storage, etype, 'get')([data.sha1_2])) expected_data_v1 = [{ 'id': data.sha1_2, **self.example_data[0], 'tool': tool, }] # then assert actual_data == expected_data_v1 # given data_v2 = data_v1.copy() data_v2.update(self.example_data[1]) endpoint(storage, etype, 'add')([data_v2], conflict_update=True) actual_data = list(endpoint(storage, etype, 'get')([data.sha1_2])) expected_data_v2 = [{ 'id': data.sha1_2, **self.example_data[1], 'tool': tool, }] # data did change as the v2 was used to overwrite v1 assert actual_data == expected_data_v2 def test_add__update_in_place_deadlock( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data etype = self.endpoint_type tool = data.tools[self.tool_name] hashes = [ hash_to_bytes( '34973274ccef6ab4dfaaf86599792fa9c3fe4{:03d}'.format(i)) for i in range(1000)] data_v1 = [ { 'id': hash_, **self.example_data[0], 'indexer_configuration_id': tool['id'], } for hash_ in hashes ] data_v2 = [ { 'id': hash_, **self.example_data[1], 'indexer_configuration_id': tool['id'], } for hash_ in hashes ] # Remove one item from each, so that both queries have to succeed for # all items to be in the DB. data_v2a = data_v2[1:] data_v2b = list(reversed(data_v2[0:-1])) # given endpoint(storage, etype, 'add')(data_v1) # when actual_data = list(endpoint(storage, etype, 'get')(hashes)) expected_data_v1 = [ { 'id': hash_, **self.example_data[0], 'tool': tool, } for hash_ in hashes ] # then assert actual_data == expected_data_v1 # given def f1(): endpoint(storage, etype, 'add')(data_v2a, conflict_update=True) def f2(): endpoint(storage, etype, 'add')(data_v2b, conflict_update=True) t1 = threading.Thread(target=f1) t2 = threading.Thread(target=f2) t2.start() t1.start() t1.join() t2.join() actual_data = sorted(endpoint(storage, etype, 'get')(hashes), key=lambda x: x['id']) expected_data_v2 = [ { 'id': hash_, **self.example_data[1], 'tool': tool, } for hash_ in hashes ] assert actual_data == expected_data_v2 def test_add__duplicate_twice(self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data etype = self.endpoint_type tool = data.tools[self.tool_name] data_rev1 = { 'id': data.revision_id_2, **self.example_data[0], 'indexer_configuration_id': tool['id'] } data_rev2 = { 'id': data.revision_id_2, **self.example_data[1], 'indexer_configuration_id': tool['id'] } # when endpoint(storage, etype, 'add')([data_rev1]) - with pytest.raises(ValueError): + with pytest.raises(DuplicateId): endpoint(storage, etype, 'add')( [data_rev2, data_rev2], conflict_update=True) # then actual_data = list(endpoint(storage, etype, 'get')( [data.revision_id_2, data.revision_id_1])) expected_data = [{ 'id': data.revision_id_2, **self.example_data[0], 'tool': tool, }] assert actual_data == expected_data def test_get(self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data etype = self.endpoint_type tool = data.tools[self.tool_name] query = [data.sha1_2, data.sha1_1] data1 = { 'id': data.sha1_2, **self.example_data[0], 'indexer_configuration_id': tool['id'], } # when endpoint(storage, etype, 'add')([data1]) # then actual_data = list(endpoint(storage, etype, 'get')(query)) # then expected_data = [{ 'id': data.sha1_2, **self.example_data[0], 'tool': tool, }] assert actual_data == expected_data class TestIndexerStorageContentMimetypes(StorageETypeTester): """Test Indexer Storage content_mimetype related methods """ endpoint_type = 'content_mimetype' tool_name = 'file' example_data = [ { 'mimetype': 'text/plain', 'encoding': 'utf-8', }, { 'mimetype': 'text/html', 'encoding': 'us-ascii', }, ] def test_generate_content_mimetype_get_range_limit_none( self, swh_indexer_storage): """mimetype_get_range call with wrong limit input should fail""" storage = swh_indexer_storage - with pytest.raises(ValueError) as e: + with pytest.raises(IndexerStorageArgumentException) as e: storage.content_mimetype_get_range( start=None, end=None, indexer_configuration_id=None, limit=None) - assert e.value.args == ( - 'Development error: limit should not be None',) + assert e.value.args == ('limit should not be None',) def test_generate_content_mimetype_get_range_no_limit( self, swh_indexer_storage_with_data): """mimetype_get_range returns mimetypes within range provided""" storage, data = swh_indexer_storage_with_data mimetypes = data.mimetypes # All ids from the db content_ids = sorted([c['id'] for c in mimetypes]) start = content_ids[0] end = content_ids[-1] # retrieve mimetypes tool_id = mimetypes[0]['indexer_configuration_id'] actual_result = storage.content_mimetype_get_range( start, end, indexer_configuration_id=tool_id) actual_ids = actual_result['ids'] actual_next = actual_result['next'] assert len(mimetypes) == len(actual_ids) assert actual_next is None assert content_ids == actual_ids def test_generate_content_mimetype_get_range_limit( self, swh_indexer_storage_with_data): """mimetype_get_range paginates results if limit exceeded""" storage, data = swh_indexer_storage_with_data indexer_configuration_id = data.tools['file']['id'] # input the list of sha1s we want from storage content_ids = sorted( [c['id'] for c in data.mimetypes]) mimetypes = list(storage.content_mimetype_get(content_ids)) assert len(mimetypes) == len(data.mimetypes) start = content_ids[0] end = content_ids[-1] # retrieve mimetypes limited to 10 results actual_result = storage.content_mimetype_get_range( start, end, indexer_configuration_id=indexer_configuration_id, limit=10) assert actual_result assert set(actual_result.keys()) == {'ids', 'next'} actual_ids = actual_result['ids'] actual_next = actual_result['next'] assert len(actual_ids) == 10 assert actual_next is not None assert actual_next == content_ids[10] expected_mimetypes = content_ids[:10] assert expected_mimetypes == actual_ids # retrieve next part actual_result = storage.content_mimetype_get_range( start=end, end=end, indexer_configuration_id=indexer_configuration_id) assert set(actual_result.keys()) == {'ids', 'next'} actual_ids = actual_result['ids'] actual_next = actual_result['next'] assert actual_next is None expected_mimetypes = [content_ids[-1]] assert expected_mimetypes == actual_ids class TestIndexerStorageContentLanguage(StorageETypeTester): """Test Indexer Storage content_language related methods """ endpoint_type = 'content_language' tool_name = 'pygments' example_data = [ { 'lang': 'haskell', }, { 'lang': 'common-lisp', }, ] class TestIndexerStorageContentCTags(StorageETypeTester): """Test Indexer Storage content_ctags related methods """ endpoint_type = 'content_ctags' tool_name = 'universal-ctags' example_data = [ { 'ctags': [{ 'name': 'done', 'kind': 'variable', 'line': 119, 'lang': 'OCaml', }] }, { 'ctags': [ { 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Python', }, { 'name': 'main', 'kind': 'function', 'line': 119, 'lang': 'Python', }] }, ] # the following tests are disabled because CTAGS behaves differently @pytest.mark.skip def test_add__drop_duplicate(self): pass @pytest.mark.skip def test_add__update_in_place_duplicate(self): pass @pytest.mark.skip def test_add__update_in_place_deadlock(self): pass @pytest.mark.skip def test_add__duplicate_twice(self): pass @pytest.mark.skip def test_get(self): pass def test_content_ctags_search(self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data # 1. given tool = data.tools['universal-ctags'] tool_id = tool['id'] ctag1 = { 'id': data.sha1_1, 'indexer_configuration_id': tool_id, 'ctags': [ { 'name': 'hello', 'kind': 'function', 'line': 133, 'lang': 'Python', }, { 'name': 'counter', 'kind': 'variable', 'line': 119, 'lang': 'Python', }, { 'name': 'hello', 'kind': 'variable', 'line': 210, 'lang': 'Python', }, ] } ctag2 = { 'id': data.sha1_2, 'indexer_configuration_id': tool_id, 'ctags': [ { 'name': 'hello', 'kind': 'variable', 'line': 100, 'lang': 'C', }, { 'name': 'result', 'kind': 'variable', 'line': 120, 'lang': 'C', }, ] } storage.content_ctags_add([ctag1, ctag2]) # 1. when actual_ctags = list(storage.content_ctags_search('hello', limit=1)) # 1. then assert actual_ctags == [ { 'id': ctag1['id'], 'tool': tool, 'name': 'hello', 'kind': 'function', 'line': 133, 'lang': 'Python', } ] # 2. when actual_ctags = list(storage.content_ctags_search( 'hello', limit=1, last_sha1=ctag1['id'])) # 2. then assert actual_ctags == [ { 'id': ctag2['id'], 'tool': tool, 'name': 'hello', 'kind': 'variable', 'line': 100, 'lang': 'C', } ] # 3. when actual_ctags = list(storage.content_ctags_search('hello')) # 3. then assert actual_ctags == [ { 'id': ctag1['id'], 'tool': tool, 'name': 'hello', 'kind': 'function', 'line': 133, 'lang': 'Python', }, { 'id': ctag1['id'], 'tool': tool, 'name': 'hello', 'kind': 'variable', 'line': 210, 'lang': 'Python', }, { 'id': ctag2['id'], 'tool': tool, 'name': 'hello', 'kind': 'variable', 'line': 100, 'lang': 'C', }, ] # 4. when actual_ctags = list(storage.content_ctags_search('counter')) # then assert actual_ctags == [{ 'id': ctag1['id'], 'tool': tool, 'name': 'counter', 'kind': 'variable', 'line': 119, 'lang': 'Python', }] # 5. when actual_ctags = list(storage.content_ctags_search('result', limit=1)) # then assert actual_ctags == [{ 'id': ctag2['id'], 'tool': tool, 'name': 'result', 'kind': 'variable', 'line': 120, 'lang': 'C', }] def test_content_ctags_search_no_result(self, swh_indexer_storage): storage = swh_indexer_storage actual_ctags = list(storage.content_ctags_search('counter')) assert not actual_ctags def test_content_ctags_add__add_new_ctags_added( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data # given tool = data.tools['universal-ctags'] tool_id = tool['id'] ctag_v1 = { 'id': data.sha1_2, 'indexer_configuration_id': tool_id, 'ctags': [{ 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', }] } # given storage.content_ctags_add([ctag_v1]) storage.content_ctags_add([ctag_v1]) # conflict does nothing # when actual_ctags = list(storage.content_ctags_get([data.sha1_2])) # then expected_ctags = [{ 'id': data.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool, }] assert actual_ctags == expected_ctags # given ctag_v2 = ctag_v1.copy() ctag_v2.update({ 'ctags': [ { 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', } ] }) storage.content_ctags_add([ctag_v2]) expected_ctags = [ { 'id': data.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool, }, { 'id': data.sha1_2, 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', 'tool': tool, } ] actual_ctags = list(storage.content_ctags_get( [data.sha1_2])) assert actual_ctags == expected_ctags def test_content_ctags_add__update_in_place( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data # given tool = data.tools['universal-ctags'] tool_id = tool['id'] ctag_v1 = { 'id': data.sha1_2, 'indexer_configuration_id': tool_id, 'ctags': [{ 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', }] } # given storage.content_ctags_add([ctag_v1]) # when actual_ctags = list(storage.content_ctags_get( [data.sha1_2])) # then expected_ctags = [ { 'id': data.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool } ] assert actual_ctags == expected_ctags # given ctag_v2 = ctag_v1.copy() ctag_v2.update({ 'ctags': [ { 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', }, { 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', } ] }) storage.content_ctags_add([ctag_v2], conflict_update=True) actual_ctags = list(storage.content_ctags_get( [data.sha1_2])) # ctag did change as the v2 was used to overwrite v1 expected_ctags = [ { 'id': data.sha1_2, 'name': 'done', 'kind': 'variable', 'line': 100, 'lang': 'Scheme', 'tool': tool, }, { 'id': data.sha1_2, 'name': 'defn', 'kind': 'function', 'line': 120, 'lang': 'Scheme', 'tool': tool, } ] assert actual_ctags == expected_ctags class TestIndexerStorageContentMetadata(StorageETypeTester): """Test Indexer Storage content_metadata related methods """ tool_name = 'swh-metadata-detector' endpoint_type = 'content_metadata' example_data = [ { 'metadata': { 'other': {}, 'codeRepository': { 'type': 'git', 'url': 'https://github.com/moranegg/metadata_test' }, 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, }, { 'metadata': { 'other': {}, 'name': 'test_metadata', 'version': '0.0.1' }, }, ] class TestIndexerStorageRevisionIntrinsicMetadata(StorageETypeTester): """Test Indexer Storage revision_intrinsic_metadata related methods """ tool_name = 'swh-metadata-detector' endpoint_type = 'revision_intrinsic_metadata' example_data = [ { 'metadata': { 'other': {}, 'codeRepository': { 'type': 'git', 'url': 'https://github.com/moranegg/metadata_test' }, 'description': 'Simple package.json test for indexer', 'name': 'test_metadata', 'version': '0.0.1' }, 'mappings': ['mapping1'], }, { 'metadata': { 'other': {}, 'name': 'test_metadata', 'version': '0.0.1' }, 'mappings': ['mapping2'], }, ] def test_revision_intrinsic_metadata_delete( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data etype = self.endpoint_type tool = data.tools[self.tool_name] query = [data.sha1_2, data.sha1_1] data1 = { 'id': data.sha1_2, **self.example_data[0], 'indexer_configuration_id': tool['id'], } # when endpoint(storage, etype, 'add')([data1]) endpoint(storage, etype, 'delete')([ { 'id': data.sha1_2, 'indexer_configuration_id': tool['id'], } ]) # then actual_data = list(endpoint(storage, etype, 'get')(query)) # then assert not actual_data def test_revision_intrinsic_metadata_delete_nonexisting( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data etype = self.endpoint_type tool = data.tools[self.tool_name] endpoint(storage, etype, 'delete')([ { 'id': data.sha1_2, 'indexer_configuration_id': tool['id'], } ]) class TestIndexerStorageContentFossologyLicence: def test_content_fossology_license_add__new_license_added( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data # given tool = data.tools['nomos'] tool_id = tool['id'] license_v1 = { 'id': data.sha1_1, 'licenses': ['Apache-2.0'], 'indexer_configuration_id': tool_id, } # given storage.content_fossology_license_add([license_v1]) # conflict does nothing storage.content_fossology_license_add([license_v1]) # when actual_licenses = list(storage.content_fossology_license_get( [data.sha1_1])) # then expected_license = { data.sha1_1: [{ 'licenses': ['Apache-2.0'], 'tool': tool, }] } assert actual_licenses == [expected_license] # given license_v2 = license_v1.copy() license_v2.update({ 'licenses': ['BSD-2-Clause'], }) storage.content_fossology_license_add([license_v2]) actual_licenses = list(storage.content_fossology_license_get( [data.sha1_1])) expected_license = { data.sha1_1: [{ 'licenses': ['Apache-2.0', 'BSD-2-Clause'], 'tool': tool }] } # license did not change as the v2 was dropped. assert actual_licenses == [expected_license] def test_generate_content_fossology_license_get_range_limit_none( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data """license_get_range call with wrong limit input should fail""" - with pytest.raises(ValueError) as e: + with pytest.raises(IndexerStorageArgumentException) as e: storage.content_fossology_license_get_range( start=None, end=None, indexer_configuration_id=None, limit=None) - assert e.value.args == ( - 'Development error: limit should not be None',) + assert e.value.args == ('limit should not be None',) def test_generate_content_fossology_license_get_range_no_limit( self, swh_indexer_storage_with_data): """license_get_range returns licenses within range provided""" storage, data = swh_indexer_storage_with_data # craft some consistent mimetypes fossology_licenses = data.fossology_licenses mimetypes = prepare_mimetypes_from(fossology_licenses) storage.content_mimetype_add(mimetypes, conflict_update=True) # add fossology_licenses to storage storage.content_fossology_license_add(fossology_licenses) # All ids from the db content_ids = sorted([c['id'] for c in fossology_licenses]) start = content_ids[0] end = content_ids[-1] # retrieve fossology_licenses tool_id = fossology_licenses[0]['indexer_configuration_id'] actual_result = storage.content_fossology_license_get_range( start, end, indexer_configuration_id=tool_id) actual_ids = actual_result['ids'] actual_next = actual_result['next'] assert len(fossology_licenses) == len(actual_ids) assert actual_next is None assert content_ids == actual_ids def test_generate_content_fossology_license_get_range_no_limit_with_filter( self, swh_indexer_storage_with_data): """This filters non textual, then returns results within range""" storage, data = swh_indexer_storage_with_data fossology_licenses = data.fossology_licenses mimetypes = data.mimetypes # craft some consistent mimetypes _mimetypes = prepare_mimetypes_from(fossology_licenses) # add binary mimetypes which will get filtered out in results for m in mimetypes: _mimetypes.append({ 'mimetype': 'binary', **m, }) storage.content_mimetype_add(_mimetypes, conflict_update=True) # add fossology_licenses to storage storage.content_fossology_license_add(fossology_licenses) # All ids from the db content_ids = sorted([c['id'] for c in fossology_licenses]) start = content_ids[0] end = content_ids[-1] # retrieve fossology_licenses tool_id = fossology_licenses[0]['indexer_configuration_id'] actual_result = storage.content_fossology_license_get_range( start, end, indexer_configuration_id=tool_id) actual_ids = actual_result['ids'] actual_next = actual_result['next'] assert len(fossology_licenses) == len(actual_ids) assert actual_next is None assert content_ids == actual_ids def test_generate_fossology_license_get_range_limit( self, swh_indexer_storage_with_data): """fossology_license_get_range paginates results if limit exceeded""" storage, data = swh_indexer_storage_with_data fossology_licenses = data.fossology_licenses # craft some consistent mimetypes mimetypes = prepare_mimetypes_from(fossology_licenses) # add fossology_licenses to storage storage.content_mimetype_add(mimetypes, conflict_update=True) storage.content_fossology_license_add(fossology_licenses) # input the list of sha1s we want from storage content_ids = sorted([c['id'] for c in fossology_licenses]) start = content_ids[0] end = content_ids[-1] # retrieve fossology_licenses limited to 3 results limited_results = len(fossology_licenses) - 1 tool_id = fossology_licenses[0]['indexer_configuration_id'] actual_result = storage.content_fossology_license_get_range( start, end, indexer_configuration_id=tool_id, limit=limited_results) actual_ids = actual_result['ids'] actual_next = actual_result['next'] assert limited_results == len(actual_ids) assert actual_next is not None assert actual_next == content_ids[-1] expected_fossology_licenses = content_ids[:-1] assert expected_fossology_licenses == actual_ids # retrieve next part actual_results2 = storage.content_fossology_license_get_range( start=end, end=end, indexer_configuration_id=tool_id) actual_ids2 = actual_results2['ids'] actual_next2 = actual_results2['next'] assert actual_next2 is None expected_fossology_licenses2 = [content_ids[-1]] assert expected_fossology_licenses2 == actual_ids2 class TestIndexerStorageOriginIntrinsicMetadata: def test_origin_intrinsic_metadata_get( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data # given tool_id = data.tools['swh-metadata-detector']['id'] metadata = { 'version': None, 'name': None, } metadata_rev = { 'id': data.revision_id_2, 'metadata': metadata, 'mappings': ['mapping1'], 'indexer_configuration_id': tool_id, } metadata_origin = { 'id': data.origin_url_1, 'metadata': metadata, 'indexer_configuration_id': tool_id, 'mappings': ['mapping1'], 'from_revision': data.revision_id_2, } # when storage.revision_intrinsic_metadata_add([metadata_rev]) storage.origin_intrinsic_metadata_add([metadata_origin]) # then actual_metadata = list(storage.origin_intrinsic_metadata_get( [data.origin_url_1, 'no://where'])) expected_metadata = [{ 'id': data.origin_url_1, 'metadata': metadata, 'tool': data.tools['swh-metadata-detector'], 'from_revision': data.revision_id_2, 'mappings': ['mapping1'], }] assert actual_metadata == expected_metadata def test_origin_intrinsic_metadata_delete( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data # given tool_id = data.tools['swh-metadata-detector']['id'] metadata = { 'version': None, 'name': None, } metadata_rev = { 'id': data.revision_id_2, 'metadata': metadata, 'mappings': ['mapping1'], 'indexer_configuration_id': tool_id, } metadata_origin = { 'id': data.origin_url_1, 'metadata': metadata, 'indexer_configuration_id': tool_id, 'mappings': ['mapping1'], 'from_revision': data.revision_id_2, } metadata_origin2 = metadata_origin.copy() metadata_origin2['id'] = data.origin_url_2 # when storage.revision_intrinsic_metadata_add([metadata_rev]) storage.origin_intrinsic_metadata_add([ metadata_origin, metadata_origin2]) storage.origin_intrinsic_metadata_delete([ { 'id': data.origin_url_1, 'indexer_configuration_id': tool_id } ]) # then actual_metadata = list(storage.origin_intrinsic_metadata_get( [data.origin_url_1, data.origin_url_2, 'no://where'])) for item in actual_metadata: item['indexer_configuration_id'] = item.pop('tool')['id'] assert actual_metadata == [metadata_origin2] def test_origin_intrinsic_metadata_delete_nonexisting( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data tool_id = data.tools['swh-metadata-detector']['id'] storage.origin_intrinsic_metadata_delete([ { 'id': data.origin_url_1, 'indexer_configuration_id': tool_id } ]) def test_origin_intrinsic_metadata_add_drop_duplicate( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data # given tool_id = data.tools['swh-metadata-detector']['id'] metadata_v1 = { 'version': None, 'name': None, } metadata_rev_v1 = { 'id': data.revision_id_1, 'metadata': metadata_v1.copy(), 'mappings': [], 'indexer_configuration_id': tool_id, } metadata_origin_v1 = { 'id': data.origin_url_1, 'metadata': metadata_v1.copy(), 'indexer_configuration_id': tool_id, 'mappings': [], 'from_revision': data.revision_id_1, } # given storage.revision_intrinsic_metadata_add([metadata_rev_v1]) storage.origin_intrinsic_metadata_add([metadata_origin_v1]) # when actual_metadata = list(storage.origin_intrinsic_metadata_get( [data.origin_url_1, 'no://where'])) expected_metadata_v1 = [{ 'id': data.origin_url_1, 'metadata': metadata_v1, 'tool': data.tools['swh-metadata-detector'], 'from_revision': data.revision_id_1, 'mappings': [], }] assert actual_metadata == expected_metadata_v1 # given metadata_v2 = metadata_v1.copy() metadata_v2.update({ 'name': 'test_metadata', 'author': 'MG', }) metadata_rev_v2 = metadata_rev_v1.copy() metadata_origin_v2 = metadata_origin_v1.copy() metadata_rev_v2['metadata'] = metadata_v2 metadata_origin_v2['metadata'] = metadata_v2 storage.revision_intrinsic_metadata_add([metadata_rev_v2]) storage.origin_intrinsic_metadata_add([metadata_origin_v2]) # then actual_metadata = list(storage.origin_intrinsic_metadata_get( [data.origin_url_1])) # metadata did not change as the v2 was dropped. assert actual_metadata == expected_metadata_v1 def test_origin_intrinsic_metadata_add_update_in_place_duplicate( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data # given tool_id = data.tools['swh-metadata-detector']['id'] metadata_v1 = { 'version': None, 'name': None, } metadata_rev_v1 = { 'id': data.revision_id_2, 'metadata': metadata_v1, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata_origin_v1 = { 'id': data.origin_url_1, 'metadata': metadata_v1.copy(), 'indexer_configuration_id': tool_id, 'mappings': [], 'from_revision': data.revision_id_2, } # given storage.revision_intrinsic_metadata_add([metadata_rev_v1]) storage.origin_intrinsic_metadata_add([metadata_origin_v1]) # when actual_metadata = list(storage.origin_intrinsic_metadata_get( [data.origin_url_1])) # then expected_metadata_v1 = [{ 'id': data.origin_url_1, 'metadata': metadata_v1, 'tool': data.tools['swh-metadata-detector'], 'from_revision': data.revision_id_2, 'mappings': [], }] assert actual_metadata == expected_metadata_v1 # given metadata_v2 = metadata_v1.copy() metadata_v2.update({ 'name': 'test_update_duplicated_metadata', 'author': 'MG', }) metadata_rev_v2 = metadata_rev_v1.copy() metadata_origin_v2 = metadata_origin_v1.copy() metadata_rev_v2['metadata'] = metadata_v2 metadata_origin_v2 = { 'id': data.origin_url_1, 'metadata': metadata_v2.copy(), 'indexer_configuration_id': tool_id, 'mappings': ['npm'], 'from_revision': data.revision_id_1, } storage.revision_intrinsic_metadata_add( [metadata_rev_v2], conflict_update=True) storage.origin_intrinsic_metadata_add( [metadata_origin_v2], conflict_update=True) actual_metadata = list(storage.origin_intrinsic_metadata_get( [data.origin_url_1])) expected_metadata_v2 = [{ 'id': data.origin_url_1, 'metadata': metadata_v2, 'tool': data.tools['swh-metadata-detector'], 'from_revision': data.revision_id_1, 'mappings': ['npm'], }] # metadata did change as the v2 was used to overwrite v1 assert actual_metadata == expected_metadata_v2 def test_origin_intrinsic_metadata_add__update_in_place_deadlock( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data # given tool_id = data.tools['swh-metadata-detector']['id'] ids = list(range(10)) example_data1 = { 'metadata': { 'version': None, 'name': None, }, 'mappings': [], } example_data2 = { 'metadata': { 'version': 'v1.1.1', 'name': 'foo', }, 'mappings': [], } metadata_rev_v1 = { 'id': data.revision_id_2, 'metadata': { 'version': None, 'name': None, }, 'mappings': [], 'indexer_configuration_id': tool_id, } data_v1 = [ { 'id': 'file:///tmp/origin%d' % id_, 'from_revision': data.revision_id_2, **example_data1, 'indexer_configuration_id': tool_id, } for id_ in ids ] data_v2 = [ { 'id': 'file:///tmp/origin%d' % id_, 'from_revision': data.revision_id_2, **example_data2, 'indexer_configuration_id': tool_id, } for id_ in ids ] # Remove one item from each, so that both queries have to succeed for # all items to be in the DB. data_v2a = data_v2[1:] data_v2b = list(reversed(data_v2[0:-1])) # given storage.revision_intrinsic_metadata_add([metadata_rev_v1]) storage.origin_intrinsic_metadata_add(data_v1) # when origins = ['file:///tmp/origin%d' % i for i in ids] actual_data = list(storage.origin_intrinsic_metadata_get(origins)) expected_data_v1 = [ { 'id': 'file:///tmp/origin%d' % id_, 'from_revision': data.revision_id_2, **example_data1, 'tool': data.tools['swh-metadata-detector'], } for id_ in ids ] # then assert actual_data == expected_data_v1 # given def f1(): storage.origin_intrinsic_metadata_add( data_v2a, conflict_update=True) def f2(): storage.origin_intrinsic_metadata_add( data_v2b, conflict_update=True) t1 = threading.Thread(target=f1) t2 = threading.Thread(target=f2) t2.start() t1.start() t1.join() t2.join() actual_data = list(storage.origin_intrinsic_metadata_get(origins)) expected_data_v2 = [ { 'id': 'file:///tmp/origin%d' % id_, 'from_revision': data.revision_id_2, **example_data2, 'tool': data.tools['swh-metadata-detector'], } for id_ in ids ] assert len(actual_data) == len(expected_data_v2) assert sorted(actual_data, key=lambda x: x['id']) == expected_data_v2 def test_origin_intrinsic_metadata_add__duplicate_twice( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data # given tool_id = data.tools['swh-metadata-detector']['id'] metadata = { 'developmentStatus': None, 'name': None, } metadata_rev = { 'id': data.revision_id_2, 'metadata': metadata, 'mappings': ['mapping1'], 'indexer_configuration_id': tool_id, } metadata_origin = { 'id': data.origin_url_1, 'metadata': metadata, 'indexer_configuration_id': tool_id, 'mappings': ['mapping1'], 'from_revision': data.revision_id_2, } # when storage.revision_intrinsic_metadata_add([metadata_rev]) - with pytest.raises(ValueError): + with pytest.raises(DuplicateId): storage.origin_intrinsic_metadata_add([ metadata_origin, metadata_origin]) def test_origin_intrinsic_metadata_search_fulltext( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data # given tool_id = data.tools['swh-metadata-detector']['id'] metadata1 = { 'author': 'John Doe', } metadata1_rev = { 'id': data.revision_id_1, 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata1_origin = { 'id': data.origin_url_1, 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': data.revision_id_1, } metadata2 = { 'author': 'Jane Doe', } metadata2_rev = { 'id': data.revision_id_2, 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata2_origin = { 'id': data.origin_url_2, 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': data.revision_id_2, } # when storage.revision_intrinsic_metadata_add([metadata1_rev]) storage.origin_intrinsic_metadata_add([metadata1_origin]) storage.revision_intrinsic_metadata_add([metadata2_rev]) storage.origin_intrinsic_metadata_add([metadata2_origin]) # then search = storage.origin_intrinsic_metadata_search_fulltext assert set([res['id'] for res in search(['Doe'])]) \ == set([data.origin_url_1, data.origin_url_2]) assert [res['id'] for res in search(['John', 'Doe'])] \ == [data.origin_url_1] assert [res['id'] for res in search(['John'])] \ == [data.origin_url_1] assert not list(search(['John', 'Jane'])) def test_origin_intrinsic_metadata_search_fulltext_rank( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data # given tool_id = data.tools['swh-metadata-detector']['id'] # The following authors have "Random Person" to add some more content # to the JSON data, to work around normalization quirks when there # are few words (rank/(1+ln(nb_words)) is very sensitive to nb_words # for small values of nb_words). metadata1 = { 'author': [ 'Random Person', 'John Doe', 'Jane Doe', ] } metadata1_rev = { 'id': data.revision_id_1, 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata1_origin = { 'id': data.origin_url_1, 'metadata': metadata1, 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': data.revision_id_1, } metadata2 = { 'author': [ 'Random Person', 'Jane Doe', ] } metadata2_rev = { 'id': data.revision_id_2, 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, } metadata2_origin = { 'id': data.origin_url_2, 'metadata': metadata2, 'mappings': [], 'indexer_configuration_id': tool_id, 'from_revision': data.revision_id_2, } # when storage.revision_intrinsic_metadata_add([metadata1_rev]) storage.origin_intrinsic_metadata_add([metadata1_origin]) storage.revision_intrinsic_metadata_add([metadata2_rev]) storage.origin_intrinsic_metadata_add([metadata2_origin]) # then search = storage.origin_intrinsic_metadata_search_fulltext assert [res['id'] for res in search(['Doe'])] \ == [data.origin_url_1, data.origin_url_2] assert [res['id'] for res in search(['Doe'], limit=1)] \ == [data.origin_url_1] assert [res['id'] for res in search(['John'])] \ == [data.origin_url_1] assert [res['id'] for res in search(['Jane'])] \ == [data.origin_url_2, data.origin_url_1] assert [res['id'] for res in search(['John', 'Jane'])] \ == [data.origin_url_1] def _fill_origin_intrinsic_metadata( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data tool1_id = data.tools['swh-metadata-detector']['id'] tool2_id = data.tools['swh-metadata-detector2']['id'] metadata1 = { '@context': 'foo', 'author': 'John Doe', } metadata1_rev = { 'id': data.revision_id_1, 'metadata': metadata1, 'mappings': ['npm'], 'indexer_configuration_id': tool1_id, } metadata1_origin = { 'id': data.origin_url_1, 'metadata': metadata1, 'mappings': ['npm'], 'indexer_configuration_id': tool1_id, 'from_revision': data.revision_id_1, } metadata2 = { '@context': 'foo', 'author': 'Jane Doe', } metadata2_rev = { 'id': data.revision_id_2, 'metadata': metadata2, 'mappings': ['npm', 'gemspec'], 'indexer_configuration_id': tool2_id, } metadata2_origin = { 'id': data.origin_url_2, 'metadata': metadata2, 'mappings': ['npm', 'gemspec'], 'indexer_configuration_id': tool2_id, 'from_revision': data.revision_id_2, } metadata3 = { '@context': 'foo', } metadata3_rev = { 'id': data.revision_id_3, 'metadata': metadata3, 'mappings': ['npm', 'gemspec'], 'indexer_configuration_id': tool2_id, } metadata3_origin = { 'id': data.origin_url_3, 'metadata': metadata3, 'mappings': ['pkg-info'], 'indexer_configuration_id': tool2_id, 'from_revision': data.revision_id_3, } storage.revision_intrinsic_metadata_add([metadata1_rev]) storage.origin_intrinsic_metadata_add([metadata1_origin]) storage.revision_intrinsic_metadata_add([metadata2_rev]) storage.origin_intrinsic_metadata_add([metadata2_origin]) storage.revision_intrinsic_metadata_add([metadata3_rev]) storage.origin_intrinsic_metadata_add([metadata3_origin]) def test_origin_intrinsic_metadata_search_by_producer( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data self._fill_origin_intrinsic_metadata( swh_indexer_storage_with_data) tool1 = data.tools['swh-metadata-detector'] tool2 = data.tools['swh-metadata-detector2'] endpoint = storage.origin_intrinsic_metadata_search_by_producer # test pagination # no 'page_token' param, return all origins result = endpoint(ids_only=True) assert result['origins'] \ == [data.origin_url_1, data.origin_url_2, data.origin_url_3] assert 'next_page_token' not in result # 'page_token' is < than origin_1, return everything result = endpoint(page_token=data.origin_url_1[:-1], ids_only=True) assert result['origins'] \ == [data.origin_url_1, data.origin_url_2, data.origin_url_3] assert 'next_page_token' not in result # 'page_token' is origin_3, return nothing result = endpoint(page_token=data.origin_url_3, ids_only=True) assert not result['origins'] assert 'next_page_token' not in result # test limit argument result = endpoint(page_token=data.origin_url_1[:-1], limit=2, ids_only=True) assert result['origins'] == [data.origin_url_1, data.origin_url_2] assert result['next_page_token'] == result['origins'][-1] result = endpoint(page_token=data.origin_url_1, limit=2, ids_only=True) assert result['origins'] == [data.origin_url_2, data.origin_url_3] assert 'next_page_token' not in result result = endpoint(page_token=data.origin_url_2, limit=2, ids_only=True) assert result['origins'] == [data.origin_url_3] assert 'next_page_token' not in result # test mappings filtering result = endpoint(mappings=['npm'], ids_only=True) assert result['origins'] == [data.origin_url_1, data.origin_url_2] assert 'next_page_token' not in result result = endpoint(mappings=['npm', 'gemspec'], ids_only=True) assert result['origins'] == [data.origin_url_1, data.origin_url_2] assert 'next_page_token' not in result result = endpoint(mappings=['gemspec'], ids_only=True) assert result['origins'] == [data.origin_url_2] assert 'next_page_token' not in result result = endpoint(mappings=['pkg-info'], ids_only=True) assert result['origins'] == [data.origin_url_3] assert 'next_page_token' not in result result = endpoint(mappings=['foobar'], ids_only=True) assert not result['origins'] assert 'next_page_token' not in result # test pagination + mappings result = endpoint(mappings=['npm'], limit=1, ids_only=True) assert result['origins'] == [data.origin_url_1] assert result['next_page_token'] == result['origins'][-1] # test tool filtering result = endpoint(tool_ids=[tool1['id']], ids_only=True) assert result['origins'] == [data.origin_url_1] assert 'next_page_token' not in result result = endpoint(tool_ids=[tool2['id']], ids_only=True) assert sorted(result['origins']) \ == [data.origin_url_2, data.origin_url_3] assert 'next_page_token' not in result result = endpoint(tool_ids=[tool1['id'], tool2['id']], ids_only=True) assert sorted(result['origins']) \ == [data.origin_url_1, data.origin_url_2, data.origin_url_3] assert 'next_page_token' not in result # test ids_only=False assert endpoint(mappings=['gemspec'])['origins'] \ == [{ 'id': data.origin_url_2, 'metadata': { '@context': 'foo', 'author': 'Jane Doe', }, 'mappings': ['npm', 'gemspec'], 'tool': tool2, 'from_revision': data.revision_id_2, }] def test_origin_intrinsic_metadata_stats( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data self._fill_origin_intrinsic_metadata( swh_indexer_storage_with_data) result = storage.origin_intrinsic_metadata_stats() assert result == { 'per_mapping': { 'gemspec': 1, 'npm': 2, 'pkg-info': 1, 'codemeta': 0, 'maven': 0, }, 'total': 3, 'non_empty': 2, } class TestIndexerStorageIndexerCondifuration: def test_indexer_configuration_add( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data tool = { 'tool_name': 'some-unknown-tool', 'tool_version': 'some-version', 'tool_configuration': {"debian-package": "some-package"}, } actual_tool = storage.indexer_configuration_get(tool) assert actual_tool is None # does not exist # add it actual_tools = list(storage.indexer_configuration_add([tool])) assert len(actual_tools) == 1 actual_tool = actual_tools[0] assert actual_tool is not None # now it exists new_id = actual_tool.pop('id') assert actual_tool == tool actual_tools2 = list(storage.indexer_configuration_add([tool])) actual_tool2 = actual_tools2[0] assert actual_tool2 is not None # now it exists new_id2 = actual_tool2.pop('id') assert new_id == new_id2 assert actual_tool == actual_tool2 def test_indexer_configuration_add_multiple( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data tool = { 'tool_name': 'some-unknown-tool', 'tool_version': 'some-version', 'tool_configuration': {"debian-package": "some-package"}, } actual_tools = list(storage.indexer_configuration_add([tool])) assert len(actual_tools) == 1 new_tools = [tool, { 'tool_name': 'yet-another-tool', 'tool_version': 'version', 'tool_configuration': {}, }] actual_tools = list(storage.indexer_configuration_add(new_tools)) assert len(actual_tools) == 2 # order not guaranteed, so we iterate over results to check for tool in actual_tools: _id = tool.pop('id') assert _id is not None assert tool in new_tools def test_indexer_configuration_get_missing( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data tool = { 'tool_name': 'unknown-tool', 'tool_version': '3.1.0rc2-31-ga2cbb8c', 'tool_configuration': {"command_line": "nomossa "}, } actual_tool = storage.indexer_configuration_get(tool) assert actual_tool is None def test_indexer_configuration_get( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data tool = { 'tool_name': 'nomos', 'tool_version': '3.1.0rc2-31-ga2cbb8c', 'tool_configuration': {"command_line": "nomossa "}, } actual_tool = storage.indexer_configuration_get(tool) assert actual_tool expected_tool = tool.copy() del actual_tool['id'] assert expected_tool == actual_tool def test_indexer_configuration_metadata_get_missing_context( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data tool = { 'tool_name': 'swh-metadata-translator', 'tool_version': '0.0.1', 'tool_configuration': {"context": "unknown-context"}, } actual_tool = storage.indexer_configuration_get(tool) assert actual_tool is None def test_indexer_configuration_metadata_get( self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data tool = { 'tool_name': 'swh-metadata-translator', 'tool_version': '0.0.1', 'tool_configuration': {"type": "local", "context": "NpmMapping"}, } storage.indexer_configuration_add([tool]) actual_tool = storage.indexer_configuration_get(tool) assert actual_tool expected_tool = tool.copy() expected_tool['id'] = actual_tool['id'] assert expected_tool == actual_tool class TestIndexerStorageMisc: """Misc endpoints tests for the IndexerStorage. """ def test_check_config(self, swh_indexer_storage): storage = swh_indexer_storage assert storage.check_config(check_write=True) assert storage.check_config(check_write=False) def test_types(self, swh_indexer_storage): """Checks all methods of StorageInterface are implemented by this backend, and that they have the same signature.""" # Create an instance of the protocol (which cannot be instantiated # directly, so this creates a subclass, then instantiates it) interface = type('_', (IndexerStorageInterface,), {})() assert 'content_mimetype_add' in dir(interface) missing_methods = [] for meth_name in dir(interface): if meth_name.startswith('_'): continue interface_meth = getattr(interface, meth_name) try: concrete_meth = getattr(swh_indexer_storage, meth_name) except AttributeError: missing_methods.append(meth_name) continue expected_signature = inspect.signature(interface_meth) actual_signature = inspect.signature(concrete_meth) assert expected_signature == actual_signature, meth_name assert missing_methods == []