diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py --- a/swh/indexer/storage/__init__.py +++ b/swh/indexer/storage/__init__.py @@ -8,13 +8,14 @@ import psycopg2 import psycopg2.pool -from collections import defaultdict +from collections import defaultdict, Counter from swh.storage.common import db_transaction_generator, db_transaction from swh.storage.exc import StorageDBError -from .db import Db from . import converters +from .db import Db +from .exc import IndexerStorageArgumentException, DuplicateId INDEXER_CFG_KEY = 'indexer_storage' @@ -51,7 +52,7 @@ return IndexerStorage(**args) -def _check_id_duplicates(data): +def check_id_duplicates(data): """ If any two dictionaries in `data` have the same id, raises a `ValueError`. @@ -61,11 +62,11 @@ Args: data (List[dict]): List of dictionaries to be inserted - >>> _check_id_duplicates([ + >>> check_id_duplicates([ ... {'id': 'foo', 'data': 'spam'}, ... {'id': 'bar', 'data': 'egg'}, ... ]) - >>> _check_id_duplicates([ + >>> check_id_duplicates([ ... {'id': 'foo', 'data': 'spam'}, ... {'id': 'foo', 'data': 'egg'}, ... ]) @@ -73,8 +74,10 @@ ... ValueError: The same id is present more than once. """ - if len({item['id'] for item in data}) < len(data): - raise ValueError('The same id is present more than once.') + counter = Counter(item['id'] for item in data) + duplicates = [id_ for (id_, count) in counter.items() if count >= 2] + if duplicates: + raise DuplicateId(duplicates) class IndexerStorage: @@ -132,11 +135,11 @@ with_textual_data=False, db=None, cur=None): if limit is None: - raise ValueError('Development error: limit should not be None') + raise IndexerStorageArgumentException('limit should not be None') if content_type not in db.content_indexer_names: - err = 'Development error: Wrong type. Should be one of [%s]' % ( + err = 'Wrong type. Should be one of [%s]' % ( ','.join(db.content_indexer_names)) - raise ValueError(err) + raise IndexerStorageArgumentException(err) ids = [] next_id = None @@ -165,7 +168,7 @@ @db_transaction() def content_mimetype_add(self, mimetypes, conflict_update=False, db=None, cur=None): - _check_id_duplicates(mimetypes) + check_id_duplicates(mimetypes) mimetypes.sort(key=lambda m: m['id']) db.mktemp_content_mimetype(cur) db.copy_to(mimetypes, 'tmp_content_mimetype', @@ -193,7 +196,7 @@ @db_transaction() def content_language_add(self, languages, conflict_update=False, db=None, cur=None): - _check_id_duplicates(languages) + check_id_duplicates(languages) languages.sort(key=lambda m: m['id']) db.mktemp_content_language(cur) # empty language is mapped to 'unknown' @@ -221,7 +224,7 @@ @db_transaction() def content_ctags_add(self, ctags, conflict_update=False, db=None, cur=None): - _check_id_duplicates(ctags) + check_id_duplicates(ctags) ctags.sort(key=lambda m: m['id']) def _convert_ctags(__ctags): @@ -262,7 +265,7 @@ @db_transaction() def content_fossology_license_add(self, licenses, conflict_update=False, db=None, cur=None): - _check_id_duplicates(licenses) + check_id_duplicates(licenses) licenses.sort(key=lambda m: m['id']) db.mktemp_content_fossology_license(cur) db.copy_to( @@ -299,7 +302,7 @@ @db_transaction() def content_metadata_add(self, metadata, conflict_update=False, db=None, cur=None): - _check_id_duplicates(metadata) + check_id_duplicates(metadata) metadata.sort(key=lambda m: m['id']) db.mktemp_content_metadata(cur) @@ -324,7 +327,7 @@ @db_transaction() def revision_intrinsic_metadata_add(self, metadata, conflict_update=False, db=None, cur=None): - _check_id_duplicates(metadata) + check_id_duplicates(metadata) metadata.sort(key=lambda m: m['id']) db.mktemp_revision_intrinsic_metadata(cur) @@ -349,7 +352,7 @@ def origin_intrinsic_metadata_add(self, metadata, conflict_update=False, db=None, cur=None): - _check_id_duplicates(metadata) + check_id_duplicates(metadata) metadata.sort(key=lambda m: m['id']) db.mktemp_origin_intrinsic_metadata(cur) diff --git a/swh/indexer/storage/api/client.py b/swh/indexer/storage/api/client.py --- a/swh/indexer/storage/api/client.py +++ b/swh/indexer/storage/api/client.py @@ -5,7 +5,10 @@ from swh.core.api import RPCClient -from swh.storage.exc import StorageAPIError +from swh.indexer.storage.exc import ( + IndexerStorageAPIError, IndexerStorageArgumentException, + DuplicateId, +) from ..interface import IndexerStorageInterface @@ -14,4 +17,7 @@ """Proxy to a remote storage API""" backend_class = IndexerStorageInterface - api_exception = StorageAPIError + api_exception = IndexerStorageAPIError + reraise_exceptions = [ + IndexerStorageArgumentException, DuplicateId + ] diff --git a/swh/indexer/storage/api/server.py b/swh/indexer/storage/api/server.py --- a/swh/indexer/storage/api/server.py +++ b/swh/indexer/storage/api/server.py @@ -12,6 +12,7 @@ from swh.indexer.storage import ( get_indexer_storage, INDEXER_CFG_KEY ) +from swh.indexer.storage.exc import IndexerStorageArgumentException from swh.indexer.storage.interface import IndexerStorageInterface @@ -34,6 +35,11 @@ return error_handler(exception, encode_data) +@app.errorhandler(IndexerStorageArgumentException) +def argument_error_handler(exception): + return error_handler(exception, encode_data, status_code=400) + + @app.route('/') def index(): return 'SWH Indexer Storage API server' diff --git a/swh/indexer/storage/exc.py b/swh/indexer/storage/exc.py new file mode 100644 --- /dev/null +++ b/swh/indexer/storage/exc.py @@ -0,0 +1,19 @@ +# Copyright (C) 2020 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +class IndexerStorageAPIError(Exception): + """Generic error of the indexer storage.""" + pass + + +class IndexerStorageArgumentException(Exception): + """Argument passed to an IndexerStorage endpoint is invalid.""" + pass + + +class DuplicateId(IndexerStorageArgumentException): + """The same identifier is present more than once.""" + pass diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py --- a/swh/indexer/storage/in_memory.py +++ b/swh/indexer/storage/in_memory.py @@ -10,8 +10,10 @@ import operator import math import re +from typing import Any, Dict, List -from . import MAPPING_NAMES +from . import MAPPING_NAMES, check_id_duplicates +from .exc import IndexerStorageArgumentException SHA1_DIGEST_SIZE = 160 @@ -25,6 +27,12 @@ } +def check_id_types(data: List[Dict[str, Any]]): + """Checks all elements of the list have an 'id' whose type is 'bytes'.""" + if not all(isinstance(item.get('id'), bytes) for item in data): + raise IndexerStorageArgumentException('identifiers must be bytes.') + + class SubStorage: """Implements common missing/get/add logic for each indexer type.""" def __init__(self, tools): @@ -91,7 +99,7 @@ **limit** (int): Limit result Raises: - ValueError for limit to None + IndexerStorageArgumentException for limit to None Returns: a dict with keys: @@ -101,7 +109,7 @@ """ if limit is None: - raise ValueError('Development error: limit should not be None') + raise IndexerStorageArgumentException('limit should not be None') from_index = bisect.bisect_left(self._sorted_ids, start) to_index = bisect.bisect_right(self._sorted_ids, end, lo=from_index) if to_index - from_index >= limit: @@ -131,9 +139,7 @@ """ data = list(data) - if len({x['id'] for x in data}) < len(data): - # For "exception-compatibility" with the pgsql backend - raise ValueError('The same id is present more than once.') + check_id_duplicates(data) for item in data: item = item.copy() tool_id = item.pop('indexer_configuration_id') @@ -211,8 +217,7 @@ start, end, indexer_configuration_id, limit) def content_mimetype_add(self, mimetypes, conflict_update=False): - if not all(isinstance(x['id'], bytes) for x in mimetypes): - raise TypeError('identifiers must be bytes.') + check_id_types(mimetypes) self._mimetypes.add(mimetypes, conflict_update) def content_mimetype_get(self, ids): @@ -225,8 +230,7 @@ yield from self._languages.get(ids) def content_language_add(self, languages, conflict_update=False): - if not all(isinstance(x['id'], bytes) for x in languages): - raise TypeError('identifiers must be bytes.') + check_id_types(languages) self._languages.add(languages, conflict_update) def content_ctags_missing(self, ctags): @@ -242,8 +246,7 @@ } def content_ctags_add(self, ctags, conflict_update=False): - if not all(isinstance(x['id'], bytes) for x in ctags): - raise TypeError('identifiers must be bytes.') + check_id_types(ctags) self._content_ctags.add_merge(ctags, conflict_update, 'ctags') def content_ctags_search(self, expression, @@ -277,8 +280,7 @@ yield {id_: facts} def content_fossology_license_add(self, licenses, conflict_update=False): - if not all(isinstance(x['id'], bytes) for x in licenses): - raise TypeError('identifiers must be bytes.') + check_id_types(licenses) self._licenses.add_merge(licenses, conflict_update, 'licenses') def content_fossology_license_get_range( @@ -293,8 +295,7 @@ yield from self._content_metadata.get(ids) def content_metadata_add(self, metadata, conflict_update=False): - if not all(isinstance(x['id'], bytes) for x in metadata): - raise TypeError('identifiers must be bytes.') + check_id_types(metadata) self._content_metadata.add(metadata, conflict_update) def revision_intrinsic_metadata_missing(self, metadata): @@ -304,8 +305,7 @@ yield from self._revision_intrinsic_metadata.get(ids) def revision_intrinsic_metadata_add(self, metadata, conflict_update=False): - if not all(isinstance(x['id'], bytes) for x in metadata): - raise TypeError('identifiers must be bytes.') + check_id_types(metadata) self._revision_intrinsic_metadata.add(metadata, conflict_update) def revision_intrinsic_metadata_delete(self, entries): diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py --- a/swh/indexer/tests/storage/test_storage.py +++ b/swh/indexer/tests/storage/test_storage.py @@ -10,6 +10,9 @@ from swh.model.hashutil import hash_to_bytes +from swh.indexer.storage.exc import ( + IndexerStorageArgumentException, DuplicateId, +) from swh.indexer.storage.interface import IndexerStorageInterface @@ -253,7 +256,7 @@ # when endpoint(storage, etype, 'add')([data_rev1]) - with pytest.raises(ValueError): + with pytest.raises(DuplicateId): endpoint(storage, etype, 'add')( [data_rev2, data_rev2], conflict_update=True) @@ -317,13 +320,12 @@ self, swh_indexer_storage): """mimetype_get_range call with wrong limit input should fail""" storage = swh_indexer_storage - with pytest.raises(ValueError) as e: + with pytest.raises(IndexerStorageArgumentException) as e: storage.content_mimetype_get_range( start=None, end=None, indexer_configuration_id=None, limit=None) - assert e.value.args == ( - 'Development error: limit should not be None',) + assert e.value.args == ('limit should not be None',) def test_generate_content_mimetype_get_range_no_limit( self, swh_indexer_storage_with_data): @@ -925,13 +927,12 @@ self, swh_indexer_storage_with_data): storage, data = swh_indexer_storage_with_data """license_get_range call with wrong limit input should fail""" - with pytest.raises(ValueError) as e: + with pytest.raises(IndexerStorageArgumentException) as e: storage.content_fossology_license_get_range( start=None, end=None, indexer_configuration_id=None, limit=None) - assert e.value.args == ( - 'Development error: limit should not be None',) + assert e.value.args == ('limit should not be None',) def test_generate_content_fossology_license_get_range_no_limit( self, swh_indexer_storage_with_data): @@ -1424,7 +1425,7 @@ # when storage.revision_intrinsic_metadata_add([metadata_rev]) - with pytest.raises(ValueError): + with pytest.raises(DuplicateId): storage.origin_intrinsic_metadata_add([ metadata_origin, metadata_origin])