diff --git a/.gitignore b/.gitignore
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,4 @@
 /sql/createdb-stamp
 /sql/filldb-stamp
 .tox/
+.hypothesis/
diff --git a/Makefile.local b/Makefile.local
new file mode 100644
--- /dev/null
+++ b/Makefile.local
@@ -0,0 +1 @@
+TESTFLAGS=--hypothesis-profile=fast
diff --git a/conftest.py b/conftest.py
new file mode 100644
--- /dev/null
+++ b/conftest.py
@@ -0,0 +1,6 @@
+from hypothesis import settings
+
+# define tests profile. Full documentation is at:
+# https://hypothesis.readthedocs.io/en/latest/settings.html#settings-profiles
+settings.register_profile("fast", max_examples=5, deadline=5000)
+settings.register_profile("slow", max_examples=20, deadline=5000)
diff --git a/debian/control b/debian/control
--- a/debian/control
+++ b/debian/control
@@ -7,6 +7,7 @@
                python3-all,
                python3-chardet (>= 2.3.0~),
                python3-click,
+               python3-hypothesis,
                python3-pytest,
                python3-pygments,
                python3-magic,
diff --git a/requirements-test.txt b/requirements-test.txt
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1 +1,2 @@
 pytest
+hypothesis
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -110,6 +110,87 @@
         for obj in db.content_mimetype_missing_from_list(mimetypes, cur):
             yield obj[0]
 
+    def _content_get_range(self, content_type, start, end,
+                           indexer_configuration_id, limit=1000,
+                           db=None, cur=None):
+        """Retrieve ids of type content_type within range [start, end] bound
+        by limit.
+
+        Args:
+            **content_type** (str): content's type (mimetype, language, etc...)
+            **start** (bytes): Starting identifier range (expected smaller
+                               than end)
+            **end** (bytes): Ending identifier range (expected larger
+                             than start)
+            **indexer_configuration_id** (int): The tool used to index data
+            **limit** (int): Limit result (default to 1000)
+
+        Raises:
+            ValueError: when
+            - limit is None
+            - content_type is not a key of db.content_indexer_names
+
+        Returns:
+            a dict with keys:
+            - **ids** [bytes]: list of content ids within the range.
+            - **next** (Optional[bytes]): The next range of sha1 starts at
+                                          this sha1 if any
+
+        """
+        if limit is None:
+            raise ValueError('Development error: limit should not be None')
+        if content_type not in db.content_indexer_names:
+            err = 'Development error: Wrong type. Should be one of [%s]' % (
+                ','.join(db.content_indexer_names))
+            raise ValueError(err)
+
+        ids = []
+        next_id = None
+        # ask for one extra row: if it comes back, it is the start of the
+        # next page and is reported as 'next' instead of being returned
+        for counter, obj in enumerate(db.content_get_range(
+                content_type, start, end, indexer_configuration_id,
+                limit=limit+1, cur=cur)):
+            _id = obj[0]
+            if counter >= limit:
+                next_id = _id
+                break
+
+            ids.append(_id)
+
+        return {
+            'ids': ids,
+            'next': next_id
+        }
+
+    @remote_api_endpoint('content_mimetype/range')
+    @db_transaction()
+    def content_mimetype_get_range(self, start, end, indexer_configuration_id,
+                                   limit=1000, db=None, cur=None):
+        """Retrieve mimetypes within range [start, end] bound by limit.
+
+        Args:
+            **start** (bytes): Starting identifier range (expected smaller
+                               than end)
+            **end** (bytes): Ending identifier range (expected larger
+                             than start)
+            **indexer_configuration_id** (int): The tool used to index data
+            **limit** (int): Limit result (default to 1000)
+
+        Raises:
+            ValueError: if limit is None
+
+        Returns:
+            a dict with keys:
+            - **ids** [bytes]: list of content ids within the range.
+            - **next** (Optional[bytes]): The next range of sha1 starts at
+                                          this sha1 if any
+
+        """
+        return self._content_get_range('mimetype', start, end,
+                                       indexer_configuration_id, limit=limit,
+                                       db=db, cur=cur)
+
     @remote_api_endpoint('content_mimetype/add')
     @db_transaction()
     def content_mimetype_add(self, mimetypes, conflict_update=False, db=None,
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -113,6 +113,33 @@
             ((_id,) for _id in ids)
         )
 
+    content_indexer_names = {
+        'mimetype': 'content_mimetype',
+        'language': 'content_language',
+    }
+
+    def content_get_range(self, content_type, start, end,
+                          indexer_configuration_id, limit=1000, cur=None):
+        """Retrieve contents with content_type, within range [start, end]
+        bound by limit and associated to the given indexer
+        configuration id.
+
+        """
+        cur = self._cursor(cur)
+        # table name is looked up in content_indexer_names (KeyError on an
+        # unknown type), never taken verbatim from the caller
+        table = self.content_indexer_names[content_type]
+        query = """select t.id
+                   from %s t
+                   inner join indexer_configuration ic
+                   on t.indexer_configuration_id=ic.id
+                   where ic.id=%%s and
+                         %%s <= t.id and t.id <= %%s
+                   order by t.indexer_configuration_id, t.id
+                   limit %%s""" % table
+        cur.execute(query, (indexer_configuration_id, start, end, limit))
+        yield from cursor_to_bytes(cur)
+
     def content_mimetype_get_from_list(self, ids, cur=None):
         yield from self._get_from_list(
             'content_mimetype', ids, self.content_mimetype_cols, cur=cur)
diff --git a/swh/indexer/tests/storage/__init__.py b/swh/indexer/tests/storage/__init__.py
--- a/swh/indexer/tests/storage/__init__.py
+++ b/swh/indexer/tests/storage/__init__.py
@@ -0,0 +1,92 @@
+# Copyright (C) 2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from os import path
+import swh.indexer
+
+from hypothesis.strategies import (binary, composite, lists,
+                                   sampled_from, tuples)
+
+
+SQL_DIR = path.join(path.dirname(swh.indexer.__file__), 'sql')
+
+
+MIMETYPES = [
+    b'application/json',
+    b'application/octet-stream',
+    b'application/xml',
+    b'text/plain',
+]
+
+ENCODINGS = [
+    b'iso8859-1',
+    b'iso8859-15',
+    b'latin1',
+    b'utf-8',
+]
+
+
+def gen_content_id():
+    """Generate raw id strategy.
+
+    """
+    return binary(min_size=20, max_size=20)
+
+
+def gen_mimetype():
+    """Generate one mimetype strategy.
+
+    """
+    return sampled_from(MIMETYPES)
+
+
+def gen_encoding():
+    """Generate one encoding strategy.
+
+    """
+    return sampled_from(ENCODINGS)
+
+
+@composite
+def gen_content_mimetypes(draw, *, min_size=0, max_size=100):
+    """Generate valid and consistent content_mimetypes.
+
+    Context: Test purposes
+
+    Args:
+        **draw** (callable): Used by hypothesis to generate data
+        **min_size** (int): Minimal number of elements to generate
+                            (default: 0)
+        **max_size** (int): Maximal number of elements to generate
+                            (default: 100)
+
+    Returns:
+        List of content_mimetypes as expected by the
+        content_mimetype_add api endpoint.
+
+    """
+    # uniqueness is enforced on the id alone: deduplicating whole tuples
+    # would let the same id through twice with another mimetype/encoding
+    entries = draw(
+        lists(
+            tuples(
+                gen_content_id(),
+                gen_mimetype(),
+                gen_encoding()
+            ),
+            min_size=min_size, max_size=max_size,
+            unique_by=lambda entry: entry[0]
+        )
+    )
+
+    content_mimetypes = []
+    for content_id, mimetype, encoding in entries:
+        content_mimetypes.append({
+            'id': content_id,
+            'mimetype': mimetype,
+            'encoding': encoding,
+            'indexer_configuration_id': 1,
+        })
+    return content_mimetypes
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -4,15 +4,16 @@
 # See top-level LICENSE file for more information
 
 import os
+import pytest
 import unittest
 
+from hypothesis import given
+
 from swh.model.hashutil import hash_to_bytes
 
 from swh.indexer.storage import get_indexer_storage
 from swh.core.tests.db_testing import SingleDbTestFixture
-from swh.indexer.tests import SQL_DIR
-
-import pytest
+from swh.indexer.tests.storage import SQL_DIR, gen_content_mimetypes
 
 
 @pytest.mark.db
@@ -1613,6 +1614,89 @@
         self.assertEqual(expected_tool, actual_tool)
 
 
+@pytest.mark.property_based
+class PropBasedTestStorage(BaseTestStorage, unittest.TestCase):
+    """Properties-based tests
+
+    """
+    def test_generate_content_mimetype_get_range_limit_none(self):
+        """mimetype_get_range call with wrong limit input should fail"""
+        with self.assertRaises(ValueError) as e:
+            self.storage.content_mimetype_get_range(
+                start=None, end=None, indexer_configuration_id=None,
+                limit=None)
+
+        self.assertEqual(e.exception.args, (
+            'Development error: limit should not be None',))
+
+    @given(gen_content_mimetypes(min_size=1, max_size=4))
+    def test_generate_content_mimetype_get_range_no_limit(self, mimetypes):
+        """mimetype_get_range returns mimetypes within range provided"""
+        self.reset_storage_tables()
+        # add mimetypes to storage
+        self.storage.content_mimetype_add(mimetypes)
+
+        # All ids from the db
+        content_ids = [c['id'] for c in mimetypes]
+        content_ids.sort()
+
+        start = content_ids[0]
+        end = content_ids[-1]
+
+        # retrieve mimetypes
+        tool_id = mimetypes[0]['indexer_configuration_id']
+        actual_result = self.storage.content_mimetype_get_range(
+            start, end, indexer_configuration_id=tool_id)
+
+        actual_ids = actual_result['ids']
+        actual_next = actual_result['next']
+
+        self.assertEqual(len(mimetypes), len(actual_ids))
+        self.assertIsNone(actual_next)
+        self.assertEqual(content_ids, actual_ids)
+
+    @given(gen_content_mimetypes(min_size=4, max_size=4))
+    def test_generate_content_mimetype_get_range_limit(self, mimetypes):
+        """mimetype_get_range paginates results if limit exceeded"""
+        self.reset_storage_tables()
+
+        # add mimetypes to storage
+        self.storage.content_mimetype_add(mimetypes)
+
+        # input the list of sha1s we want from storage
+        content_ids = [c['id'] for c in mimetypes]
+        content_ids.sort()
+        start = content_ids[0]
+        end = content_ids[-1]
+
+        # retrieve mimetypes limited to 3 results
+        limited_results = len(mimetypes) - 1
+        tool_id = mimetypes[0]['indexer_configuration_id']
+        actual_result = self.storage.content_mimetype_get_range(
+            start, end,
+            indexer_configuration_id=tool_id, limit=limited_results)
+
+        actual_ids = actual_result['ids']
+        actual_next = actual_result['next']
+
+        self.assertEqual(limited_results, len(actual_ids))
+        self.assertIsNotNone(actual_next)
+        self.assertEqual(actual_next, content_ids[-1])
+
+        expected_mimetypes = content_ids[:-1]
+        self.assertEqual(expected_mimetypes, actual_ids)
+
+        # retrieve next part, resuming from the id the first call handed back
+        actual_results2 = self.storage.content_mimetype_get_range(
+            start=actual_next, end=end, indexer_configuration_id=tool_id)
+        actual_ids2 = actual_results2['ids']
+        actual_next2 = actual_results2['next']
+
+        self.assertIsNone(actual_next2)
+        expected_mimetypes2 = [content_ids[-1]]
+        self.assertEqual(expected_mimetypes2, actual_ids2)
+
+
 class IndexerTestStorage(CommonTestStorage, unittest.TestCase):
     """Running the tests locally.
 
diff --git a/tox.ini b/tox.ini
--- a/tox.ini
+++ b/tox.ini
@@ -7,7 +7,23 @@
   pytest-cov
   pifpaf
 commands =
-  pifpaf run postgresql -- pytest --cov=swh --cov-branch {posargs}
+  pifpaf run postgresql -- pytest --hypothesis-profile=fast --cov=swh --cov-branch {posargs}
+
+[testenv:py3-slow]
+deps =
+  .[testing]
+  pytest-cov
+  pifpaf
+commands =
+  pifpaf run postgresql -- pytest --hypothesis-profile=slow --cov=swh --cov-branch {posargs}
+
+[testenv:py3-prop]
+deps =
+  .[testing]
+  pytest-cov
+  pifpaf
+commands =
+  pifpaf run postgresql -- pytest --hypothesis-profile=fast -m property_based --disable-warnings
 
 [testenv:flake8]
 skip_install = true