Page Menu | Home | Software Heritage

D654.id2036.diff
No OneTemporary

D654.id2036.diff

diff --git a/.gitignore b/.gitignore
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,4 @@
/sql/createdb-stamp
/sql/filldb-stamp
.tox/
+.hypothesis/
\ No newline at end of file
diff --git a/Makefile.local b/Makefile.local
new file mode 100644
--- /dev/null
+++ b/Makefile.local
@@ -0,0 +1 @@
+TESTFLAGS=--hypothesis-profile=fast --disable-warnings -m property_based
diff --git a/conftest.py b/conftest.py
new file mode 100644
--- /dev/null
+++ b/conftest.py
@@ -0,0 +1,6 @@
+from hypothesis import settings
+
+# Define the test profiles. Full documentation is at:
+# https://hypothesis.readthedocs.io/en/latest/settings.html#settings-profiles
+settings.register_profile("fast", max_examples=5, deadline=5000)
+settings.register_profile("slow", max_examples=20, deadline=5000)
diff --git a/debian/control b/debian/control
--- a/debian/control
+++ b/debian/control
@@ -7,6 +7,7 @@
python3-all,
python3-chardet (>= 2.3.0~),
python3-click,
+ python3-hypothesis,
python3-pytest,
python3-pygments,
python3-magic,
diff --git a/requirements-test.txt b/requirements-test.txt
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1 +1,2 @@
pytest
+hypothesis
diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -110,6 +110,85 @@
for obj in db.content_mimetype_missing_from_list(mimetypes, cur):
yield obj[0]
+    def _content_get_range(self, content_type, start, end,
+                           indexer_configuration_id, limit=1000,
+                           db=None, cur=None):
+        """Retrieve ids of type content_type within range [start, end] bound
+        by limit.
+
+        Args:
+            **content_type** (str): content's type (a key of db.content_types)
+            **start** (bytes): Starting identifier range (expected smaller
+                than end)
+            **end** (bytes): Ending identifier range (expected larger
+                than start)
+            **indexer_configuration_id** (int): The tool used to index data
+            **limit** (int): Limit result (default to 1000)
+
+        Raises:
+            ValueError if:
+            - limit is None
+            - content_type is not a key of db.content_types
+
+        Returns:
+            a dict with keys:
+            - **ids** [bytes]: list of content ids within the range.
+            - **next** (Optional[bytes]): id at which the next range
+              starts, or None if no further id was returned
+
+        """
+        if limit is None:
+            raise ValueError('Development error: limit should not be None')
+        if content_type not in db.content_types:
+            err = 'Development error: Wrong type. Should be one of [%s]' % (
+                ','.join(db.content_types))
+            raise ValueError(err)
+
+        ids = []
+        next_id = None
+        # Ask for one row more than limit: if it shows up, it is the next id
+        for counter, obj in enumerate(db.content_get_range(
+                content_type, start, end, indexer_configuration_id,
+                limit=limit+1, cur=cur)):
+            _id = obj[0]
+            if counter >= limit:
+                next_id = _id
+                break
+            ids.append(_id)
+
+        return {
+            'ids': ids,
+            'next': next_id
+        }
+
+ @remote_api_endpoint('content_mimetype/range')
+ @db_transaction()
+ def content_mimetype_get_range(self, start, end, indexer_configuration_id,
+ limit=1000, db=None, cur=None):
+ """Retrieve mimetypes within range [start, end] bound by limit.
+
+ Args:
+ **start** (bytes): Starting identifier range (expected smaller
+ than end)
+ **end** (bytes): Ending identifier range (expected larger
+ than start)
+ **indexer_configuration_id** (int): The tool used to index data
+ **limit** (int): Limit result (default to 1000)
+
+ Raises:
+ ValueError if limit is None
+
+ Returns:
+ a dict with keys:
+ - **ids** [bytes]: list of content ids within the range.
+ - **next** (Optional[bytes]): id at which the next range
+ starts, or None if there is none
+
+ """
+ return self._content_get_range('mimetype', start, end,
+ indexer_configuration_id, limit=limit,
+ db=db, cur=cur)
+
@remote_api_endpoint('content_mimetype/add')
@db_transaction()
def content_mimetype_add(self, mimetypes, conflict_update=False, db=None,
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -113,6 +113,31 @@
((_id,) for _id in ids)
)
+ content_types = {
+ 'mimetype': 'content_mimetype',
+ 'language': 'content_language',
+ }
+
+ def content_get_range(self, content_type, start, end,
+ indexer_configuration_id, limit=1000, cur=None):
+ """Retrieve ids of contents indexed as content_type, within range
+ [start, end], ordered by id, bound by limit and associated to the
+ given indexer configuration id. The table name comes from the fixed
+ content_types mapping; an unknown content_type raises KeyError.
+ """
+ cur = self._cursor(cur)
+ table = self.content_types[content_type]
+ query = """select t.id
+ from %s t
+ inner join indexer_configuration ic
+ on t.indexer_configuration_id=ic.id
+ where ic.id=%%s and
+ %%s <= t.id and t.id <= %%s
+ order by t.indexer_configuration_id, t.id
+ limit %%s""" % table
+ cur.execute(query, (indexer_configuration_id, start, end, limit))
+ yield from cursor_to_bytes(cur)
+
def content_mimetype_get_from_list(self, ids, cur=None):
yield from self._get_from_list(
'content_mimetype', ids, self.content_mimetype_cols, cur=cur)
diff --git a/swh/indexer/tests/storage/__init__.py b/swh/indexer/tests/storage/__init__.py
--- a/swh/indexer/tests/storage/__init__.py
+++ b/swh/indexer/tests/storage/__init__.py
@@ -0,0 +1,89 @@
+# Copyright (C) 2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from os import path
+import swh.storage
+
+from hypothesis.strategies import (binary, composite, sets, one_of,
+ tuples, sampled_from)
+
+
+SQL_DIR = path.join(path.dirname(swh.indexer.__file__), 'sql')
+
+
+MIMETYPES = [
+ b'application/json',
+ b'application/octet-stream',
+ b'application/xml',
+ b'text/plain',
+]
+
+ENCODINGS = [
+ b'iso8859-1',
+ b'iso8859-15',
+ b'latin1',
+ b'utf-8',
+]
+
+
+def gen_content_id():
+ """Return a strategy generating raw ids of exactly 20 bytes.
+
+ """
+ return binary(min_size=20, max_size=20)
+
+
+def gen_mimetype():
+ """Return a strategy sampling one mimetype out of MIMETYPES.
+
+ """
+ return one_of(sampled_from(MIMETYPES))
+
+
+def gen_encoding():
+ """Return a strategy sampling one encoding out of ENCODINGS.
+
+ """
+ return one_of(sampled_from(ENCODINGS))
+
+
+@composite
+def gen_content_mimetypes(draw, *, min_size=0, max_size=100):
+    """Generate valid and consistent content_mimetypes.
+
+    Context: Test purposes
+
+    Args:
+        **draw** (callable): Used by hypothesis to generate data
+        **min_size** (int): Minimal number of elements to generate
+                            (default: 0)
+        **max_size** (int): Maximal number of elements to generate
+                            (default: 100)
+
+    Returns:
+        List of content_mimetypes (one per distinct content id) as
+        expected by the content_mimetype_add api endpoint.
+
+    """
+    # Draw the ids on their own: a set of (id, mimetype, encoding) tuples
+    # only guarantees distinct tuples, so the same id could appear twice.
+    _ids = draw(
+        sets(
+            gen_content_id(),
+            min_size=min_size, max_size=max_size
+        )
+    )
+
+    mimetypes = []
+    for content_id in _ids:
+        mimetype, encoding = draw(
+            tuples(gen_mimetype(), gen_encoding()))
+        mimetypes.append({
+            'id': content_id,
+            'mimetype': mimetype,
+            'encoding': encoding,
+            'indexer_configuration_id': 1,
+        })
+    return mimetypes
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -4,15 +4,16 @@
# See top-level LICENSE file for more information
import os
+import pytest
import unittest
+from hypothesis import given
+
from swh.model.hashutil import hash_to_bytes
from swh.indexer.storage import get_indexer_storage
from swh.core.tests.db_testing import SingleDbTestFixture
-from swh.indexer.tests import SQL_DIR
-
-import pytest
+from swh.indexer.tests.storage import SQL_DIR, gen_content_mimetypes
@pytest.mark.db
@@ -1613,6 +1614,103 @@
self.assertEqual(expected_tool, actual_tool)
+@pytest.mark.property_based
+class PropBasedTestStorage(BaseTestStorage, unittest.TestCase):
+ """Property-based tests of content_mimetype_get_range
+
+ """
+ def assert_mimetypes_ok(self, expected_mimetypes, actual_mimetypes,
+ keys_to_check={'id', 'mimetype', 'encoding'}):
+ """Assert that expected and actual mimetypes match on the given keys.
+
+ """
+ for k in keys_to_check:
+ expected_list = [c[k] for c in expected_mimetypes]
+ expected_list.sort()
+ actual_list = [c[k] for c in actual_mimetypes]
+ actual_list.sort()
+ self.assertEqual(actual_list, expected_list)
+
+ def test_generate_content_mimetype_get_range_limit_none(self):
+ """mimetype_get_range called with limit=None raises ValueError"""
+ with self.assertRaises(ValueError) as e:
+ self.storage.content_mimetype_get_range(
+ start=None, end=None, indexer_configuration_id=None,
+ limit=None)
+
+ self.assertEqual(e.exception.args, (
+ 'Development error: limit should not be None',))
+
+ @given(gen_content_mimetypes(min_size=1, max_size=4))
+ def test_generate_content_mimetype_get_range_no_limit(self, mimetypes):
+ """mimetype_get_range returns all content ids within range provided"""
+ self.reset_storage_tables()
+ # add mimetypes to storage
+ self.storage.content_mimetype_add(mimetypes)
+
+ # All ids added to the db, sorted
+ content_ids = [c['id'] for c in mimetypes]
+ content_ids.sort()
+
+ start = content_ids[0]
+ end = content_ids[-1]
+
+ # retrieve mimetypes
+ tool_id = mimetypes[0]['indexer_configuration_id']
+ actual_result = self.storage.content_mimetype_get_range(
+ start, end, indexer_configuration_id=tool_id)
+
+ actual_mimetypes = actual_result['ids']
+ actual_next = actual_result['next']
+
+ self.assertEqual(len(mimetypes), len(actual_mimetypes))
+ self.assertIsNone(actual_next)
+ self.assertEqual(content_ids, actual_mimetypes)
+
+ @given(gen_content_mimetypes(min_size=4, max_size=4))
+ def test_generate_content_mimetype_get_range_limit(self, mimetypes):
+ """mimetype_get_range paginates results if limit exceeded"""
+ self.reset_storage_tables()
+
+ # add mimetypes to storage
+ self.storage.content_mimetype_add(mimetypes)
+
+ # sorted list of the ids we expect back from storage
+ content_ids = [c['id'] for c in mimetypes]
+ content_ids.sort()
+ start = content_ids[0]
+ end = content_ids[-1]
+
+ # retrieve mimetypes limited to 3 results
+ limited_results = len(mimetypes) - 1
+ tool_id = mimetypes[0]['indexer_configuration_id']
+ actual_result = self.storage.content_mimetype_get_range(
+ start, end,
+ indexer_configuration_id=tool_id, limit=limited_results)
+
+ actual_mimetypes = actual_result['ids']
+ actual_next = actual_result['next']
+
+ self.assertEqual(limited_results, len(actual_mimetypes))
+ self.assertIsNotNone(actual_next)
+ self.assertEqual(actual_next, content_ids[-1])
+
+ expected_mimetypes = content_ids[:-1]
+ self.assertEqual(expected_mimetypes, actual_mimetypes)
+
+ # retrieve next part, starting at the last id (== actual_next)
+ actual_results2 = self.storage.content_mimetype_get_range(
+ start=end, end=end, indexer_configuration_id=tool_id)
+ actual_mimetypes2 = actual_results2['ids']
+ actual_next2 = actual_results2['next']
+
+ self.assertEqual(1, len(actual_mimetypes2))
+ self.assertIsNone(actual_next2)
+
+ expected_mimetypes2 = [content_ids[-1]]
+ self.assertEqual(expected_mimetypes2, actual_mimetypes2)
+
+
class IndexerTestStorage(CommonTestStorage, unittest.TestCase):
"""Running the tests locally.
diff --git a/tox.ini b/tox.ini
--- a/tox.ini
+++ b/tox.ini
@@ -7,7 +7,23 @@
pytest-cov
pifpaf
commands =
- pifpaf run postgresql -- pytest --cov=swh --cov-branch {posargs}
+ pifpaf run postgresql -- pytest --hypothesis-profile=fast --cov=swh --cov-branch {posargs}
+
+[testenv:py3-slow]
+deps =
+ .[testing]
+ pytest-cov
+ pifpaf
+commands =
+ pifpaf run postgresql -- pytest --hypothesis-profile=slow --cov=swh --cov-branch {posargs}
+
+[testenv:py3-prop]
+deps =
+ .[testing]
+ pytest-cov
+ pifpaf
+commands =
+ pifpaf run postgresql -- pytest --hypothesis-profile=fast -m property_based --disable-warnings
[testenv:flake8]
skip_install = true

File Metadata

Mime Type
text/plain
Expires
Mon, Apr 14, 1:08 AM (1 w, 1 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3217475

Event Timeline