D654.id2036.diff
No One · Temporary
Actions

Size

13 KB

Subscribers

None

D654.id2036.diff
View Options

	diff --git a/.gitignore b/.gitignore
	--- a/.gitignore
	+++ b/.gitignore
	@@ -11,3 +11,4 @@
	/sql/createdb-stamp
	/sql/filldb-stamp
	.tox/
	+.hypothesis/
	\ No newline at end of file
	diff --git a/Makefile.local b/Makefile.local
	new file mode 100644
	--- /dev/null
	+++ b/Makefile.local
	@@ -0,0 +1 @@
	+TESTFLAGS=--hypothesis-profile=fast --disable-warnings -m property_based
	diff --git a/conftest.py b/conftest.py
	new file mode 100644
	--- /dev/null
	+++ b/conftest.py
	@@ -0,0 +1,6 @@
	+from hypothesis import settings
	+
	+# Profiles selectable through pytest's --hypothesis-profile option; see
	+# https://hypothesis.readthedocs.io/en/latest/settings.html#settings-profiles
	+settings.register_profile("slow", deadline=5000, max_examples=20)
	+settings.register_profile("fast", deadline=5000, max_examples=5)
	diff --git a/debian/control b/debian/control
	--- a/debian/control
	+++ b/debian/control
	@@ -7,6 +7,7 @@
	python3-all,
	python3-chardet (>= 2.3.0~),
	python3-click,
	+ python3-hypothesis,
	python3-pytest,
	python3-pygments,
	python3-magic,
	diff --git a/requirements-test.txt b/requirements-test.txt
	--- a/requirements-test.txt
	+++ b/requirements-test.txt
	@@ -1 +1,2 @@
	pytest
	+hypothesis
	diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
	--- a/swh/indexer/storage/__init__.py
	+++ b/swh/indexer/storage/__init__.py
	@@ -110,6 +110,85 @@
	for obj in db.content_mimetype_missing_from_list(mimetypes, cur):
	yield obj[0]

	+    def _content_get_range(self, content_type, start, end,
	+                           indexer_configuration_id, limit=1000,
	+                           db=None, cur=None):
	+        """Retrieve ids of type content_type within range [start, end] bound
	+        by limit.
	+
	+        Args:
	+            content_type (str): content's type (mimetype, language, etc...)
	+            start (bytes): Starting identifier range (expected smaller
	+                than end)
	+            end (bytes): Ending identifier range (expected larger
	+                than start)
	+            indexer_configuration_id (int): The tool used to index data
	+            limit (int): Limit result (default to 1000)
	+
	+        Raises:
	+            ValueError: when either
	+              - limit is None
	+              - content_type is not one of the keys of db.content_types
	+
	+        Returns:
	+            a dict with keys:
	+            - ids [bytes]: iterable of content ids within the range.
	+            - next (Optional[bytes]): The next range of sha1 starts at
	+              this sha1 if any
	+
	+        """
	+        if limit is None:
	+            raise ValueError('Development error: limit should not be None')
	+        if content_type not in db.content_types:
	+            err = 'Development error: Wrong type. Should be one of [%s]' % (
	+                ','.join(db.content_types))
	+            raise ValueError(err)
	+
	+        ids = []
	+        next_id = None
	+        # ask for limit + 1 rows: an extra row only serves to compute 'next'
	+        for counter, obj in enumerate(db.content_get_range(
	+                content_type, start, end, indexer_configuration_id,
	+                limit=limit+1, cur=cur)):
	+            _id = obj[0]
	+            if counter >= limit:
	+                next_id = _id
	+                break
	+            ids.append(_id)
	+
	+        return {
	+            'ids': ids,
	+            'next': next_id
	+        }
	+
	+    @remote_api_endpoint('content_mimetype/range')
	+    @db_transaction()
	+    def content_mimetype_get_range(self, start, end, indexer_configuration_id,
	+                                   limit=1000, db=None, cur=None):
	+        """List the ids of contents having a mimetype within [start, end].
	+
	+        Args:
	+            start (bytes): Lower bound of the identifier range (expected
	+                smaller than end)
	+            end (bytes): Upper bound of the identifier range (expected
	+                larger than start)
	+            indexer_configuration_id (int): The tool used to index data
	+            limit (int): Maximum number of ids returned (default: 1000)
	+
	+        Raises:
	+            ValueError: when limit is None
	+
	+        Returns:
	+            a dict with keys:
	+            - ids [bytes]: content ids found within the range
	+            - next (Optional[bytes]): sha1 at which the next range
	+              starts, None when the range is exhausted
	+
	+        """
	+        return self._content_get_range(
	+            content_type='mimetype', start=start, end=end, limit=limit,
	+            indexer_configuration_id=indexer_configuration_id, db=db, cur=cur)
	+
	@remote_api_endpoint('content_mimetype/add')
	@db_transaction()
	def content_mimetype_add(self, mimetypes, conflict_update=False, db=None,
	diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
	--- a/swh/indexer/storage/db.py
	+++ b/swh/indexer/storage/db.py
	@@ -113,6 +113,31 @@
	((_id,) for _id in ids)
	)

	+ content_types = {  # content type name -> name of the table to query
	+ 'mimetype': 'content_mimetype',
	+ 'language': 'content_language',
	+ }
	+
	+ def content_get_range(self, content_type, start, end,
	+ indexer_configuration_id, limit=1000, cur=None):
	+ """Yield the rows (id column only) of the table mapped to content_type
	+ whose id is within [start, end], both bounds inclusive, and whose
	+ indexer_configuration_id matches; ordered by id, at most limit rows.
	+ An unknown content_type raises KeyError on the content_types lookup.
	+ """
	+ cur = self._cursor(cur)
	+ table = self.content_types[content_type]
	+ query = """select t.id
	+ from %s t
	+ inner join indexer_configuration ic
	+ on t.indexer_configuration_id=ic.id
	+ where ic.id=%%s and
	+ %%s <= t.id and t.id <= %%s
	+ order by t.indexer_configuration_id, t.id
	+ limit %%s""" % table  # only the table name is interpolated here
	+ cur.execute(query, (indexer_configuration_id, start, end, limit))
	+ yield from cursor_to_bytes(cur)
	+
	def content_mimetype_get_from_list(self, ids, cur=None):
	yield from self._get_from_list(
	'content_mimetype', ids, self.content_mimetype_cols, cur=cur)
	diff --git a/swh/indexer/tests/storage/__init__.py b/swh/indexer/tests/storage/__init__.py
	--- a/swh/indexer/tests/storage/__init__.py
	+++ b/swh/indexer/tests/storage/__init__.py
	@@ -0,0 +1,89 @@
	+# Copyright (C) 2018 The Software Heritage developers
	+# See the AUTHORS file at the top-level directory of this distribution
	+# License: GNU General Public License version 3, or any later version
	+# See top-level LICENSE file for more information
	+
	+from os import path
	+import swh.indexer
	+
	+from hypothesis.strategies import (binary, composite, sets, one_of,
	+                                   tuples, sampled_from)
	+
	+
	+SQL_DIR = path.join(path.dirname(swh.indexer.__file__), 'sql')
	+
	+
	+MIMETYPES = [
	+    b'application/json',
	+    b'application/octet-stream',
	+    b'application/xml',
	+    b'text/plain',
	+]
	+
	+ENCODINGS = [
	+    b'iso8859-1',
	+    b'iso8859-15',
	+    b'latin1',
	+    b'utf-8',
	+]
	+
	+
	+def gen_content_id():
	+    """Build the strategy generating raw content ids (20 bytes each)."""
	+    id_size = 20
	+
	+    return binary(min_size=id_size, max_size=id_size)
	+
	+
	+def gen_mimetype():
	+    """Build the strategy picking one mimetype out of MIMETYPES."""
	+    known_mimetypes = sampled_from(MIMETYPES)
	+
	+    return one_of(known_mimetypes)
	+
	+
	+def gen_encoding():
	+    """Build the strategy picking one encoding out of ENCODINGS."""
	+    known_encodings = sampled_from(ENCODINGS)
	+
	+    return one_of(known_encodings)
	+
	+
	+@composite
	+def gen_content_mimetypes(draw, *, min_size=0, max_size=100):
	+    """Generate valid and consistent content_mimetypes.
	+
	+    Context: Test purposes
	+
	+    Args:
	+        draw (callable): Used by hypothesis to generate data
	+        min_size (int): Minimal number of elements to generate
	+            (default: 0)
	+        max_size (int): Maximal number of elements to generate
	+            (default: 100)
	+
	+    Returns:
	+        List of content_mimetypes as expected by the
	+        content_mimetype_add api endpoint. Every entry has a distinct
	+        'id'.
	+
	+    """
	+    # Uniqueness must hold on the id alone: a set of (id, mimetype,
	+    # encoding) tuples could hold the same id twice with different
	+    # metadata, yielding several rows for a single content.
	+    content_ids = draw(
	+        sets(gen_content_id(), min_size=min_size, max_size=max_size)
	+    )
	+
	+    mimetypes = []
	+    # sorted() keeps the draw order independent from set iteration order
	+    for content_id in sorted(content_ids):
	+        mimetype, encoding = draw(tuples(gen_mimetype(), gen_encoding()))
	+        mimetypes.append({
	+            'id': content_id,
	+            'mimetype': mimetype,
	+            'encoding': encoding,
	+            'indexer_configuration_id': 1,
	+        })
	+
	+    return mimetypes
	diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
	--- a/swh/indexer/tests/storage/test_storage.py
	+++ b/swh/indexer/tests/storage/test_storage.py
	@@ -4,15 +4,16 @@
	# See top-level LICENSE file for more information

	import os
	+import pytest
	import unittest

	+from hypothesis import given
	+
	from swh.model.hashutil import hash_to_bytes

	from swh.indexer.storage import get_indexer_storage
	from swh.core.tests.db_testing import SingleDbTestFixture
	-from swh.indexer.tests import SQL_DIR
	-
	-import pytest
	+from swh.indexer.tests.storage import SQL_DIR, gen_content_mimetypes


	@pytest.mark.db
	@@ -1613,6 +1614,103 @@
	self.assertEqual(expected_tool, actual_tool)


	+@pytest.mark.property_based
	+class PropBasedTestStorage(BaseTestStorage, unittest.TestCase):
	+    """Properties-based tests
	+
	+    """
	+    def assert_mimetypes_ok(self, expected_mimetypes, actual_mimetypes,
	+                            keys_to_check=('id', 'mimetype', 'encoding')):
	+        """Assert that a given list of contents matches on a given set of keys.
	+
	+        keys_to_check defaults to an immutable tuple so that the default
	+        cannot be altered from one call to the next.
	+        """
	+        for k in keys_to_check:
	+            expected_list = sorted(c[k] for c in expected_mimetypes)
	+            actual_list = sorted(c[k] for c in actual_mimetypes)
	+            self.assertEqual(actual_list, expected_list)
	+
	+    def test_generate_content_mimetype_get_range_limit_none(self):
	+        """mimetype_get_range call with wrong limit input should fail"""
	+        with self.assertRaises(ValueError) as e:
	+            self.storage.content_mimetype_get_range(
	+                start=None, end=None, indexer_configuration_id=None,
	+                limit=None)
	+
	+        self.assertEqual(e.exception.args, (
	+            'Development error: limit should not be None',))
	+
	+    @given(gen_content_mimetypes(min_size=1, max_size=4))
	+    def test_generate_content_mimetype_get_range_no_limit(self, mimetypes):
	+        """mimetype_get_range returns mimetypes within range provided"""
	+        self.reset_storage_tables()
	+        # add mimetypes to storage
	+        self.storage.content_mimetype_add(mimetypes)
	+
	+        # All ids from the db, in the order the endpoint returns them
	+        content_ids = sorted(c['id'] for c in mimetypes)
	+
	+        # the range bounds are inclusive, so this spans every stored id
	+        start = content_ids[0]
	+        end = content_ids[-1]
	+
	+        # retrieve mimetypes
	+        tool_id = mimetypes[0]['indexer_configuration_id']
	+        actual_result = self.storage.content_mimetype_get_range(
	+            start, end, indexer_configuration_id=tool_id)
	+
	+        actual_mimetypes = actual_result['ids']
	+        actual_next = actual_result['next']
	+
	+        self.assertEqual(len(mimetypes), len(actual_mimetypes))
	+        self.assertIsNone(actual_next)
	+        self.assertEqual(content_ids, actual_mimetypes)
	+
	+    @given(gen_content_mimetypes(min_size=4, max_size=4))
	+    def test_generate_content_mimetype_get_range_limit(self, mimetypes):
	+        """mimetype_get_range paginates results if limit exceeded"""
	+        self.reset_storage_tables()
	+
	+        # add mimetypes to storage
	+        self.storage.content_mimetype_add(mimetypes)
	+
	+        # input the list of sha1s we want from storage
	+        content_ids = sorted(c['id'] for c in mimetypes)
	+        start = content_ids[0]
	+        end = content_ids[-1]
	+
	+        # retrieve mimetypes limited to 3 results: one less than what is
	+        # stored, so that a next page has to be announced
	+        limited_results = len(mimetypes) - 1
	+        tool_id = mimetypes[0]['indexer_configuration_id']
	+        actual_result = self.storage.content_mimetype_get_range(
	+            start, end,
	+            indexer_configuration_id=tool_id, limit=limited_results)
	+
	+        actual_mimetypes = actual_result['ids']
	+        actual_next = actual_result['next']
	+
	+        self.assertEqual(limited_results, len(actual_mimetypes))
	+        self.assertIsNotNone(actual_next)
	+        self.assertEqual(actual_next, content_ids[-1])
	+
	+        expected_mimetypes = content_ids[:-1]
	+        self.assertEqual(expected_mimetypes, actual_mimetypes)
	+
	+        # retrieve next part, resuming from the id announced as 'next'
	+        actual_results2 = self.storage.content_mimetype_get_range(
	+            start=actual_next, end=end, indexer_configuration_id=tool_id)
	+        actual_mimetypes2 = actual_results2['ids']
	+        actual_next2 = actual_results2['next']
	+
	+        self.assertEqual(1, len(actual_mimetypes2))
	+        self.assertIsNone(actual_next2)
	+
	+        expected_mimetypes2 = [content_ids[-1]]
	+        self.assertEqual(expected_mimetypes2, actual_mimetypes2)
	+
	+
	class IndexerTestStorage(CommonTestStorage, unittest.TestCase):
	"""Running the tests locally.

	diff --git a/tox.ini b/tox.ini
	--- a/tox.ini
	+++ b/tox.ini
	@@ -7,7 +7,23 @@
	pytest-cov
	pifpaf
	commands =
	- pifpaf run postgresql -- pytest --cov=swh --cov-branch {posargs}
	+ pifpaf run postgresql -- pytest --hypothesis-profile=fast --cov=swh --cov-branch {posargs}
	+
	+[testenv:py3-slow]
	+deps =
	+ .[testing]
	+ pytest-cov
	+ pifpaf
	+commands =
	+ pifpaf run postgresql -- pytest --hypothesis-profile=slow --cov=swh --cov-branch {posargs}
	+
	+[testenv:py3-prop]
	+deps =
	+ .[testing]
	+ pytest-cov
	+ pifpaf
	+commands =
	+ pifpaf run postgresql -- pytest --hypothesis-profile=fast -m property_based --disable-warnings

	[testenv:flake8]
	skip_install = true

File Metadata

Mime Type: text/plain
Expires: Mon, Apr 14, 1:08 AM (1 w, 1 d ago)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 3217475

D654.id2036.diff · No One · Temporary · Actions

D654.id2036.diff · View Options

File Metadata

Event Timeline

D654.id2036.diff
No One · Temporary
Actions

D654.id2036.diff
View Options