diff --git a/swh/indexer/storage/__init__.py b/swh/indexer/storage/__init__.py
--- a/swh/indexer/storage/__init__.py
+++ b/swh/indexer/storage/__init__.py
@@ -456,6 +456,35 @@
                 cur=cur)
         db.content_fossology_license_add_from_temp(conflict_update, cur)
 
+    @remote_api_endpoint('content/fossology_license/range')
+    @db_transaction()
+    def content_fossology_license_get_range(
+            self, start, end, indexer_configuration_id,
+            limit=1000, db=None, cur=None):
+        """Retrieve licenses within range [start, end] bound by limit.
+
+        Args:
+            **start** (bytes): Starting identifier range (expected smaller
+                           than end)
+            **end** (bytes): Ending identifier range (expected larger
+                             than start)
+            **indexer_configuration_id** (int): The tool used to index data
+            **limit** (int): Limit result (default to 1000)
+
+        Raises:
+            ValueError: if limit is None
+
+        Returns:
+            a dict with keys:
+            - **ids** [bytes]: iterable of content ids within the range.
+            - **next** (Optional[bytes]): The next range of sha1 starts at
+                                          this sha1 if any
+
+        """
+        return self._content_get_range('fossology_license', start, end,
+                                       indexer_configuration_id, limit=limit,
+                                       db=db, cur=cur)
+
     @remote_api_endpoint('content_metadata/missing')
     @db_transaction_generator()
     def content_metadata_missing(self, metadata, db=None, cur=None):
diff --git a/swh/indexer/storage/db.py b/swh/indexer/storage/db.py
--- a/swh/indexer/storage/db.py
+++ b/swh/indexer/storage/db.py
@@ -115,7 +115,7 @@
 
     content_indexer_names = {
         'mimetype': 'content_mimetype',
-        'language': 'content_language',
+        'fossology_license': 'content_fossology_license',
     }
 
     def content_get_range(self, content_type, start, end,
diff --git a/swh/indexer/tests/storage/__init__.py b/swh/indexer/tests/storage/__init__.py
--- a/swh/indexer/tests/storage/__init__.py
+++ b/swh/indexer/tests/storage/__init__.py
@@ -42,6 +42,16 @@
     return one_of(sampled_from(ENCODINGS))
 
 
+def _init_content(uuid):
+    """Given a uuid, initialize a content
+
+    """
+    return {
+        'id': MultiHash.from_data(uuid.bytes, {'sha1'}).digest()['sha1'],
+        'indexer_configuration_id': 1,
+    }
+
+
 @composite
 def gen_content_mimetypes(draw, *, min_size=0, max_size=100):
     """Generate valid and consistent content_mimetypes.
@@ -73,11 +83,59 @@
 
     content_mimetypes = []
     for uuid, mimetype, encoding in _ids:
-        content_id = MultiHash.from_data(uuid.bytes, {'sha1'}).digest()['sha1']
         content_mimetypes.append({
-            'id': content_id,
+            **_init_content(uuid),
             'mimetype': mimetype,
             'encoding': encoding,
-            'indexer_configuration_id': 1,
         })
     return content_mimetypes
+
+
+FOSSOLOGY_LICENSES = [
+    b'3DFX',
+    b'BSD',
+    b'GPL',
+    b'Apache2',
+    b'MIT',
+]
+
+
+def gen_license():
+    return one_of(sampled_from(FOSSOLOGY_LICENSES))
+
+
+@composite
+def gen_content_fossology_licenses(draw, *, min_size=0, max_size=100):
+    """Generate valid and consistent content_fossology_licenses.
+
+    Context: Test purposes
+
+    Args:
+        **draw** (callable): Used by hypothesis to generate data
+        **min_size** (int): Minimal number of elements to generate
+                            (default: 0)
+        **max_size** (int): Maximal number of elements to generate
+                            (default: 100)
+
+    Returns:
+        List of content_fossology_licenses as expected by the
+        content_fossology_license_add api endpoint.
+
+    """
+    _ids = draw(
+        sets(
+            tuples(
+                uuids(),
+                gen_license(),
+            ),
+            min_size=min_size, max_size=max_size
+        )
+    )
+
+    content_licenses = []
+    for uuid, license in _ids:
+        content_licenses.append({
+            **_init_content(uuid),
+            'licenses': [license],
+        })
+    return content_licenses
diff --git a/swh/indexer/tests/storage/test_storage.py b/swh/indexer/tests/storage/test_storage.py
--- a/swh/indexer/tests/storage/test_storage.py
+++ b/swh/indexer/tests/storage/test_storage.py
@@ -13,7 +13,9 @@
 
 from swh.indexer.storage import get_indexer_storage
 from swh.core.tests.db_testing import SingleDbTestFixture
-from swh.indexer.tests.storage import SQL_DIR, gen_content_mimetypes
+from swh.indexer.tests.storage import (
+    SQL_DIR, gen_content_mimetypes, gen_content_fossology_licenses
+)
 
 
 @pytest.mark.db
@@ -1694,6 +1696,83 @@
         expected_mimetypes2 = [content_ids[-1]]
         self.assertEqual(expected_mimetypes2, actual_ids2)
 
+    def test_generate_content_fossology_license_get_range_limit_none(self):
+        """license_get_range call with wrong limit input should fail"""
+        with self.assertRaises(ValueError) as e:
+            self.storage.content_fossology_license_get_range(
+                start=None, end=None, indexer_configuration_id=None,
+                limit=None)
+
+        self.assertEqual(e.exception.args, (
+            'Development error: limit should not be None',))
+
+    @given(gen_content_fossology_licenses(min_size=1, max_size=4))
+    def test_generate_content_fossology_license_get_range_no_limit(
+            self, fossology_licenses):
+        """license_get_range returns licenses within range provided"""
+        self.reset_storage_tables()
+        # add fossology_licenses to storage
+        self.storage.content_fossology_license_add(fossology_licenses)
+
+        # All ids from the db
+        content_ids = sorted([c['id'] for c in fossology_licenses])
+
+        start = content_ids[0]
+        end = content_ids[-1]
+
+        # retrieve fossology_licenses
+        tool_id = fossology_licenses[0]['indexer_configuration_id']
+        actual_result = self.storage.content_fossology_license_get_range(
+            start, end, indexer_configuration_id=tool_id)
+
+        actual_ids = actual_result['ids']
+        actual_next = actual_result['next']
+
+        self.assertEqual(len(fossology_licenses), len(actual_ids))
+        self.assertIsNone(actual_next)
+        self.assertEqual(content_ids, actual_ids)
+
+    @given(gen_content_fossology_licenses(min_size=4, max_size=4))
+    def test_generate_fossology_license_get_range_limit(
+            self, fossology_licenses):
+        """fossology_license_get_range paginates results if limit exceeded"""
+        self.reset_storage_tables()
+
+        # add fossology_licenses to storage
+        self.storage.content_fossology_license_add(fossology_licenses)
+
+        # input the list of sha1s we want from storage
+        content_ids = sorted([c['id'] for c in fossology_licenses])
+        start = content_ids[0]
+        end = content_ids[-1]
+
+        # retrieve fossology_licenses limited to 3 results
+        limited_results = len(fossology_licenses) - 1
+        tool_id = fossology_licenses[0]['indexer_configuration_id']
+        actual_result = self.storage.content_fossology_license_get_range(
+            start, end,
+            indexer_configuration_id=tool_id, limit=limited_results)
+
+        actual_ids = actual_result['ids']
+        actual_next = actual_result['next']
+
+        self.assertEqual(limited_results, len(actual_ids))
+        self.assertIsNotNone(actual_next)
+        self.assertEqual(actual_next, content_ids[-1])
+
+        expected_fossology_licenses = content_ids[:-1]
+        self.assertEqual(expected_fossology_licenses, actual_ids)
+
+        # retrieve next part
+        actual_results2 = self.storage.content_fossology_license_get_range(
+            start=end, end=end, indexer_configuration_id=tool_id)
+        actual_ids2 = actual_results2['ids']
+        actual_next2 = actual_results2['next']
+
+        self.assertIsNone(actual_next2)
+        expected_fossology_licenses2 = [content_ids[-1]]
+        self.assertEqual(expected_fossology_licenses2, actual_ids2)
+
 
 class IndexerTestStorage(CommonTestStorage, unittest.TestCase):
     """Running the tests locally.