Enforce bytes content identifiers in the indexers and in-memory storage.

- indexers and the in-memory indexer storage raise TypeError on non-bytes ids
- the in-memory indexer storage gains content_mimetype_get_range
- range-indexer tests populate storages via fill_storage/fill_obj_storage

diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py
--- a/swh/indexer/fossology_license.py
+++ b/swh/indexer/fossology_license.py
@@ -104,6 +104,8 @@
             - indexer_configuration_id (int): tool used to compute the output
 
         """
+        if not isinstance(id, bytes):
+            raise TypeError('identifier must be bytes, not %r.' % (id,))
         content_path = self.write_to_temp(
             filename=hashutil.hash_to_hex(id),  # use the id as pathname
             data=data)
diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -405,11 +405,14 @@
             bytes: Identifier of contents to index.
 
         """
+        if not isinstance(start, bytes) or not isinstance(end, bytes):
+            raise TypeError('identifiers must be bytes, not %r and %r.' %
+                            (start, end))
         while start:
             result = self.storage.content_get_range(start, end)
             contents = result['contents']
             for c in contents:
-                _id = c['sha1']
+                _id = hashutil.hash_to_bytes(c['sha1'])
                 if _id in indexed:
                     continue
                 yield _id
@@ -435,6 +438,11 @@
                                  hashutil.hash_to_hex(sha1))
                 continue
             res = self.index(sha1, raw_content, **kwargs)
             if res:
+                # validate only real results: a falsy res has no 'id' to check
+                if not isinstance(res['id'], bytes):
+                    raise TypeError(
+                        '%r.index should return ids as bytes, not %r' %
+                        (self.__class__.__name__, res['id']))
                 yield res
 
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -191,6 +191,31 @@
         """
         yield from self._mimetypes.missing(mimetypes)
 
+    def content_mimetype_get_range(
+            self, start, end, indexer_configuration_id, limit=1000):
+        """Retrieve mimetypes within range [start, end] bound by limit.
+
+        Args:
+            **start** (bytes): Starting identifier range (expected smaller
+                               than end)
+            **end** (bytes): Ending identifier range (expected larger
+                             than start)
+            **indexer_configuration_id** (int): The tool used to index data
+            **limit** (int): Limit result (default to 1000)
+
+        Raises:
+            ValueError for limit to None
+
+        Returns:
+            a dict with keys:
+            - **ids** [bytes]: iterable of content ids within the range.
+            - **next** (Optional[bytes]): The next range of sha1 starts at
+                                          this sha1 if any
+
+        """
+        return self._mimetypes.get_range(
+            start, end, indexer_configuration_id, limit)
+
     def content_mimetype_add(self, mimetypes, conflict_update=False):
         """Add mimetypes not present in storage.
 
@@ -207,6 +232,8 @@
                 default)
 
         """
+        if not all(isinstance(x['id'], bytes) for x in mimetypes):
+            raise TypeError('identifiers must be bytes.')
         self._mimetypes.add(mimetypes, conflict_update)
 
     def content_mimetype_get(self, ids, db=None, cur=None):
@@ -281,6 +308,8 @@
                 results
 
         """
+        if not all(isinstance(x['id'], bytes) for x in ctags):
+            raise TypeError('identifiers must be bytes.')
         self._content_ctags.add_merge(ctags, conflict_update, 'ctags')
 
     def content_ctags_search(self, expression,
@@ -351,6 +380,8 @@
             list: content_license entries which failed due to unknown licenses
 
         """
+        if not all(isinstance(x['id'], bytes) for x in licenses):
+            raise TypeError('identifiers must be bytes.')
         self._licenses.add_merge(licenses, conflict_update, 'licenses')
 
     def content_fossology_license_get_range(
@@ -425,6 +456,8 @@
               or skip duplicates (false, the default)
 
         """
+        if not all(isinstance(x['id'], bytes) for x in metadata):
+            raise TypeError('identifiers must be bytes.')
         self._content_metadata.add(metadata, conflict_update)
 
     def revision_metadata_missing(self, metadata):
@@ -473,6 +506,8 @@
               or skip duplicates (false, the default)
 
         """
+        if not all(isinstance(x['id'], bytes) for x in metadata):
+            raise TypeError('identifiers must be bytes.')
         self._revision_metadata.add(metadata, conflict_update)
 
     def indexer_configuration_add(self, tools):
diff --git a/swh/indexer/tests/storage/test_in_memory.py b/swh/indexer/tests/storage/test_in_memory.py
--- a/swh/indexer/tests/storage/test_in_memory.py
+++ b/swh/indexer/tests/storage/test_in_memory.py
@@ -63,15 +63,3 @@
     @pytest.mark.xfail
     def test_indexer_configuration_metadata_get(self):
         pass
-
-    @pytest.mark.xfail
-    def test_generate_content_mimetype_get_range_limit_none(self):
-        pass
-
-    @pytest.mark.xfail
-    def test_generate_content_mimetype_get_range_no_limit(self, mimetypes):
-        pass
-
-    @pytest.mark.xfail
-    def test_generate_content_mimetype_get_range_limit(self, mimetypes):
-        pass
diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py
--- a/swh/indexer/tests/test_fossology_license.py
+++ b/swh/indexer/tests/test_fossology_license.py
@@ -4,7 +4,6 @@
 # See top-level LICENSE file for more information
 
 import unittest
-import logging
 
 from unittest.mock import patch
 
@@ -14,10 +13,9 @@
 )
 
 from swh.indexer.tests.test_utils import (
-    MockObjStorage, BasicMockStorage, BasicMockIndexerStorage,
     SHA1_TO_LICENSES, CommonContentIndexerTest, CommonContentIndexerRangeTest,
     CommonIndexerWithErrorsTest, CommonIndexerNoTool, NoDiskIndexer,
-    BASE_TEST_CONFIG
+    BASE_TEST_CONFIG, fill_storage, fill_obj_storage
 )
 
 
@@ -91,6 +89,7 @@
         yield from self.idx_storage.content_ctags_get(ids)
 
     def setUp(self):
+        super().setUp()
         self.indexer = FossologyLicenseTestIndexer()
         self.idx_storage = self.indexer.idx_storage
 
@@ -137,15 +136,6 @@
             'write_batch_size': 100,
         }
 
-    def prepare(self):
-        super().prepare()
-        self.idx_storage = BasicMockIndexerStorage()
-        self.log = logging.getLogger('swh.indexer')
-        # this hardcodes some contents, will use this to setup the storage
-        self.objstorage = MockObjStorage()
-        contents = [{'sha1': c_id} for c_id in self.objstorage]
-        self.storage = BasicMockStorage(contents)
-
 
 class TestFossologyLicenseRangeIndexer(
         CommonContentIndexerRangeTest, unittest.TestCase):
@@ -158,12 +148,10 @@
     """
 
     def setUp(self):
+        super().setUp()
         self.indexer = FossologyLicenseRangeIndexerTest()
-        # will play along with the objstorage's mocked contents for now
-        self.contents = sorted(self.indexer.objstorage)
-        # FIXME: leverage swh.objstorage.in_memory_storage's
-        # InMemoryObjStorage, swh.storage.tests's gen_contents, and
-        # hypothesis to generate data to actually run indexer on those
+        fill_storage(self.indexer.storage)
+        fill_obj_storage(self.indexer.objstorage)
 
         self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
         self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py
--- a/swh/indexer/tests/test_mimetype.py
+++ b/swh/indexer/tests/test_mimetype.py
@@ -12,10 +12,9 @@
 )
 
 from swh.indexer.tests.test_utils import (
-    MockObjStorage, BasicMockStorage, BasicMockIndexerStorage,
     CommonContentIndexerTest, CommonContentIndexerRangeTest,
     CommonIndexerWithErrorsTest, CommonIndexerNoTool,
-    BASE_TEST_CONFIG
+    BASE_TEST_CONFIG, fill_storage, fill_obj_storage
 )
 
 
@@ -121,15 +120,6 @@
             'write_batch_size': 100,
         }
 
-    def prepare(self):
-        super().prepare()
-        self.idx_storage = BasicMockIndexerStorage()
-        # this hardcodes some contents, will use this to setup the storage
-        self.objstorage = MockObjStorage()
-        # sync objstorage and storage
-        contents = [{'sha1': c_id} for c_id in self.objstorage]
-        self.storage = BasicMockStorage(contents)
-
 
 class TestMimetypeRangeIndexer(
         CommonContentIndexerRangeTest, unittest.TestCase):
@@ -142,12 +132,10 @@
     """
 
     def setUp(self):
+        super().setUp()
         self.indexer = MimetypeRangeIndexerTest()
-        # will play along with the objstorage's mocked contents for now
-        self.contents = sorted(self.indexer.objstorage)
-        # FIXME: leverage swh.objstorage.in_memory_storage's
-        # InMemoryObjStorage, swh.storage.tests's gen_contents, and
-        # hypothesis to generate data to actually run indexer on those
+        fill_storage(self.indexer.storage)
+        fill_obj_storage(self.indexer.objstorage)
 
         self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
         self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py
--- a/swh/indexer/tests/test_utils.py
+++ b/swh/indexer/tests/test_utils.py
@@ -4,10 +4,12 @@
 # See top-level LICENSE file for more information
 
 import datetime
+import hashlib
+import random
 
 from swh.objstorage.exc import ObjNotFoundError
 from swh.model import hashutil
-from swh.model.hashutil import hash_to_bytes
+from swh.model.hashutil import hash_to_bytes, hash_to_hex
 
 from swh.indexer.storage import INDEXER_CFG_KEY
 
@@ -488,6 +490,22 @@
         'id': DIRECTORY_ID,
         'entries': DIRECTORY,
     }])
+    for (obj_id, content) in OBJ_STORAGE_DATA.items():
+        # TODO: use MultiHash
+        if hasattr(hashlib, 'blake2s'):
+            blake2s256 = hashlib.blake2s(content, digest_size=32).digest()
+        else:
+            # fallback for Python <3.6
+            blake2s256 = bytes([random.randint(0, 255) for _ in range(32)])
+        storage.content_add([{
+            'data': content,
+            'length': len(content),
+            'status': 'visible',
+            'sha1': hash_to_bytes(obj_id),
+            'sha1_git': hash_to_bytes(obj_id),
+            'sha256': hashlib.sha256(content).digest(),
+            'blake2s256': blake2s256
+        }])
 
 
 class MockStorage():
@@ -664,6 +682,8 @@
         return self.indexer.idx_storage.state
 
     def assert_results_ok(self, sha1s, expected_results=None):
+        sha1s = [sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1)
+                 for sha1 in sha1s]
         actual_results = self.get_indexer_results(sha1s)
 
         if expected_results is None:
@@ -712,15 +732,22 @@
     """Allows to factorize tests on range indexer.
 
     """
+    def setUp(self):
+        self.contents = sorted(OBJ_STORAGE_DATA)
+
     def assert_results_ok(self, start, end, actual_results,
                           expected_results=None):
         if expected_results is None:
             expected_results = self.expected_results
 
+        actual_results = list(actual_results)
         for indexed_data in actual_results:
             _id = indexed_data['id']
-            self.assertEqual(indexed_data, expected_results[_id])
-            self.assertTrue(start <= _id and _id <= end)
+            assert isinstance(_id, bytes)
+            indexed_data = indexed_data.copy()
+            indexed_data['id'] = hash_to_hex(indexed_data['id'])
+            self.assertEqual(indexed_data, expected_results[hash_to_hex(_id)])
+            self.assertTrue(start <= _id <= end)
             _tool_id = indexed_data['indexer_configuration_id']
             self.assertEqual(_tool_id, self.indexer.tool['id'])
 
@@ -728,7 +755,8 @@
         """Indexing contents without existing data results in indexed data
 
         """
-        start, end = [self.contents[0], self.contents[2]]  # output hex ids
+        _start, _end = [self.contents[0], self.contents[2]]  # output hex ids
+        start, end = map(hashutil.hash_to_bytes, (_start, _end))
         # given
         actual_results = list(self.indexer._index_contents(
             start, end, indexed={}))
@@ -739,12 +767,13 @@
         """Indexing contents with existing data results in less indexed data
 
         """
-        start, end = [self.contents[0], self.contents[2]]  # output hex ids
+        _start, _end = [self.contents[0], self.contents[2]]  # output hex ids
+        start, end = map(hashutil.hash_to_bytes, (_start, _end))
         data_indexed = [self.id0, self.id2]
 
         # given
         actual_results = self.indexer._index_contents(
-            start, end, indexed=set(data_indexed))
+            start, end, indexed=set(map(hash_to_bytes, data_indexed)))
 
         # craft the expected results
         expected_results = self.expected_results.copy()
@@ -758,7 +787,8 @@
         """Optimal indexing should result in indexed data
 
         """
-        start, end = [self.contents[0], self.contents[2]]  # output hex ids
+        _start, _end = [self.contents[0], self.contents[2]]  # output hex ids
+        start, end = map(hashutil.hash_to_bytes, (_start, _end))
         # given
         actual_results = self.indexer.run(start, end)
 
@@ -785,8 +815,9 @@
 
     def test_generate_content_get_no_result(self):
         """No result indexed returns False"""
-        start, end = ['0000000000000000000000000000000000000000',
-                      '0000000000000000000000000000000000000001']
+        _start, _end = ['0000000000000000000000000000000000000000',
+                        '0000000000000000000000000000000000000001']
+        start, end = map(hashutil.hash_to_bytes, (_start, _end))
         # given
         actual_results = self.indexer.run(
             start, end, incremental=False)