diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py --- a/swh/indexer/fossology_license.py +++ b/swh/indexer/fossology_license.py @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2017 The Software Heritage developers +# Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -8,50 +8,14 @@ from swh.model import hashutil -from .indexer import ContentIndexer, DiskIndexer +from .indexer import ContentIndexer, ContentRangeIndexer, DiskIndexer -def compute_license(path, log=None): - """Determine license from file at path. +class MixinFossologyLicenseIndexer: + """Mixin fossology license indexer. - Args: - path: filepath to determine the license - - Returns: - A dict with the following keys: - - licenses ([str]): associated detected licenses to path - - path (bytes): content filepath - - tool (str): tool used to compute the output - - """ - try: - properties = subprocess.check_output(['nomossa', path], - universal_newlines=True) - if properties: - res = properties.rstrip().split(' contains license(s) ') - licenses = res[1].split(',') - - return { - 'licenses': licenses, - 'path': path, - } - except subprocess.CalledProcessError: - if log: - from os import path as __path - log.exception('Problem during license detection for sha1 %s' % - __path.basename(path)) - return { - 'licenses': [], - 'path': path, - } - - -class ContentFossologyLicenseIndexer(ContentIndexer, DiskIndexer): - """Indexer in charge of: - - filtering out content already indexed - - reading content from objstorage per the content's id (sha1) - - computing {license, encoding} from that content - - store result in storage + See :class:`ContentFossologyLicenseIndexer` and + :class:`FossologyLicenseRangeIndexer` """ ADDITIONAL_CONFIG = { @@ -72,22 +36,45 @@ self.working_directory = 
self.config['workdir'] self.tool = self.tools[0] - def filter(self, ids): - """Filter out known sha1s and return only missing ones. + def compute_license(self, path, log=None): + """Determine license from file at path. + + Args: + path: filepath to determine the license + + Returns: + A dict with the following keys: + - licenses ([str]): associated detected licenses to path + - path (bytes): content filepath + - tool (str): tool used to compute the output """ - yield from self.idx_storage.content_fossology_license_missing(( - { - 'id': sha1, - 'indexer_configuration_id': self.tool['id'], - } for sha1 in ids - )) + try: + properties = subprocess.check_output(['nomossa', path], + universal_newlines=True) + if properties: + res = properties.rstrip().split(' contains license(s) ') + licenses = res[1].split(',') + + return { + 'licenses': licenses, + 'path': path, + } + except subprocess.CalledProcessError: + if log: + from os import path as __path + log.exception('Problem during license detection for sha1 %s' % + __path.basename(path)) + return { + 'licenses': [], + 'path': path, + } def index(self, id, data): """Index sha1s' content and store result. 
Args: - sha1 (bytes): content's identifier + id (bytes): content's identifier raw_content (bytes): raw content in bytes Returns: @@ -97,13 +84,14 @@ - path (bytes): path """ - filename = hashutil.hash_to_hex(id) + if isinstance(id, str): + id = hashutil.hash_to_hex(id) content_path = self.write_to_temp( - filename=filename, + filename=id, data=data) try: - properties = compute_license(path=content_path, log=self.log) + properties = self.compute_license(path=content_path, log=self.log) properties.update({ 'id': id, 'indexer_configuration_id': self.tool['id'], @@ -130,11 +118,67 @@ results, conflict_update=(policy_update == 'update-dups')) +class ContentFossologyLicenseIndexer( + MixinFossologyLicenseIndexer, DiskIndexer, ContentIndexer): + """Indexer in charge of: + - filtering out content already indexed + - reading content from objstorage per the content's id (sha1) + - computing {license, path} from that content + - store result in storage + + """ + def filter(self, ids): + """Filter out known sha1s and return only missing ones. + + """ + yield from self.idx_storage.content_fossology_license_missing(( + { + 'id': sha1, + 'indexer_configuration_id': self.tool['id'], + } for sha1 in ids + )) + + +class FossologyLicenseRangeIndexer( + MixinFossologyLicenseIndexer, DiskIndexer, ContentRangeIndexer): + """FossologyLicense Range Indexer working on range of content identifiers. + + It: + - filters out the non textual content + - (optionally) filters out content already indexed (cf :callable:`range`) + - reads content from objstorage per the content's id (sha1) + - computes {licenses, path} from that content + - stores result in storage + + """ + def indexed_contents_in_range(self, start, end): + """Retrieve indexed content id within range [start, end].
+ + Args: + **start** (bytes): Starting bound from range identifier + **end** (bytes): End range identifier + + Yields: + Content identifier (bytes) present in the range [start, end] + + """ + while start: + result = self.idx_storage.content_fossology_license_get_range( + start, end, self.tool['id']) + contents = result['ids'] + for _id in contents: + yield _id + start = result['next'] + if start is None: + break + + @click.command(help='Compute license for path using tool') @click.option('--tool', default='nomossa', help="Path to tool") @click.option('--path', required=1, help="Path to execute index on") def main(tool, path): - print(compute_license(tool, path)) + indexer = ContentFossologyLicenseIndexer() + print(indexer.compute_license(path)) if __name__ == '__main__': diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py new file mode 100644 --- /dev/null +++ b/swh/indexer/tests/test_fossology_license.py @@ -0,0 +1,227 @@ +# Copyright (C) 2017-2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import unittest +import logging + +from swh.indexer.fossology_license import ( + ContentFossologyLicenseIndexer, FossologyLicenseRangeIndexer +) + +from swh.indexer.tests.test_utils import ( + MockObjStorage, BasicMockStorage, BasicMockIndexerStorage, + SHA1_TO_LICENSES, IndexerRangeTest +) + + +class NoDiskIndexer: + """Mixin to override the DiskIndexer behavior avoiding side-effects in + tests. + + """ + + def write_to_temp(self, filename, data): # noop + return filename + + def cleanup(self, content_path): # noop + return None + + +class InjectLicenseIndexer: + """Override license computations.
+ + """ + def compute_license(self, path, log=None): + """path is the content identifier + + """ + return { + 'licenses': SHA1_TO_LICENSES.get(path) + } + + +class FossologyLicenseTestIndexer( + NoDiskIndexer, InjectLicenseIndexer, ContentFossologyLicenseIndexer): + """Specific mimetype whose configuration is enough to satisfy the + indexing tests. + + """ + def prepare(self): + self.config = { + 'tools': { + 'name': 'nomos', + 'version': '3.1.0rc2-31-ga2cbb8c', + 'configuration': { + 'command_line': 'nomossa ', + }, + }, + } + self.idx_storage = BasicMockIndexerStorage() + self.log = logging.getLogger('swh.indexer') + self.objstorage = MockObjStorage() + self.tools = self.register_tools(self.config['tools']) + self.tool = self.tools[0] + + +class FossologyLicenseIndexerUnknownToolTestStorage( + FossologyLicenseTestIndexer): + """Specific fossology license indexer whose configuration is not + enough to satisfy the indexing checks + + """ + def prepare(self): + super().prepare() + self.tools = None + + +class TestFossologyLicenseIndexerWithErrors(unittest.TestCase): + def test_wrong_unknown_configuration_tool(self): + """Indexer with unknown configuration tool should fail the check""" + with self.assertRaisesRegex(ValueError, 'Tools None is unknown'): + FossologyLicenseIndexerUnknownToolTestStorage() + + +class TestFossologyLicenseIndexer(unittest.TestCase): + """Fossology license tests. 
+ + """ + def setUp(self): + self.indexer = FossologyLicenseTestIndexer() + + def test_index_no_update(self): + """Index sha1s results in new computed licenses + + """ + id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' + id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15' + sha1s = [id0, id1] + + # when + self.indexer.run(sha1s, policy_update='ignore-dups') + + # then + expected_results = [{ + 'id': id0, + 'indexer_configuration_id': 10, + 'licenses': SHA1_TO_LICENSES[id0], + }, { + 'id': id1, + 'indexer_configuration_id': 10, + 'licenses': SHA1_TO_LICENSES[id1], + }] + + self.assertFalse(self.indexer.idx_storage.conflict_update) + self.assertEqual(expected_results, self.indexer.idx_storage.state) + + def test_index_update(self): + """Index sha1s results in new computed licenses + + """ + id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' + id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15' + id2 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709' # empty content + sha1s = [id0, id1, id2] + + # when + self.indexer.run(sha1s, policy_update='update-dups') + + # then + expected_results = [{ + 'id': id0, + 'indexer_configuration_id': 10, + 'licenses': SHA1_TO_LICENSES[id0], + }, { + 'id': id1, + 'indexer_configuration_id': 10, + 'licenses': SHA1_TO_LICENSES[id1], + }, { + 'id': id2, + 'indexer_configuration_id': 10, + 'licenses': SHA1_TO_LICENSES[id2], + }] + + self.assertTrue(self.indexer.idx_storage.conflict_update) + self.assertEqual(expected_results, self.indexer.idx_storage.state) + + def test_index_one_unknown_sha1(self): + """Only existing contents are indexed + + """ + # given + id0 = '688a5ef812c53907562fe379d4b3851e69c7cb15' + sha1s = [id0, + '799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown + '800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown + + # when + self.indexer.run(sha1s, policy_update='update-dups') + + # then + expected_results = [{ + 'id': id0, + 'indexer_configuration_id': 10, + 'licenses': SHA1_TO_LICENSES[id0], + }] + + 
self.assertTrue(self.indexer.idx_storage.conflict_update) + self.assertEqual(expected_results, self.indexer.idx_storage.state) + + +class FossologyLicenseRangeIndexerTest( + NoDiskIndexer, InjectLicenseIndexer, FossologyLicenseRangeIndexer): + """Testing the range indexer on fossology license. + + """ + def prepare(self): + self.config = { + 'tools': { + 'name': 'nomos', + 'version': '3.1.0rc2-31-ga2cbb8c', + 'configuration': { + 'command_line': 'nomossa ', + }, + }, + 'write_batch_size': 100, + } + self.idx_storage = BasicMockIndexerStorage() + self.log = logging.getLogger('swh.indexer') + # this hardcodes some contents, will use this to setup the storage + self.objstorage = MockObjStorage() + # sync objstorage and storage + contents = [{'sha1': c_id} for c_id in self.objstorage] + self.storage = BasicMockStorage(contents) + self.tools = self.register_tools(self.config['tools']) + self.tool = self.tools[0] + + +class TestFossologyLicenseRangeIndexer(IndexerRangeTest, unittest.TestCase): + def setUp(self): + self.indexer = FossologyLicenseRangeIndexerTest() + # will play along with the objstorage's mocked contents for now + self.contents = sorted(self.indexer.objstorage) + # FIXME: leverage swh.objstorage.in_memory_storage's + # InMemoryObjStorage, swh.storage.tests's gen_contents, and + # hypothesis to generate data to actually run indexer on those + + self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' + self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069' + self.id2 = '103bc087db1d26afc3a0283f38663d081e9b01e6' + self.expected_results = { + self.id0: { + 'id': self.id0, + 'indexer_configuration_id': 10, + 'licenses': SHA1_TO_LICENSES[self.id0] + }, + self.id1: { + 'id': self.id1, + 'indexer_configuration_id': 10, + 'licenses': SHA1_TO_LICENSES[self.id1] + }, + self.id2: { + 'id': self.id2, + 'indexer_configuration_id': 10, + 'licenses': SHA1_TO_LICENSES[self.id2] + } + } diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py --- 
a/swh/indexer/tests/test_mimetype.py +++ b/swh/indexer/tests/test_mimetype.py @@ -10,90 +10,9 @@ ContentMimetypeIndexer, MimetypeRangeIndexer ) -from swh.indexer.tests.test_utils import MockObjStorage -from swh.model import hashutil - - -class _MockStorage(): - """In memory implementation to fake the content_get_range api. - - FIXME: To remove when the actual in-memory lands. - - """ - contents = [] - - def __init__(self, contents): - self.contents = contents - - def content_get_range(self, start, end, limit=1000): - # to make input test data consilient with actual runtime the - # other way of doing properly things would be to rewrite all - # tests (that's another task entirely so not right now) - if isinstance(start, bytes): - start = hashutil.hash_to_hex(start) - if isinstance(end, bytes): - end = hashutil.hash_to_hex(end) - results = [] - _next_id = None - counter = 0 - for c in self.contents: - _id = c['sha1'] - if start <= _id and _id <= end: - results.append(c) - if counter >= limit: - break - counter += 1 - - return { - 'contents': results, - 'next': _next_id - } - - -class _MockIndexerStorage(): - """Mock storage to simplify reading indexers' outputs. - - """ - state = [] - - def content_mimetype_add(self, mimetypes, conflict_update=None): - self.state = mimetypes - self.conflict_update = conflict_update - - def content_mimetype_get_range(self, start, end, indexer_configuration_id, - limit=1000): - """Basic in-memory implementation (limit is unused). 
- - """ - # to make input test data consilient with actual runtime the - # other way of doing properly things would be to rewrite all - # tests (that's another task entirely so not right now) - if isinstance(start, bytes): - start = hashutil.hash_to_hex(start) - if isinstance(end, bytes): - end = hashutil.hash_to_hex(end) - results = [] - _next = None - counter = 0 - for m in self.state: - _id = m['id'] - _tool_id = m['indexer_configuration_id'] - if (start <= _id and _id <= end and - _tool_id == indexer_configuration_id): - results.append(_id) - if counter >= limit: - break - counter += 1 - - return { - 'ids': results, - 'next': _next - } - - def indexer_configuration_add(self, tools): - return [{ - 'id': 10, - }] +from swh.indexer.tests.test_utils import ( + MockObjStorage, BasicMockStorage, BasicMockIndexerStorage, IndexerRangeTest +) class MimetypeTestIndexer(ContentMimetypeIndexer): @@ -112,7 +31,7 @@ }, }, } - self.idx_storage = _MockIndexerStorage() + self.idx_storage = BasicMockIndexerStorage() self.log = logging.getLogger('swh.indexer') self.objstorage = MockObjStorage() self.tools = self.register_tools(self.config['tools']) @@ -236,18 +155,19 @@ }, 'write_batch_size': 100, } - self.idx_storage = _MockIndexerStorage() + self.idx_storage = BasicMockIndexerStorage() self.log = logging.getLogger('swh.indexer') # this hardcodes some contents, will use this to setup the storage self.objstorage = MockObjStorage() # sync objstorage and storage contents = [{'sha1': c_id} for c_id in self.objstorage] - self.storage = _MockStorage(contents) + self.storage = BasicMockStorage(contents) self.tools = self.register_tools(self.config['tools']) self.tool = self.tools[0] -class TestMimetypeRangeIndexer(unittest.TestCase): +class TestMimetypeRangeIndexer(IndexerRangeTest, unittest.TestCase): + """Range Mimetype Indexer tests on """ def setUp(self): self.indexer = MimetypeRangeIndexerTest() # will play along with the objstorage's mocked contents for now @@ -256,103 +176,23 @@ 
# InMemoryObjStorage, swh.storage.tests's gen_contents, and # hypothesis to generate data to actually run indexer on those + self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' + self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069' + self.id2 = '103bc087db1d26afc3a0283f38663d081e9b01e6' self.expected_results = { - '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': { + self.id0: { 'encoding': b'us-ascii', - 'id': '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', + 'id': self.id0, 'indexer_configuration_id': 10, 'mimetype': b'text/plain'}, - '02fb2c89e14f7fab46701478c83779c7beb7b069': { + self.id1: { 'encoding': b'us-ascii', - 'id': '02fb2c89e14f7fab46701478c83779c7beb7b069', + 'id': self.id1, 'indexer_configuration_id': 10, 'mimetype': b'text/x-python'}, - '103bc087db1d26afc3a0283f38663d081e9b01e6': { + self.id2: { 'encoding': b'us-ascii', - 'id': '103bc087db1d26afc3a0283f38663d081e9b01e6', + 'id': self.id2, 'indexer_configuration_id': 10, 'mimetype': b'text/plain'} } - - def assert_mimetypes_ok(self, start, end, actual_results, - expected_results=None): - if expected_results is None: - expected_results = self.expected_results - - for mimetype in actual_results: - _id = mimetype['id'] - self.assertEqual(mimetype, expected_results[_id]) - self.assertTrue(start <= _id and _id <= end) - _tool_id = mimetype['indexer_configuration_id'] - self.assertEqual(_tool_id, self.indexer.tool['id']) - - def test__index_contents(self): - """Indexing contents without existing data results in indexed data - - """ - start, end = [self.contents[0], self.contents[2]] # output hex ids - # given - actual_results = list(self.indexer._index_contents( - start, end, indexed={})) - - self.assert_mimetypes_ok(start, end, actual_results) - - def test__index_contents_with_indexed_data(self): - """Indexing contents with existing data results in less indexed data - - """ - start, end = [self.contents[0], self.contents[2]] # output hex ids - data_indexed = [ - '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', - 
'103bc087db1d26afc3a0283f38663d081e9b01e6' - ] - - # given - actual_results = self.indexer._index_contents( - start, end, indexed=set(data_indexed)) - - # craft the expected results - expected_results = self.expected_results.copy() - for already_indexed_key in data_indexed: - expected_results.pop(already_indexed_key) - - self.assert_mimetypes_ok( - start, end, actual_results, expected_results) - - def test_generate_content_mimetype_get(self): - """Optimal indexing should result in indexed data - - """ - start, end = [self.contents[0], self.contents[2]] # output hex ids - # given - actual_results = self.indexer.run(start, end) - - # then - self.assertTrue(actual_results) - - def test_generate_content_mimetype_get_input_as_bytes(self): - """Optimal indexing should result in indexed data - - Input are in bytes here. - - """ - _start, _end = [self.contents[0], self.contents[2]] # output hex ids - start, end = map(hashutil.hash_to_bytes, (_start, _end)) - - # given - actual_results = self.indexer.run( # checks the bytes input this time - start, end, skip_existing=False) # no data so same result - - # then - self.assertTrue(actual_results) - - def test_generate_content_mimetype_get_no_result(self): - """No result indexed returns False""" - start, end = ['0000000000000000000000000000000000000000', - '0000000000000000000000000000000000000001'] - # given - actual_results = self.indexer.run( - start, end, incremental=False) - - # then - self.assertFalse(actual_results) diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py --- a/swh/indexer/tests/test_utils.py +++ b/swh/indexer/tests/test_utils.py @@ -1,10 +1,11 @@ -# Copyright (C) 2017 The Software Heritage developers +# Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.objstorage.exc import 
ObjNotFoundError +from swh.model import hashutil ORIGINS = [ { @@ -124,6 +125,15 @@ } +SHA1_TO_LICENSES = { + '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': ['GPL'], + '02fb2c89e14f7fab46701478c83779c7beb7b069': ['Apache2.0'], + '103bc087db1d26afc3a0283f38663d081e9b01e6': ['MIT'], + '688a5ef812c53907562fe379d4b3851e69c7cb15': ['AGPL'], + 'da39a3ee5e6b4b0d3255bfef95601890afd80709': [], +} + + class MockObjStorage: """Mock an swh-objstorage objstorage with predefined contents. @@ -398,3 +408,193 @@ 'status': None, 'sha256': None }] + + +class BasicMockStorage(): + """In memory implementation to fake the content_get_range api. + + FIXME: To remove when the actual in-memory lands. + + """ + contents = [] + + def __init__(self, contents): + self.contents = contents + + def content_get_range(self, start, end, limit=1000): + # to make input test data consilient with actual runtime the + # other way of doing properly things would be to rewrite all + # tests (that's another task entirely so not right now) + if isinstance(start, bytes): + start = hashutil.hash_to_hex(start) + if isinstance(end, bytes): + end = hashutil.hash_to_hex(end) + results = [] + _next_id = None + counter = 0 + for c in self.contents: + _id = c['sha1'] + if start <= _id and _id <= end: + results.append(c) + if counter >= limit: + break + counter += 1 + + return { + 'contents': results, + 'next': _next_id + } + + +class BasicMockIndexerStorage(): + """Mock Indexer storage to simplify reading indexers' outputs. + + """ + state = [] + + def _internal_add(self, data, conflict_update=None): + """All content indexer have the same structure. So reuse `data` as the + same data. It's either mimetype, language, + fossology_license, etc... 
+ + """ + self.state = data + self.conflict_update = conflict_update + + def content_mimetype_add(self, data, conflict_update=None): + self._internal_add(data, conflict_update=conflict_update) + + def content_fossology_license_add(self, data, conflict_update=None): + self._internal_add(data, conflict_update=conflict_update) + + def _internal_get_range(self, start, end, + indexer_configuration_id, limit=1000): + """Same logic as _internal_add, we retrieve indexed data given an + identifier. So the code here does not change even though + the underlying data does. + + """ + # to make input test data consilient with actual runtime the + # other way of doing properly things would be to rewrite all + # tests (that's another task entirely so not right now) + if isinstance(start, bytes): + start = hashutil.hash_to_hex(start) + if isinstance(end, bytes): + end = hashutil.hash_to_hex(end) + results = [] + _next = None + counter = 0 + for m in self.state: + _id = m['id'] + _tool_id = m['indexer_configuration_id'] + if (start <= _id and _id <= end and + _tool_id == indexer_configuration_id): + results.append(_id) + if counter >= limit: + break + counter += 1 + + return { + 'ids': results, + 'next': _next + } + + def content_mimetype_get_range( + self, start, end, indexer_configuration_id, limit=1000): + return self._internal_get_range( + start, end, indexer_configuration_id, limit=limit) + + def content_fossology_license_get_range( + self, start, end, indexer_configuration_id, limit=1000): + return self._internal_get_range( + start, end, indexer_configuration_id, limit=limit) + + def indexer_configuration_add(self, tools): + return [{ + 'id': 10, + }] + + +class IndexerRangeTest: + """Allows to factorize tests on range indexer. 
+ + """ + def assert_results_ok(self, start, end, actual_results, + expected_results=None): + if expected_results is None: + expected_results = self.expected_results + + for indexed_data in actual_results: + _id = indexed_data['id'] + self.assertEqual(indexed_data, expected_results[_id]) + self.assertTrue(start <= _id and _id <= end) + _tool_id = indexed_data['indexer_configuration_id'] + self.assertEqual(_tool_id, self.indexer.tool['id']) + + def test__index_contents(self): + """Indexing contents without existing data results in indexed data + + """ + start, end = [self.contents[0], self.contents[2]] # output hex ids + # given + actual_results = list(self.indexer._index_contents( + start, end, indexed={})) + + self.assert_results_ok(start, end, actual_results) + + def test__index_contents_with_indexed_data(self): + """Indexing contents with existing data results in less indexed data + + """ + start, end = [self.contents[0], self.contents[2]] # output hex ids + data_indexed = [self.id0, self.id2] + + # given + actual_results = self.indexer._index_contents( + start, end, indexed=set(data_indexed)) + + # craft the expected results + expected_results = self.expected_results.copy() + for already_indexed_key in data_indexed: + expected_results.pop(already_indexed_key) + + self.assert_results_ok( + start, end, actual_results, expected_results) + + def test_generate_content_mimetype_get(self): + """Optimal indexing should result in indexed data + + """ + start, end = [self.contents[0], self.contents[2]] # output hex ids + # given + actual_results = self.indexer.run(start, end) + + # then + self.assertTrue(actual_results) + + def test_generate_content_mimetype_get_input_as_bytes(self): + """Optimal indexing should result in indexed data + + Input are in bytes here. 
+ + """ + _start, _end = [self.contents[0], self.contents[2]] # output hex ids + start, end = map(hashutil.hash_to_bytes, (_start, _end)) + + # given + actual_results = self.indexer.run( # checks the bytes input this time + start, end, skip_existing=False) # no data so same result + + # then + self.assertTrue(actual_results) + + def test_generate_content_mimetype_get_no_result(self): + """No result indexed returns False""" + start, end = ['0000000000000000000000000000000000000000', + '0000000000000000000000000000000000000001'] + # given + actual_results = self.indexer.run( + start, end, incremental=False) + + # then + self.assertFalse(actual_results)