diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py --- a/swh/indexer/fossology_license.py +++ b/swh/indexer/fossology_license.py @@ -1,4 +1,4 @@ -# Copyright (C) 2016-2017 The Software Heritage developers +# Copyright (C) 2016-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -8,50 +8,14 @@ from swh.model import hashutil -from .indexer import ContentIndexer, DiskIndexer +from .indexer import ContentIndexer, ContentRangeIndexer, DiskIndexer -def compute_license(path, log=None): - """Determine license from file at path. +class MixinFossologyLicenseIndexer: + """Mixin fossology license indexer. - Args: - path: filepath to determine the license - - Returns: - A dict with the following keys: - - licenses ([str]): associated detected licenses to path - - path (bytes): content filepath - - tool (str): tool used to compute the output - - """ - try: - properties = subprocess.check_output(['nomossa', path], - universal_newlines=True) - if properties: - res = properties.rstrip().split(' contains license(s) ') - licenses = res[1].split(',') - - return { - 'licenses': licenses, - 'path': path, - } - except subprocess.CalledProcessError: - if log: - from os import path as __path - log.exception('Problem during license detection for sha1 %s' % - __path.basename(path)) - return { - 'licenses': [], - 'path': path, - } - - -class ContentFossologyLicenseIndexer(ContentIndexer, DiskIndexer): - """Indexer in charge of: - - filtering out content already indexed - - reading content from objstorage per the content's id (sha1) - - computing {license, encoding} from that content - - store result in storage + See :class:`ContentFossologyLicenseIndexer` and + :class:`FossologyLicenseRangeIndexer` """ ADDITIONAL_CONFIG = { @@ -72,22 +36,45 @@ self.working_directory = self.config['workdir'] self.tool = self.tools[0] - def filter(self, ids): - """Filter out known sha1s and return only missing ones. + def compute_license(self, path, log=None): + """Determine license from file at path. + + Args: + path: filepath to determine the license + + Returns: + A dict with the following keys: + - licenses ([str]): associated detected licenses to path + - path (bytes): content filepath + - tool (str): tool used to compute the output """ - yield from self.idx_storage.content_fossology_license_missing(( - { - 'id': sha1, - 'indexer_configuration_id': self.tool['id'], - } for sha1 in ids - )) + try: + properties = subprocess.check_output(['nomossa', path], + universal_newlines=True) + if properties: + res = properties.rstrip().split(' contains license(s) ') + licenses = res[1].split(',') + + return { + 'licenses': licenses, + 'path': path, + } + except subprocess.CalledProcessError: + if log: + from os import path as __path + log.exception('Problem during license detection for sha1 %s' % + __path.basename(path)) + return { + 'licenses': [], + 'path': path, + } def index(self, id, data): """Index sha1s' content and store result. Args: - sha1 (bytes): content's identifier + id (bytes): content's identifier raw_content (bytes): raw content in bytes Returns: @@ -97,13 +84,14 @@ - path (bytes): path """ - filename = hashutil.hash_to_hex(id) + if isinstance(id, str): + id = hashutil.hash_to_hex(id) content_path = self.write_to_temp( - filename=filename, + filename=id, data=data) try: - properties = compute_license(path=content_path, log=self.log) + properties = self.compute_license(path=content_path, log=self.log) properties.update({ 'id': id, 'indexer_configuration_id': self.tool['id'], @@ -130,11 +118,67 @@ results, conflict_update=(policy_update == 'update-dups')) +class ContentFossologyLicenseIndexer( + MixinFossologyLicenseIndexer, DiskIndexer, ContentIndexer): + """Indexer in charge of: + - filtering out content already indexed + - reading content from objstorage per the content's id (sha1) + - computing {license, encoding} from that content + - store result in storage + + """ + def filter(self, ids): + """Filter out known sha1s and return only missing ones. + + """ + yield from self.idx_storage.content_fossology_license_missing(( + { + 'id': sha1, + 'indexer_configuration_id': self.tool['id'], + } for sha1 in ids + )) + + +class FossologyLicenseRangeIndexer( + MixinFossologyLicenseIndexer, DiskIndexer, ContentRangeIndexer): + """FossologyLicense Range Indexer working on range of content identifiers. + + It: + - filters out the non textual content + - (optionally) filters out content already indexed (cf :callable:`range`) + - reads content from objstorage per the content's id (sha1) + - computes {mimetype, encoding} from that content + - stores result in storage + + """ + def indexed_contents_in_range(self, start, end): + """Retrieve indexed content id within range [start, end]. + + Args + **start** (bytes): Starting bound from range identifier + **end** (bytes): End range identifier + + Yields: + Content identifier (bytes) present in the range [start, end] + + """ + while start: + result = self.idx_storage.content_fossology_license_get_range( + start, end, self.tool['id']) + contents = result['ids'] + for _id in contents: + yield _id + start = result['next'] + if start is None: + break + + @click.command(help='Compute license for path using tool') @click.option('--tool', default='nomossa', help="Path to tool") @click.option('--path', required=1, help="Path to execute index on") def main(tool, path): - print(compute_license(tool, path)) + indexer = ContentFossologyLicenseIndexer() + print(indexer.compute_license(tool, path)) if __name__ == '__main__': diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py new file mode 100644 --- /dev/null +++ b/swh/indexer/tests/test_fossology_license.py @@ -0,0 +1,183 @@ +# Copyright (C) 2017-2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import unittest +import logging + +from swh.indexer.fossology_license import ( + ContentFossologyLicenseIndexer, FossologyLicenseRangeIndexer +) + +from swh.indexer.tests.test_utils import ( + MockObjStorage, BasicMockStorage, BasicMockIndexerStorage, + SHA1_TO_LICENSES, CommonContentIndexerTest, CommonContentIndexerRangeTest +) + + +class NoDiskIndexer: + """Mixin to override the DiskIndexer behavior avoiding side-effects in + tests. + + """ + + def write_to_temp(self, filename, data): # noop + return filename + + def cleanup(self, content_path): # noop + return None + + +class InjectLicenseIndexer: + """Override license computations. + + """ + def compute_license(self, path, log=None): + """path is the content identifier + + """ + return { + 'licenses': SHA1_TO_LICENSES.get(path) + } + + +class FossologyLicenseTestIndexer( + NoDiskIndexer, InjectLicenseIndexer, ContentFossologyLicenseIndexer): + """Specific fossology license whose configuration is enough to satisfy + the indexing checks. + + """ + def prepare(self): + self.config = { + 'tools': { + 'name': 'nomos', + 'version': '3.1.0rc2-31-ga2cbb8c', + 'configuration': { + 'command_line': 'nomossa ', + }, + }, + } + self.idx_storage = BasicMockIndexerStorage() + self.log = logging.getLogger('swh.indexer') + self.objstorage = MockObjStorage() + self.tools = self.register_tools(self.config['tools']) + self.tool = self.tools[0] + + +class FossologyLicenseIndexerUnknownToolTestStorage( + FossologyLicenseTestIndexer): + """Specific fossology license indexer whose configuration is not + enough to satisfy the indexing checks + + """ + def prepare(self): + super().prepare() + self.tools = None + + +class TestFossologyLicenseIndexerWithErrors(unittest.TestCase): + def test_wrong_unknown_configuration_tool(self): + """Indexer with unknown configuration tool should fail the check""" + with self.assertRaisesRegex(ValueError, 'Tools None is unknown'): + FossologyLicenseIndexerUnknownToolTestStorage() + + +class TestFossologyLicenseIndexer(CommonContentIndexerTest, unittest.TestCase): + """Language indexer test scenarios: + + - Known sha1s in the input list have their data indexed + - Unknown sha1 in the input list are not indexed + + """ + def setUp(self): + self.indexer = FossologyLicenseTestIndexer() + + self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' + self.id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15' + self.id2 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709' # empty content + # then + self.expected_results = { + self.id0: { + 'id': self.id0, + 'indexer_configuration_id': 10, + 'licenses': SHA1_TO_LICENSES[self.id0], + }, + self.id1: { + 'id': self.id1, + 'indexer_configuration_id': 10, + 'licenses': SHA1_TO_LICENSES[self.id1], + }, + self.id2: { + 'id': self.id2, + 'indexer_configuration_id': 10, + 'licenses': SHA1_TO_LICENSES[self.id2], + } + } + + +class FossologyLicenseRangeIndexerTest( + NoDiskIndexer, InjectLicenseIndexer, FossologyLicenseRangeIndexer): + """Testing the range indexer on fossology license. + + """ + def prepare(self): + self.config = { + 'tools': { + 'name': 'nomos', + 'version': '3.1.0rc2-31-ga2cbb8c', + 'configuration': { + 'command_line': 'nomossa ', + }, + }, + 'write_batch_size': 100, + } + self.idx_storage = BasicMockIndexerStorage() + self.log = logging.getLogger('swh.indexer') + # this hardcodes some contents, will use this to setup the storage + self.objstorage = MockObjStorage() + # sync objstorage and storage + contents = [{'sha1': c_id} for c_id in self.objstorage] + self.storage = BasicMockStorage(contents) + self.tools = self.register_tools(self.config['tools']) + self.tool = self.tools[0] + + +class TestFossologyLicenseRangeIndexer( + CommonContentIndexerRangeTest, unittest.TestCase): + """Range Fossology License Indexer tests. + + - new data within range are indexed + - no data outside a range are indexed + - with filtering existing indexed data prior to compute new index + - without filtering existing indexed data prior to compute new index + + """ + def setUp(self): + self.indexer = FossologyLicenseRangeIndexerTest() + # will play along with the objstorage's mocked contents for now + self.contents = sorted(self.indexer.objstorage) + # FIXME: leverage swh.objstorage.in_memory_storage's + # InMemoryObjStorage, swh.storage.tests's gen_contents, and + # hypothesis to generate data to actually run indexer on those + + self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' + self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069' + self.id2 = '103bc087db1d26afc3a0283f38663d081e9b01e6' + self.expected_results = { + self.id0: { + 'id': self.id0, + 'indexer_configuration_id': 10, + 'licenses': SHA1_TO_LICENSES[self.id0] + }, + self.id1: { + 'id': self.id1, + 'indexer_configuration_id': 10, + 'licenses': SHA1_TO_LICENSES[self.id1] + }, + self.id2: { + 'id': self.id2, + 'indexer_configuration_id': 10, + 'licenses': SHA1_TO_LICENSES[self.id2] + } + } diff --git a/swh/indexer/tests/test_language.py b/swh/indexer/tests/test_language.py --- a/swh/indexer/tests/test_language.py +++ b/swh/indexer/tests/test_language.py @@ -7,20 +7,9 @@ import logging from swh.indexer import language from swh.indexer.language import ContentLanguageIndexer -from swh.indexer.tests.test_utils import MockObjStorage - - -class _MockIndexerStorage(): - """Mock storage to simplify reading indexers' outputs. - """ - def content_language_add(self, languages, conflict_update=None): - self.state = languages - self.conflict_update = conflict_update - - def indexer_configuration_add(self, tools): - return [{ - 'id': 20, - }] +from swh.indexer.tests.test_utils import ( + BasicMockIndexerStorage, MockObjStorage, CommonContentIndexerTest +) class LanguageTestIndexer(ContentLanguageIndexer): @@ -40,7 +29,7 @@ }, } } - self.idx_storage = _MockIndexerStorage() + self.idx_storage = BasicMockIndexerStorage() self.log = logging.getLogger('swh.indexer') self.objstorage = MockObjStorage() self.destination_task = None @@ -51,12 +40,9 @@ class Language(unittest.TestCase): - """ - Tests pygments tool for language detection - """ - def setUp(self): - self.maxDiff = None + """Tests pygments tool for language detection + """ def test_compute_language_none(self): # given self.content = "" @@ -68,40 +54,35 @@ # then self.assertEqual(self.declared_language, result) - def test_index_content_language_python(self): - # given - # testing python - sha1s = ['02fb2c89e14f7fab46701478c83779c7beb7b069'] - lang_indexer = LanguageTestIndexer() - # when - lang_indexer.run(sha1s, policy_update='ignore-dups') - results = lang_indexer.idx_storage.state +class TestLanguageIndexer(CommonContentIndexerTest, unittest.TestCase): + """Language indexer test scenarios: - expected_results = [{ - 'id': '02fb2c89e14f7fab46701478c83779c7beb7b069', - 'indexer_configuration_id': 20, - 'lang': 'python' - }] - # then - self.assertEqual(expected_results, results) + - Known sha1s in the input list have their data indexed + - Unknown sha1 in the input list are not indexed - def test_index_content_language_c(self): - # given - # testing c - sha1s = ['103bc087db1d26afc3a0283f38663d081e9b01e6'] - lang_indexer = LanguageTestIndexer() - - # when - lang_indexer.run(sha1s, policy_update='ignore-dups') - results = lang_indexer.idx_storage.state - - expected_results = [{ - 'id': '103bc087db1d26afc3a0283f38663d081e9b01e6', - 'indexer_configuration_id': 20, - 'lang': 'c' - }] - - # then - self.assertEqual('c', results[0]['lang']) - self.assertEqual(expected_results, results) + """ + def setUp(self): + self.indexer = LanguageTestIndexer() + + self.id0 = '02fb2c89e14f7fab46701478c83779c7beb7b069' + self.id1 = '103bc087db1d26afc3a0283f38663d081e9b01e6' + self.id2 = 'd4c647f0fc257591cc9ba1722484229780d1c607' + + self.expected_results = { + self.id0: { + 'id': self.id0, + 'indexer_configuration_id': 10, + 'lang': 'python', + }, + self.id1: { + 'id': self.id1, + 'indexer_configuration_id': 10, + 'lang': 'c' + }, + self.id2: { + 'id': self.id2, + 'indexer_configuration_id': 10, + 'lang': 'text-only' + } + } diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py --- a/swh/indexer/tests/test_mimetype.py +++ b/swh/indexer/tests/test_mimetype.py @@ -10,95 +10,15 @@ ContentMimetypeIndexer, MimetypeRangeIndexer ) -from swh.indexer.tests.test_utils import MockObjStorage -from swh.model import hashutil - - -class _MockStorage(): - """In memory implementation to fake the content_get_range api. - - FIXME: To remove when the actual in-memory lands. - - """ - contents = [] - - def __init__(self, contents): - self.contents = contents - - def content_get_range(self, start, end, limit=1000): - # to make input test data consilient with actual runtime the - # other way of doing properly things would be to rewrite all - # tests (that's another task entirely so not right now) - if isinstance(start, bytes): - start = hashutil.hash_to_hex(start) - if isinstance(end, bytes): - end = hashutil.hash_to_hex(end) - results = [] - _next_id = None - counter = 0 - for c in self.contents: - _id = c['sha1'] - if start <= _id and _id <= end: - results.append(c) - if counter >= limit: - break - counter += 1 - - return { - 'contents': results, - 'next': _next_id - } - - -class _MockIndexerStorage(): - """Mock storage to simplify reading indexers' outputs. - - """ - state = [] - - def content_mimetype_add(self, mimetypes, conflict_update=None): - self.state = mimetypes - self.conflict_update = conflict_update - - def content_mimetype_get_range(self, start, end, indexer_configuration_id, - limit=1000): - """Basic in-memory implementation (limit is unused). - - """ - # to make input test data consilient with actual runtime the - # other way of doing properly things would be to rewrite all - # tests (that's another task entirely so not right now) - if isinstance(start, bytes): - start = hashutil.hash_to_hex(start) - if isinstance(end, bytes): - end = hashutil.hash_to_hex(end) - results = [] - _next = None - counter = 0 - for m in self.state: - _id = m['id'] - _tool_id = m['indexer_configuration_id'] - if (start <= _id and _id <= end and - _tool_id == indexer_configuration_id): - results.append(_id) - if counter >= limit: - break - counter += 1 - - return { - 'ids': results, - 'next': _next - } - - def indexer_configuration_add(self, tools): - return [{ - 'id': 10, - }] +from swh.indexer.tests.test_utils import ( + MockObjStorage, BasicMockStorage, BasicMockIndexerStorage, + CommonContentIndexerTest, CommonContentIndexerRangeTest +) class MimetypeTestIndexer(ContentMimetypeIndexer): - """Specific mimetype whose configuration is enough to satisfy the - indexing tests. + """Specific mimetype indexer instance whose configuration is enough to + satisfy the indexing tests. """ def prepare(self): @@ -112,7 +32,7 @@ }, }, } - self.idx_storage = _MockIndexerStorage() + self.idx_storage = BasicMockIndexerStorage() self.log = logging.getLogger('swh.indexer') self.objstorage = MockObjStorage() self.tools = self.register_tools(self.config['tools']) @@ -121,7 +41,7 @@ class MimetypeIndexerUnknownToolTestStorage(MimetypeTestIndexer): """Specific mimetype whose configuration is not enough to satisfy the - indexing tests. + indexing checks. """ def prepare(self): @@ -136,87 +56,39 @@ MimetypeIndexerUnknownToolTestStorage() -class TestMimetypeIndexer(unittest.TestCase): - def setUp(self): - self.indexer = MimetypeTestIndexer() - - def test_index_no_update(self): - # given - sha1s = [ - '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', - '688a5ef812c53907562fe379d4b3851e69c7cb15', - ] - - # when - self.indexer.run(sha1s, policy_update='ignore-dups') +class TestMimetypeIndexer(CommonContentIndexerTest, unittest.TestCase): + """Mimetype indexer test scenarios: - # then - expected_results = [{ - 'id': '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', - 'indexer_configuration_id': 10, - 'mimetype': b'text/plain', - 'encoding': b'us-ascii', - }, { - 'id': '688a5ef812c53907562fe379d4b3851e69c7cb15', - 'indexer_configuration_id': 10, - 'mimetype': b'text/plain', - 'encoding': b'us-ascii', - }] + - Known sha1s in the input list have their data indexed + - Unknown sha1 in the input list are not indexed - self.assertFalse(self.indexer.idx_storage.conflict_update) - self.assertEqual(expected_results, self.indexer.idx_storage.state) - - def test_index_update(self): - # given - sha1s = [ - '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', - '688a5ef812c53907562fe379d4b3851e69c7cb15', - 'da39a3ee5e6b4b0d3255bfef95601890afd80709', # empty content - ] - - # when - self.indexer.run(sha1s, policy_update='update-dups') - - # then - expected_results = [{ - 'id': '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', - 'indexer_configuration_id': 10, - 'mimetype': b'text/plain', - 'encoding': b'us-ascii', - }, { - 'id': '688a5ef812c53907562fe379d4b3851e69c7cb15', - 'indexer_configuration_id': 10, - 'mimetype': b'text/plain', - 'encoding': b'us-ascii', - }, { - 'id': 'da39a3ee5e6b4b0d3255bfef95601890afd80709', - 'indexer_configuration_id': 10, - 'mimetype': b'application/x-empty', - 'encoding': b'binary', - }] - - self.assertTrue(self.indexer.idx_storage.conflict_update) - self.assertEqual(expected_results, self.indexer.idx_storage.state) - - def test_index_one_unknown_sha1(self): - # given - sha1s = ['688a5ef812c53907562fe379d4b3851e69c7cb15', - '799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown - '800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown - - # when - self.indexer.run(sha1s, policy_update='update-dups') - - # then - expected_results = [{ - 'id': '688a5ef812c53907562fe379d4b3851e69c7cb15', - 'indexer_configuration_id': 10, - 'mimetype': b'text/plain', - 'encoding': b'us-ascii', - }] + """ + def setUp(self): + self.indexer = MimetypeTestIndexer() - self.assertTrue(self.indexer.idx_storage.conflict_update) - self.assertEqual(expected_results, self.indexer.idx_storage.state) + self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' + self.id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15' + self.id2 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709' + self.expected_results = { + self.id0: { + 'id': self.id0, + 'indexer_configuration_id': 10, + 'mimetype': b'text/plain', + 'encoding': b'us-ascii', + }, + self.id1: { + 'id': self.id1, + 'indexer_configuration_id': 10, + 'mimetype': b'text/plain', + 'encoding': b'us-ascii', + }, + self.id2: { + 'id': self.id2, + 'indexer_configuration_id': 10, + 'mimetype': b'application/x-empty', + 'encoding': b'binary', + } + } class MimetypeRangeIndexerTest(MimetypeRangeIndexer): @@ -236,18 +108,27 @@ }, 'write_batch_size': 100, } - self.idx_storage = _MockIndexerStorage() + self.idx_storage = BasicMockIndexerStorage() self.log = logging.getLogger('swh.indexer') # this hardcodes some contents, will use this to setup the storage self.objstorage = MockObjStorage() # sync objstorage and storage contents = [{'sha1': c_id} for c_id in self.objstorage] - self.storage = _MockStorage(contents) + self.storage = BasicMockStorage(contents) self.tools = self.register_tools(self.config['tools']) self.tool = self.tools[0] -class TestMimetypeRangeIndexer(unittest.TestCase): +class TestMimetypeRangeIndexer( + CommonContentIndexerRangeTest, unittest.TestCase): + """Range Mimetype Indexer tests. + + - new data within range are indexed + - no data outside a range are indexed + - with filtering existing indexed data prior to compute new index + - without filtering existing indexed data prior to compute new index + + """ def setUp(self): self.indexer = MimetypeRangeIndexerTest() # will play along with the objstorage's mocked contents for now @@ -256,103 +137,23 @@ # InMemoryObjStorage, swh.storage.tests's gen_contents, and # hypothesis to generate data to actually run indexer on those + self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' + self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069' + self.id2 = '103bc087db1d26afc3a0283f38663d081e9b01e6' self.expected_results = { - '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': { + self.id0: { 'encoding': b'us-ascii', - 'id': '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', + 'id': self.id0, 'indexer_configuration_id': 10, 'mimetype': b'text/plain'}, - '02fb2c89e14f7fab46701478c83779c7beb7b069': { + self.id1: { 'encoding': b'us-ascii', - 'id': '02fb2c89e14f7fab46701478c83779c7beb7b069', + 'id': self.id1, 'indexer_configuration_id': 10, 'mimetype': b'text/x-python'}, - '103bc087db1d26afc3a0283f38663d081e9b01e6': { + self.id2: { 'encoding': b'us-ascii', - 'id': '103bc087db1d26afc3a0283f38663d081e9b01e6', + 'id': self.id2, 'indexer_configuration_id': 10, 'mimetype': b'text/plain'} } - - def assert_mimetypes_ok(self, start, end, actual_results, - expected_results=None): - if expected_results is None: - expected_results = self.expected_results - - for mimetype in actual_results: - _id = mimetype['id'] - self.assertEqual(mimetype, expected_results[_id]) - self.assertTrue(start <= _id and _id <= end) - _tool_id = mimetype['indexer_configuration_id'] - self.assertEqual(_tool_id, self.indexer.tool['id']) - - def test__index_contents(self): - """Indexing contents without existing data results in indexed data - - """ - start, end = [self.contents[0], self.contents[2]] # output hex ids - # given - actual_results = list(self.indexer._index_contents( - start, end, indexed={})) - - self.assert_mimetypes_ok(start, end, actual_results) - - def test__index_contents_with_indexed_data(self): - """Indexing contents with existing data results in less indexed data - - """ - start, end = [self.contents[0], self.contents[2]] # output hex ids - data_indexed = [ - '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', - '103bc087db1d26afc3a0283f38663d081e9b01e6' - ] - - # given - actual_results = self.indexer._index_contents( - start, end, indexed=set(data_indexed)) - - # craft the expected results - expected_results = self.expected_results.copy() - for already_indexed_key in data_indexed: - expected_results.pop(already_indexed_key) - - self.assert_mimetypes_ok( - start, end, actual_results, expected_results) - - def test_generate_content_mimetype_get(self): - """Optimal indexing should result in indexed data - - """ - start, end = [self.contents[0], self.contents[2]] # output hex ids - # given - actual_results = self.indexer.run(start, end) - - # then - self.assertTrue(actual_results) - - def test_generate_content_mimetype_get_input_as_bytes(self): - """Optimal indexing should result in indexed data - - Input are in bytes here. - - """ - _start, _end = [self.contents[0], self.contents[2]] # output hex ids - start, end = map(hashutil.hash_to_bytes, (_start, _end)) - - # given - actual_results = self.indexer.run( # checks the bytes input this time - start, end, skip_existing=False) # no data so same result - - # then - self.assertTrue(actual_results) - - def test_generate_content_mimetype_get_no_result(self): - """No result indexed returns False""" - start, end = ['0000000000000000000000000000000000000000', - '0000000000000000000000000000000000000001'] - # given - actual_results = self.indexer.run( - start, end, incremental=False) - - # then - self.assertFalse(actual_results) diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py --- a/swh/indexer/tests/test_utils.py +++ b/swh/indexer/tests/test_utils.py @@ -1,10 +1,11 @@ -# Copyright (C) 2017 The Software Heritage developers +# Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.objstorage.exc import ObjNotFoundError +from swh.model import hashutil ORIGINS = [ { @@ -124,6 +125,15 @@ } +SHA1_TO_LICENSES = { + '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': ['GPL'], + '02fb2c89e14f7fab46701478c83779c7beb7b069': ['Apache2.0'], + '103bc087db1d26afc3a0283f38663d081e9b01e6': ['MIT'], + '688a5ef812c53907562fe379d4b3851e69c7cb15': ['AGPL'], + 'da39a3ee5e6b4b0d3255bfef95601890afd80709': [], +} + + class MockObjStorage: """Mock an swh-objstorage objstorage with predefined contents. @@ -398,3 +408,244 @@ 'status': None, 'sha256': None }] + + +class BasicMockStorage(): + """In memory implementation to fake the content_get_range api. + + FIXME: To remove when the actual in-memory lands. + + """ + contents = [] + + def __init__(self, contents): + self.contents = contents + + def content_get_range(self, start, end, limit=1000): + # to make input test data consilient with actual runtime the + # other way of doing properly things would be to rewrite all + # tests (that's another task entirely so not right now) + if isinstance(start, bytes): + start = hashutil.hash_to_hex(start) + if isinstance(end, bytes): + end = hashutil.hash_to_hex(end) + results = [] + _next_id = None + counter = 0 + for c in self.contents: + _id = c['sha1'] + if start <= _id and _id <= end: + results.append(c) + if counter >= limit: + break + counter += 1 + + return { + 'contents': results, + 'next': _next_id + } + + +class BasicMockIndexerStorage(): + """Mock Indexer storage to simplify reading indexers' outputs. + + """ + state = [] + + def _internal_add(self, data, conflict_update=None): + """All content indexer have the same structure. So reuse `data` as the + same data. It's either mimetype, language, + fossology_license, etc... + + """ + self.state = data + self.conflict_update = conflict_update + + def content_mimetype_add(self, data, conflict_update=None): + self._internal_add(data, conflict_update=conflict_update) + + def content_fossology_license_add(self, data, conflict_update=None): + self._internal_add(data, conflict_update=conflict_update) + + def content_language_add(self, data, conflict_update=None): + self._internal_add(data, conflict_update=conflict_update) + + def _internal_get_range(self, start, end, + indexer_configuration_id, limit=1000): + """Same logic as _internal_add, we retrieve indexed data given an + identifier. So the code here does not change even though + the underlying data does. + + """ + # to make input test data consilient with actual runtime the + # other way of doing properly things would be to rewrite all + # tests (that's another task entirely so not right now) + if isinstance(start, bytes): + start = hashutil.hash_to_hex(start) + if isinstance(end, bytes): + end = hashutil.hash_to_hex(end) + results = [] + _next = None + counter = 0 + for m in self.state: + _id = m['id'] + _tool_id = m['indexer_configuration_id'] + if (start <= _id and _id <= end and + _tool_id == indexer_configuration_id): + results.append(_id) + if counter >= limit: + break + counter += 1 + + return { + 'ids': results, + 'next': _next + } + + def content_mimetype_get_range( + self, start, end, indexer_configuration_id, limit=1000): + return self._internal_get_range( + start, end, indexer_configuration_id, limit=limit) + + def content_fossology_license_get_range( + self, start, end, indexer_configuration_id, limit=1000): + return self._internal_get_range( + start, end, indexer_configuration_id, limit=limit) + + def indexer_configuration_add(self, tools): + return [{ + 'id': 10, + }] + + +class CommonContentIndexerTest: + def assert_results_ok(self, actual_results, expected_results=None): + if expected_results is None: + expected_results = self.expected_results + + for indexed_data in actual_results: + _id = indexed_data['id'] + self.assertEqual(indexed_data, expected_results[_id]) + _tool_id = indexed_data['indexer_configuration_id'] + self.assertEqual(_tool_id, self.indexer.tool['id']) + + def test_index(self): + """Known sha1 have their data indexed + + """ + sha1s = [self.id0, self.id1, self.id2] + + # when + self.indexer.run(sha1s, policy_update='update-dups') + + actual_results = self.indexer.idx_storage.state + self.assertTrue(self.indexer.idx_storage.conflict_update) + self.assert_results_ok(actual_results) + + # 2nd pass + self.indexer.run(sha1s, policy_update='ignore-dups') + + self.assertFalse(self.indexer.idx_storage.conflict_update) + self.assert_results_ok(actual_results) + + def test_index_one_unknown_sha1(self): + """Unknown sha1 are not indexed""" + sha1s = [self.id1, + '799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown + '800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown + + # when + self.indexer.run(sha1s, policy_update='update-dups') + actual_results = self.indexer.idx_storage.state + + # then + expected_results = { + k: v for k, v in self.expected_results.items() if k in sha1s + } + + self.assert_results_ok(actual_results, expected_results) + + +class CommonContentIndexerRangeTest: + """Allows to factorize tests on range indexer. + + """ + def assert_results_ok(self, start, end, actual_results, + expected_results=None): + if expected_results is None: + expected_results = self.expected_results + + for indexed_data in actual_results: + _id = indexed_data['id'] + self.assertEqual(indexed_data, expected_results[_id]) + self.assertTrue(start <= _id and _id <= end) + _tool_id = indexed_data['indexer_configuration_id'] + self.assertEqual(_tool_id, self.indexer.tool['id']) + + def test__index_contents(self): + """Indexing contents without existing data results in indexed data + + """ + start, end = [self.contents[0], self.contents[2]] # output hex ids + # given + actual_results = list(self.indexer._index_contents( + start, end, indexed={})) + + self.assert_results_ok(start, end, actual_results) + + def test__index_contents_with_indexed_data(self): + """Indexing contents with existing data results in less indexed data + + """ + start, end = [self.contents[0], self.contents[2]] # output hex ids + data_indexed = [self.id0, self.id2] + + # given + actual_results = self.indexer._index_contents( + start, end, indexed=set(data_indexed)) + + # craft the expected results + expected_results = self.expected_results.copy() + for already_indexed_key in data_indexed: + expected_results.pop(already_indexed_key) + + self.assert_results_ok( + start, end, actual_results, expected_results) + + def test_generate_content_get(self): + """Optimal indexing should result in indexed data + + """ + start, end = [self.contents[0], self.contents[2]] # output hex ids + # given + actual_results = self.indexer.run(start, end) + + # then + self.assertTrue(actual_results) + + def test_generate_content_get_input_as_bytes(self): + """Optimal indexing should result in indexed data + + Input are in bytes here. + + """ + _start, _end = [self.contents[0], self.contents[2]] # output hex ids + start, end = map(hashutil.hash_to_bytes, (_start, _end)) + + # given + actual_results = self.indexer.run( # checks the bytes input this time + start, end, skip_existing=False) # no data so same result + + # then + self.assertTrue(actual_results) + + def test_generate_content_get_no_result(self): + """No result indexed returns False""" + start, end = ['0000000000000000000000000000000000000000', + '0000000000000000000000000000000000000001'] + # given + actual_results = self.indexer.run( + start, end, incremental=False) + + # then + self.assertFalse(actual_results)