diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py
index 3d46407..395019b 100644
--- a/swh/indexer/fossology_license.py
+++ b/swh/indexer/fossology_license.py
@@ -1,141 +1,185 @@
-# Copyright (C) 2016-2017 The Software Heritage developers
+# Copyright (C) 2016-2018 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import click
 import subprocess
 
 from swh.model import hashutil
 
-from .indexer import ContentIndexer, DiskIndexer
+from .indexer import ContentIndexer, ContentRangeIndexer, DiskIndexer
 
 
-def compute_license(path, log=None):
-    """Determine license from file at path.
+class MixinFossologyLicenseIndexer:
+    """Mixin fossology license indexer.
 
-    Args:
-        path: filepath to determine the license
-
-    Returns:
-        A dict with the following keys:
-        - licenses ([str]): associated detected licenses to path
-        - path (bytes): content filepath
-        - tool (str): tool used to compute the output
-
-    """
-    try:
-        properties = subprocess.check_output(['nomossa', path],
-                                             universal_newlines=True)
-        if properties:
-            res = properties.rstrip().split(' contains license(s) ')
-            licenses = res[1].split(',')
-
-            return {
-                'licenses': licenses,
-                'path': path,
-            }
-    except subprocess.CalledProcessError:
-        if log:
-            from os import path as __path
-            log.exception('Problem during license detection for sha1 %s' %
-                          __path.basename(path))
-        return {
-            'licenses': [],
-            'path': path,
-        }
-
-
-class ContentFossologyLicenseIndexer(ContentIndexer, DiskIndexer):
-    """Indexer in charge of:
-    - filtering out content already indexed
-    - reading content from objstorage per the content's id (sha1)
-    - computing {license, encoding} from that content
-    - store result in storage
+    See :class:`ContentFossologyLicenseIndexer` and
+    :class:`FossologyLicenseRangeIndexer`
 
     """
     ADDITIONAL_CONFIG = {
         'workdir': ('str', '/tmp/swh/indexer.fossology.license'),
         'tools': ('dict', {
             'name': 'nomos',
             'version': '3.1.0rc2-31-ga2cbb8c',
             'configuration': {
                 'command_line': 'nomossa <filepath>',
             },
         }),
     }
 
     CONFIG_BASE_FILENAME = 'indexer/fossology_license'
 
     def prepare(self):
         super().prepare()
         self.working_directory = self.config['workdir']
         self.tool = self.tools[0]
 
-    def filter(self, ids):
-        """Filter out known sha1s and return only missing ones.
+    def compute_license(self, path, log=None):
+        """Determine license from file at path.
+
+        Args:
+            path: filepath to determine the license
+
+        Returns:
+            A dict with the following keys:
+            - licenses ([str]): detected licenses associated with the path
+            - path (bytes): content filepath
 
         """
-        yield from self.idx_storage.content_fossology_license_missing((
-            {
-                'id': sha1,
-                'indexer_configuration_id': self.tool['id'],
-            } for sha1 in ids
-        ))
+        try:
+            properties = subprocess.check_output(['nomossa', path],
+                                                 universal_newlines=True)
+            if properties:
+                res = properties.rstrip().split(' contains license(s) ')
+                licenses = res[1].split(',')
+
+                return {
+                    'licenses': licenses,
+                    'path': path,
+                }
+        except subprocess.CalledProcessError:
+            if log:
+                from os import path as __path
+                log.exception('Problem during license detection for sha1 %s' %
+                              __path.basename(path))
+            return {
+                'licenses': [],
+                'path': path,
+            }
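
The parsing above relies on nomossa's one-line report format. Here is a minimal, self-contained sketch of that contract; the sample output string is made up, since only the ' contains license(s) ' separator is known from the code above:

# Hypothetical nomossa output line; only the separator used by the
# split() call above is taken from the implementation.
sample_output = '/tmp/wd/01c9379d contains license(s) GPL-2.0,LGPL-2.1'

res = sample_output.rstrip().split(' contains license(s) ')
licenses = res[1].split(',')
assert licenses == ['GPL-2.0', 'LGPL-2.1']
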
 
     def index(self, id, data):
         """Index sha1s' content and store result.
 
         Args:
-            sha1 (bytes): content's identifier
+            id (bytes): content's identifier
             data (bytes): raw content in bytes
 
         Returns:
             A dict, representing a content_license, with keys:
             - id (bytes): content's identifier (sha1)
             - license (bytes): license in bytes
             - path (bytes): path
 
         """
-        filename = hashutil.hash_to_hex(id)
+        if isinstance(id, bytes):
+            id = hashutil.hash_to_hex(id)
         content_path = self.write_to_temp(
-            filename=filename,
+            filename=id,
             data=data)
 
         try:
-            properties = compute_license(path=content_path, log=self.log)
+            properties = self.compute_license(path=content_path, log=self.log)
             properties.update({
                 'id': id,
                 'indexer_configuration_id': self.tool['id'],
             })
         finally:
             self.cleanup(content_path)
 
         return properties
 
     def persist_index_computations(self, results, policy_update):
         """Persist the results in storage.
 
         Args:
             results ([dict]): list of content_license, dict with the
             following keys:
             - id (bytes): content's identifier (sha1)
             - license (bytes): license in bytes
             - path (bytes): path
             policy_update (str): either 'update-dups' or 'ignore-dups' to
             respectively update duplicates or ignore them
 
         """
         self.idx_storage.content_fossology_license_add(
             results, conflict_update=(policy_update == 'update-dups'))
 
 
+class ContentFossologyLicenseIndexer(
+        MixinFossologyLicenseIndexer, DiskIndexer, ContentIndexer):
+    """Indexer in charge of:
+    - filtering out content already indexed
+    - reading content from objstorage per the content's id (sha1)
+    - computing the license from that content
+    - storing the result in storage
+
+    """
+    def filter(self, ids):
+        """Filter out known sha1s and return only missing ones.
+
+        """
+        yield from self.idx_storage.content_fossology_license_missing((
+            {
+                'id': sha1,
+                'indexer_configuration_id': self.tool['id'],
+            } for sha1 in ids
+        ))
+
+
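For orientation, the three hooks above plug into the batch driver roughly as follows. This is a simplified sketch of the assumed ContentIndexer.run() flow, not the real base-class code:

def run_sketch(indexer, sha1s, policy_update='ignore-dups'):
    # Assumed flow: drop already-indexed ids, index the rest from the
    # object storage, then persist the whole batch at once.
    results = []
    for sha1 in indexer.filter(sha1s):
        raw_content = indexer.objstorage.get(sha1)
        results.append(indexer.index(sha1, raw_content))
    indexer.persist_index_computations(results, policy_update)
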
+ + Args + **start** (bytes): Starting bound from range identifier + **end** (bytes): End range identifier + + Yields: + Content identifier (bytes) present in the range [start, end] + + """ + while start: + result = self.idx_storage.content_fossology_license_get_range( + start, end, self.tool['id']) + contents = result['ids'] + for _id in contents: + yield _id + start = result['next'] + if start is None: + break + + @click.command(help='Compute license for path using tool') @click.option('--tool', default='nomossa', help="Path to tool") @click.option('--path', required=1, help="Path to execute index on") def main(tool, path): - print(compute_license(tool, path)) + indexer = ContentFossologyLicenseIndexer() + print(indexer.compute_license(tool, path)) if __name__ == '__main__': main() diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py new file mode 100644 index 0000000..0fed6fa --- /dev/null +++ b/swh/indexer/tests/test_fossology_license.py @@ -0,0 +1,227 @@ +# Copyright (C) 2017-2018 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import unittest +import logging + +from swh.indexer.fossology_license import ( + ContentFossologyLicenseIndexer, FossologyLicenseRangeIndexer +) + +from swh.indexer.tests.test_utils import ( + MockObjStorage, BasicMockStorage, BasicMockIndexerStorage, + SHA1_TO_LICENSES, IndexerRangeTest +) + + +class NoDiskIndexer: + """Mixin to override the DiskIndexer behavior avoiding side-effects in + tests. + + """ + + def write_to_temp(self, filename, data): # noop + return filename + + def cleanup(self, content_path): # noop + return None + + +class InjectLicenseIndexer: + """Override license computations. + + """ + def compute_license(self, path, log=None): + """path is the content identifier + + """ + return { + 'licenses': SHA1_TO_LICENSES.get(path) + } + + +class FossologyLicenseTestIndexer( + NoDiskIndexer, InjectLicenseIndexer, ContentFossologyLicenseIndexer): + """Specific mimetype whose configuration is enough to satisfy the + indexing tests. + + """ + def prepare(self): + self.config = { + 'tools': { + 'name': 'nomos', + 'version': '3.1.0rc2-31-ga2cbb8c', + 'configuration': { + 'command_line': 'nomossa ', + }, + }, + } + self.idx_storage = BasicMockIndexerStorage() + self.log = logging.getLogger('swh.indexer') + self.objstorage = MockObjStorage() + self.tools = self.register_tools(self.config['tools']) + self.tool = self.tools[0] + + +class FossologyLicenseIndexerUnknownToolTestStorage( + FossologyLicenseTestIndexer): + """Specific fossology license indexer whose configuration is not + enough to satisfy the indexing checks + + """ + def prepare(self): + super().prepare() + self.tools = None + + +class TestFossologyLicenseIndexerWithErrors(unittest.TestCase): + def test_wrong_unknown_configuration_tool(self): + """Indexer with unknown configuration tool should fail the check""" + with self.assertRaisesRegex(ValueError, 'Tools None is unknown'): + FossologyLicenseIndexerUnknownToolTestStorage() + + +class TestFossologyLicenseIndexer(unittest.TestCase): + """Fossology license tests. 
+ + """ + def setUp(self): + self.indexer = FossologyLicenseTestIndexer() + + def test_index_no_update(self): + """Index sha1s results in new computed licenses + + """ + id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' + id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15' + sha1s = [id0, id1] + + # when + self.indexer.run(sha1s, policy_update='ignore-dups') + + # then + expected_results = [{ + 'id': id0, + 'indexer_configuration_id': 10, + 'licenses': SHA1_TO_LICENSES[id0], + }, { + 'id': id1, + 'indexer_configuration_id': 10, + 'licenses': SHA1_TO_LICENSES[id1], + }] + + self.assertFalse(self.indexer.idx_storage.conflict_update) + self.assertEqual(expected_results, self.indexer.idx_storage.state) + + def test_index_update(self): + """Index sha1s results in new computed licenses + + """ + id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' + id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15' + id2 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709' # empty content + sha1s = [id0, id1, id2] + + # when + self.indexer.run(sha1s, policy_update='update-dups') + + # then + expected_results = [{ + 'id': id0, + 'indexer_configuration_id': 10, + 'licenses': SHA1_TO_LICENSES[id0], + }, { + 'id': id1, + 'indexer_configuration_id': 10, + 'licenses': SHA1_TO_LICENSES[id1], + }, { + 'id': id2, + 'indexer_configuration_id': 10, + 'licenses': SHA1_TO_LICENSES[id2], + }] + + self.assertTrue(self.indexer.idx_storage.conflict_update) + self.assertEqual(expected_results, self.indexer.idx_storage.state) + + def test_index_one_unknown_sha1(self): + """Only existing contents are indexed + + """ + # given + id0 = '688a5ef812c53907562fe379d4b3851e69c7cb15' + sha1s = [id0, + '799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown + '800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown + + # when + self.indexer.run(sha1s, policy_update='update-dups') + + # then + expected_results = [{ + 'id': id0, + 'indexer_configuration_id': 10, + 'licenses': SHA1_TO_LICENSES[id0], + }] + + self.assertTrue(self.indexer.idx_storage.conflict_update) + self.assertEqual(expected_results, self.indexer.idx_storage.state) + + +class FossologyLicenseRangeIndexerTest( + NoDiskIndexer, InjectLicenseIndexer, FossologyLicenseRangeIndexer): + """Testing the range indexer on fossology license. 
+ + """ + def prepare(self): + self.config = { + 'tools': { + 'name': 'nomos', + 'version': '3.1.0rc2-31-ga2cbb8c', + 'configuration': { + 'command_line': 'nomossa ', + }, + }, + 'write_batch_size': 100, + } + self.idx_storage = BasicMockIndexerStorage() + self.log = logging.getLogger('swh.indexer') + # this hardcodes some contents, will use this to setup the storage + self.objstorage = MockObjStorage() + # sync objstorage and storage + contents = [{'sha1': c_id} for c_id in self.objstorage] + self.storage = BasicMockStorage(contents) + self.tools = self.register_tools(self.config['tools']) + self.tool = self.tools[0] + + +class TestFossologyLicenseRangeIndexer(IndexerRangeTest, unittest.TestCase): + def setUp(self): + self.indexer = FossologyLicenseRangeIndexerTest() + # will play along with the objstorage's mocked contents for now + self.contents = sorted(self.indexer.objstorage) + # FIXME: leverage swh.objstorage.in_memory_storage's + # InMemoryObjStorage, swh.storage.tests's gen_contents, and + # hypothesis to generate data to actually run indexer on those + + self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5' + self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069' + self.id2 = '103bc087db1d26afc3a0283f38663d081e9b01e6' + self.expected_results = { + self.id0: { + 'id': self.id0, + 'indexer_configuration_id': 10, + 'licenses': SHA1_TO_LICENSES[self.id0] + }, + self.id1: { + 'id': self.id1, + 'indexer_configuration_id': 10, + 'licenses': SHA1_TO_LICENSES[self.id1] + }, + self.id2: { + 'id': self.id2, + 'indexer_configuration_id': 10, + 'licenses': SHA1_TO_LICENSES[self.id2] + } + } diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py index 102b298..6206e88 100644 --- a/swh/indexer/tests/test_mimetype.py +++ b/swh/indexer/tests/test_mimetype.py @@ -1,358 +1,198 @@ # Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest import logging from swh.indexer.mimetype import ( ContentMimetypeIndexer, MimetypeRangeIndexer ) -from swh.indexer.tests.test_utils import MockObjStorage -from swh.model import hashutil - - -class _MockStorage(): - """In memory implementation to fake the content_get_range api. - - FIXME: To remove when the actual in-memory lands. - - """ - contents = [] - - def __init__(self, contents): - self.contents = contents - - def content_get_range(self, start, end, limit=1000): - # to make input test data consilient with actual runtime the - # other way of doing properly things would be to rewrite all - # tests (that's another task entirely so not right now) - if isinstance(start, bytes): - start = hashutil.hash_to_hex(start) - if isinstance(end, bytes): - end = hashutil.hash_to_hex(end) - results = [] - _next_id = None - counter = 0 - for c in self.contents: - _id = c['sha1'] - if start <= _id and _id <= end: - results.append(c) - if counter >= limit: - break - counter += 1 - - return { - 'contents': results, - 'next': _next_id - } - - -class _MockIndexerStorage(): - """Mock storage to simplify reading indexers' outputs. - - """ - state = [] - - def content_mimetype_add(self, mimetypes, conflict_update=None): - self.state = mimetypes - self.conflict_update = conflict_update - - def content_mimetype_get_range(self, start, end, indexer_configuration_id, - limit=1000): - """Basic in-memory implementation (limit is unused). 
- - """ - # to make input test data consilient with actual runtime the - # other way of doing properly things would be to rewrite all - # tests (that's another task entirely so not right now) - if isinstance(start, bytes): - start = hashutil.hash_to_hex(start) - if isinstance(end, bytes): - end = hashutil.hash_to_hex(end) - results = [] - _next = None - counter = 0 - for m in self.state: - _id = m['id'] - _tool_id = m['indexer_configuration_id'] - if (start <= _id and _id <= end and - _tool_id == indexer_configuration_id): - results.append(_id) - if counter >= limit: - break - counter += 1 - - return { - 'ids': results, - 'next': _next - } - - def indexer_configuration_add(self, tools): - return [{ - 'id': 10, - }] +from swh.indexer.tests.test_utils import ( + MockObjStorage, BasicMockStorage, BasicMockIndexerStorage, IndexerRangeTest +) class MimetypeTestIndexer(ContentMimetypeIndexer): """Specific mimetype whose configuration is enough to satisfy the indexing tests. """ def prepare(self): self.config = { 'tools': { 'name': 'file', 'version': '1:5.30-1+deb9u1', 'configuration': { "type": "library", "debian-package": "python3-magic" }, }, } - self.idx_storage = _MockIndexerStorage() + self.idx_storage = BasicMockIndexerStorage() self.log = logging.getLogger('swh.indexer') self.objstorage = MockObjStorage() self.tools = self.register_tools(self.config['tools']) self.tool = self.tools[0] class MimetypeIndexerUnknownToolTestStorage(MimetypeTestIndexer): """Specific mimetype whose configuration is not enough to satisfy the indexing tests. """ def prepare(self): super().prepare() self.tools = None class TestMimetypeIndexerWithErrors(unittest.TestCase): def test_wrong_unknown_configuration_tool(self): """Indexer with unknown configuration tool should fail the check""" with self.assertRaisesRegex(ValueError, 'Tools None is unknown'): MimetypeIndexerUnknownToolTestStorage() class TestMimetypeIndexer(unittest.TestCase): def setUp(self): self.indexer = MimetypeTestIndexer() def test_index_no_update(self): # given sha1s = [ '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', '688a5ef812c53907562fe379d4b3851e69c7cb15', ] # when self.indexer.run(sha1s, policy_update='ignore-dups') # then expected_results = [{ 'id': '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', 'indexer_configuration_id': 10, 'mimetype': b'text/plain', 'encoding': b'us-ascii', }, { 'id': '688a5ef812c53907562fe379d4b3851e69c7cb15', 'indexer_configuration_id': 10, 'mimetype': b'text/plain', 'encoding': b'us-ascii', }] self.assertFalse(self.indexer.idx_storage.conflict_update) self.assertEqual(expected_results, self.indexer.idx_storage.state) def test_index_update(self): # given sha1s = [ '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', '688a5ef812c53907562fe379d4b3851e69c7cb15', 'da39a3ee5e6b4b0d3255bfef95601890afd80709', # empty content ] # when self.indexer.run(sha1s, policy_update='update-dups') # then expected_results = [{ 'id': '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', 'indexer_configuration_id': 10, 'mimetype': b'text/plain', 'encoding': b'us-ascii', }, { 'id': '688a5ef812c53907562fe379d4b3851e69c7cb15', 'indexer_configuration_id': 10, 'mimetype': b'text/plain', 'encoding': b'us-ascii', }, { 'id': 'da39a3ee5e6b4b0d3255bfef95601890afd80709', 'indexer_configuration_id': 10, 'mimetype': b'application/x-empty', 'encoding': b'binary', }] self.assertTrue(self.indexer.idx_storage.conflict_update) self.assertEqual(expected_results, self.indexer.idx_storage.state) def test_index_one_unknown_sha1(self): # given sha1s = 
['688a5ef812c53907562fe379d4b3851e69c7cb15',
                 '799a5ef812c53907562fe379d4b3851e69c7cb15',  # unknown
                 '800a5ef812c53907562fe379d4b3851e69c7cb15']  # unknown
 
         # when
         self.indexer.run(sha1s, policy_update='update-dups')
 
         # then
         expected_results = [{
             'id': '688a5ef812c53907562fe379d4b3851e69c7cb15',
             'indexer_configuration_id': 10,
             'mimetype': b'text/plain',
             'encoding': b'us-ascii',
         }]
 
         self.assertTrue(self.indexer.idx_storage.conflict_update)
         self.assertEqual(expected_results, self.indexer.idx_storage.state)
 
 
 class MimetypeRangeIndexerTest(MimetypeRangeIndexer):
     """Specific mimetype whose configuration is enough to satisfy the
     indexing tests.
 
     """
     def prepare(self):
         self.config = {
             'tools': {
                 'name': 'file',
                 'version': '1:5.30-1+deb9u1',
                 'configuration': {
                     "type": "library",
                     "debian-package": "python3-magic"
                 },
             },
             'write_batch_size': 100,
         }
-        self.idx_storage = _MockIndexerStorage()
+        self.idx_storage = BasicMockIndexerStorage()
         self.log = logging.getLogger('swh.indexer')
         # this hardcodes some contents, will use this to setup the storage
         self.objstorage = MockObjStorage()
         # sync objstorage and storage
         contents = [{'sha1': c_id} for c_id in self.objstorage]
-        self.storage = _MockStorage(contents)
+        self.storage = BasicMockStorage(contents)
         self.tools = self.register_tools(self.config['tools'])
         self.tool = self.tools[0]
 
 
-class TestMimetypeRangeIndexer(unittest.TestCase):
+class TestMimetypeRangeIndexer(IndexerRangeTest, unittest.TestCase):
+    """Range Mimetype Indexer tests."""
     def setUp(self):
         self.indexer = MimetypeRangeIndexerTest()
         # will play along with the objstorage's mocked contents for now
         self.contents = sorted(self.indexer.objstorage)
         # FIXME: leverage swh.objstorage.in_memory_storage's
         # InMemoryObjStorage, swh.storage.tests's gen_contents, and
         # hypothesis to generate data to actually run indexer on those
 
+        self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
+        self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
+        self.id2 = '103bc087db1d26afc3a0283f38663d081e9b01e6'
         self.expected_results = {
-            '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': {
+            self.id0: {
                 'encoding': b'us-ascii',
-                'id': '01c9379dfc33803963d07c1ccc748d3fe4c96bb5',
+                'id': self.id0,
                 'indexer_configuration_id': 10,
                 'mimetype': b'text/plain'},
-            '02fb2c89e14f7fab46701478c83779c7beb7b069': {
+            self.id1: {
                 'encoding': b'us-ascii',
-                'id': '02fb2c89e14f7fab46701478c83779c7beb7b069',
+                'id': self.id1,
                 'indexer_configuration_id': 10,
                 'mimetype': b'text/x-python'},
-            '103bc087db1d26afc3a0283f38663d081e9b01e6': {
+            self.id2: {
                 'encoding': b'us-ascii',
-                'id': '103bc087db1d26afc3a0283f38663d081e9b01e6',
+                'id': self.id2,
                 'indexer_configuration_id': 10,
                 'mimetype': b'text/plain'}
         }
-
-    def assert_mimetypes_ok(self, start, end, actual_results,
-                            expected_results=None):
-        if expected_results is None:
-            expected_results = self.expected_results
-
-        for mimetype in actual_results:
-            _id = mimetype['id']
-            self.assertEqual(mimetype, expected_results[_id])
-            self.assertTrue(start <= _id and _id <= end)
-            _tool_id = mimetype['indexer_configuration_id']
-            self.assertEqual(_tool_id, self.indexer.tool['id'])
-
-    def test__index_contents(self):
-        """Indexing contents without existing data results in indexed data
-
-        """
-        start, end = [self.contents[0], self.contents[2]]  # output hex ids
-        # given
-        actual_results = list(self.indexer._index_contents(
-            start, end, indexed={}))
-
-        self.assert_mimetypes_ok(start, end, actual_results)
-
-    def test__index_contents_with_indexed_data(self):
-        """Indexing contents with existing data results in less
indexed data - - """ - start, end = [self.contents[0], self.contents[2]] # output hex ids - data_indexed = [ - '01c9379dfc33803963d07c1ccc748d3fe4c96bb5', - '103bc087db1d26afc3a0283f38663d081e9b01e6' - ] - - # given - actual_results = self.indexer._index_contents( - start, end, indexed=set(data_indexed)) - - # craft the expected results - expected_results = self.expected_results.copy() - for already_indexed_key in data_indexed: - expected_results.pop(already_indexed_key) - - self.assert_mimetypes_ok( - start, end, actual_results, expected_results) - - def test_generate_content_mimetype_get(self): - """Optimal indexing should result in indexed data - - """ - start, end = [self.contents[0], self.contents[2]] # output hex ids - # given - actual_results = self.indexer.run(start, end) - - # then - self.assertTrue(actual_results) - - def test_generate_content_mimetype_get_input_as_bytes(self): - """Optimal indexing should result in indexed data - - Input are in bytes here. - - """ - _start, _end = [self.contents[0], self.contents[2]] # output hex ids - start, end = map(hashutil.hash_to_bytes, (_start, _end)) - - # given - actual_results = self.indexer.run( # checks the bytes input this time - start, end, skip_existing=False) # no data so same result - - # then - self.assertTrue(actual_results) - - def test_generate_content_mimetype_get_no_result(self): - """No result indexed returns False""" - start, end = ['0000000000000000000000000000000000000000', - '0000000000000000000000000000000000000001'] - # given - actual_results = self.indexer.run( - start, end, incremental=False) - - # then - self.assertFalse(actual_results) diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py index 8dc958c..6a4e705 100644 --- a/swh/indexer/tests/test_utils.py +++ b/swh/indexer/tests/test_utils.py @@ -1,400 +1,600 @@ -# Copyright (C) 2017 The Software Heritage developers +# Copyright (C) 2017-2018 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.objstorage.exc import ObjNotFoundError +from swh.model import hashutil ORIGINS = [ { 'id': 52189575, 'lister': None, 'project': None, 'type': 'git', 'url': 'https://github.com/SoftwareHeritage/swh-storage'}, { 'id': 4423668, 'lister': None, 'project': None, 'type': 'ftp', 'url': 'rsync://ftp.gnu.org/gnu/3dldf'}, { 'id': 77775770, 'lister': None, 'project': None, 'type': 'deposit', 'url': 'https://forge.softwareheritage.org/source/jesuisgpl/'}, { 'id': 85072327, 'lister': None, 'project': None, 'type': 'pypi', 'url': 'https://pypi.org/project/limnoria/'}, { 'id': 49908349, 'lister': None, 'project': None, 'type': 'svn', 'url': 'http://0-512-md.googlecode.com/svn/'}, { 'id': 54974445, 'lister': None, 'project': None, 'type': 'git', 'url': 'https://github.com/librariesio/yarn-parser'}, ] SNAPSHOTS = { 52189575: { 'branches': { b'refs/heads/add-revision-origin-cache': { 'target': b'L[\xce\x1c\x88\x8eF\t\xf1"\x19\x1e\xfb\xc0' b's\xe7/\xe9l\x1e', 'target_type': 'revision'}, b'HEAD': { 'target': b'8K\x12\x00d\x03\xcc\xe4]bS\xe3\x8f{\xd7}' b'\xac\xefrm', 'target_type': 'revision'}, b'refs/tags/v0.0.103': { 'target': b'\xb6"Im{\xfdLb\xb0\x94N\xea\x96m\x13x\x88+' b'\x0f\xdd', 'target_type': 'release'}, }}, 4423668: { 'branches': { b'3DLDF-1.1.4.tar.gz': { 'target': b'dJ\xfb\x1c\x91\xf4\x82B%]6\xa2\x90|\xd3\xfc' b'"G\x99\x11', 'target_type': 'revision'}, b'3DLDF-2.0.2.tar.gz': { 
'target': b'\xb6\x0e\xe7\x9e9\xac\xaa\x19\x9e=' b'\xd1\xc5\x00\\\xc6\xfc\xe0\xa6\xb4V', 'target_type': 'revision'}, b'3DLDF-2.0.3-examples.tar.gz': { 'target': b'!H\x19\xc0\xee\x82-\x12F1\xbd\x97' b'\xfe\xadZ\x80\x80\xc1\x83\xff', 'target_type': 'revision'}, b'3DLDF-2.0.3.tar.gz': { 'target': b'\x8e\xa9\x8e/\xea}\x9feF\xf4\x9f\xfd\xee' b'\xcc\x1a\xb4`\x8c\x8by', 'target_type': 'revision'}, b'3DLDF-2.0.tar.gz': { 'target': b'F6*\xff(?\x19a\xef\xb6\xc2\x1fv$S\xe3G' b'\xd3\xd1m', b'target_type': 'revision'} }}, 77775770: { 'branches': { b'master': { 'target': b'\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{' b'\xa6\xe9\x99\xb1\x9e]q\xeb', 'target_type': 'revision'} }, 'id': b"h\xc0\xd2a\x04\xd4~'\x8d\xd6\xbe\x07\xeda\xfa\xfbV" b"\x1d\r "}, 85072327: { 'branches': { b'HEAD': { 'target': b'releases/2018.09.09', 'target_type': 'alias'}, b'releases/2018.09.01': { 'target': b'<\xee1(\xe8\x8d_\xc1\xc9\xa6rT\xf1\x1d' b'\xbb\xdfF\xfdw\xcf', 'target_type': 'revision'}, b'releases/2018.09.09': { 'target': b'\x83\xb9\xb6\xc7\x05\xb1%\xd0\xfem\xd8k' b'A\x10\x9d\xc5\xfa2\xf8t', 'target_type': 'revision'}}, 'id': b'{\xda\x8e\x84\x7fX\xff\x92\x80^\x93V\x18\xa3\xfay' b'\x12\x9e\xd6\xb3'}, 49908349: { 'branches': { b'master': { 'target': b'\xe4?r\xe1,\x88\xab\xec\xe7\x9a\x87\xb8' b'\xc9\xad#.\x1bw=\x18', 'target_type': 'revision'}}, 'id': b'\xa1\xa2\x8c\n\xb3\x87\xa8\xf9\xe0a\x8c\xb7' b'\x05\xea\xb8\x1f\xc4H\xf4s'}, 54974445: { 'branches': { b'HEAD': { 'target': b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f', 'target_type': 'revision'}}} } +SHA1_TO_LICENSES = { + '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': ['GPL'], + '02fb2c89e14f7fab46701478c83779c7beb7b069': ['Apache2.0'], + '103bc087db1d26afc3a0283f38663d081e9b01e6': ['MIT'], + '688a5ef812c53907562fe379d4b3851e69c7cb15': ['AGPL'], + 'da39a3ee5e6b4b0d3255bfef95601890afd80709': [], +} + + class MockObjStorage: """Mock an swh-objstorage objstorage with predefined contents. """ data = {} def __init__(self): self.data = { '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': b'this is some text', '688a5ef812c53907562fe379d4b3851e69c7cb15': b'another text', '8986af901dd2043044ce8f0d8fc039153641cf17': b'yet another text', '02fb2c89e14f7fab46701478c83779c7beb7b069': b""" import unittest import logging from swh.indexer.mimetype import ContentMimetypeIndexer from swh.indexer.tests.test_utils import MockObjStorage class MockStorage(): def content_mimetype_add(self, mimetypes): self.state = mimetypes self.conflict_update = conflict_update def indexer_configuration_add(self, tools): return [{ 'id': 10, }] """, '103bc087db1d26afc3a0283f38663d081e9b01e6': b""" #ifndef __AVL__ #define __AVL__ typedef struct _avl_tree avl_tree; typedef struct _data_t { int content; } data_t; """, '93666f74f1cf635c8c8ac118879da6ec5623c410': b""" (should 'pygments (recognize 'lisp 'easily)) """, '26a9f72a7c87cc9205725cfd879f514ff4f3d8d5': b""" { "name": "test_metadata", "version": "0.0.1", "description": "Simple package.json test for indexer", "repository": { "type": "git", "url": "https://github.com/moranegg/metadata_test" } } """, 'd4c647f0fc257591cc9ba1722484229780d1c607': b""" { "version": "5.0.3", "name": "npm", "description": "a package manager for JavaScript", "keywords": [ "install", "modules", "package manager", "package.json" ], "preferGlobal": true, "config": { "publishtest": false }, "homepage": "https://docs.npmjs.com/", "author": "Isaac Z. 
Schlueter (http://blog.izs.me)", "repository": { "type": "git", "url": "https://github.com/npm/npm" }, "bugs": { "url": "https://github.com/npm/npm/issues" }, "dependencies": { "JSONStream": "~1.3.1", "abbrev": "~1.1.0", "ansi-regex": "~2.1.1", "ansicolors": "~0.3.2", "ansistyles": "~0.1.3" }, "devDependencies": { "tacks": "~1.2.6", "tap": "~10.3.2" }, "license": "Artistic-2.0" } """, 'a7ab314d8a11d2c93e3dcf528ca294e7b431c449': b""" """, 'da39a3ee5e6b4b0d3255bfef95601890afd80709': b'', } def __iter__(self): yield from self.data.keys() def __contains__(self, sha1): return self.data.get(sha1) is not None def get(self, sha1): raw_content = self.data.get(sha1) if raw_content is None: raise ObjNotFoundError(sha1) return raw_content class MockIndexerStorage(): """Mock an swh-indexer storage. """ added_data = [] def indexer_configuration_add(self, tools): tool = tools[0] if tool['tool_name'] == 'swh-metadata-translator': return [{ 'id': 30, 'tool_name': 'swh-metadata-translator', 'tool_version': '0.0.1', 'tool_configuration': { 'type': 'local', 'context': 'NpmMapping' }, }] elif tool['tool_name'] == 'swh-metadata-detector': return [{ 'id': 7, 'tool_name': 'swh-metadata-detector', 'tool_version': '0.0.1', 'tool_configuration': { 'type': 'local', 'context': 'NpmMapping' }, }] elif tool['tool_name'] == 'origin-metadata': return [{ 'id': 8, 'tool_name': 'origin-metadata', 'tool_version': '0.0.1', 'tool_configuration': {}, }] else: assert False, 'Unknown tool {tool_name}'.format(**tool) def content_metadata_missing(self, sha1s): yield from [] def content_metadata_add(self, metadata, conflict_update=None): self.added_data.append( ('content_metadata', conflict_update, metadata)) def revision_metadata_add(self, metadata, conflict_update=None): self.added_data.append( ('revision_metadata', conflict_update, metadata)) def origin_intrinsic_metadata_add(self, metadata, conflict_update=None): self.added_data.append( ('origin_intrinsic_metadata', conflict_update, metadata)) def content_metadata_get(self, sha1s): return [{ 'tool': { 'configuration': { 'type': 'local', 'context': 'NpmMapping' }, 'version': '0.0.1', 'id': 6, 'name': 'swh-metadata-translator' }, 'id': b'cde', 'translated_metadata': { '@context': 'https://doi.org/10.5063/schema/codemeta-2.0', 'type': 'SoftwareSourceCode', 'codemeta:issueTracker': 'https://github.com/librariesio/yarn-parser/issues', 'version': '1.0.0', 'name': 'yarn-parser', 'schema:author': 'Andrew Nesbitt', 'url': 'https://github.com/librariesio/yarn-parser#readme', 'processorRequirements': {'node': '7.5'}, 'license': 'AGPL-3.0', 'keywords': ['yarn', 'parse', 'lock', 'dependencies'], 'schema:codeRepository': 'git+https://github.com/librariesio/yarn-parser.git', 'description': 'Tiny web service for parsing yarn.lock files', } }] class MockStorage(): """Mock a real swh-storage storage to simplify reading indexers' outputs. """ def origin_get(self, id_): for origin in ORIGINS: for (k, v) in id_.items(): if origin[k] != v: break else: # This block is run iff we didn't break, ie. if all supplied # parts of the id are set to the expected value. 
                return origin
         assert False, id_
 
     def snapshot_get_latest(self, origin_id):
         if origin_id in SNAPSHOTS:
             return SNAPSHOTS[origin_id]
         else:
             assert False, origin_id
 
     def revision_get(self, revisions):
         return [{
             'id': b'8dbb6aeb036e7fd80664eb8bfd1507881af1ba9f',
             'committer': {
                 'id': 26,
                 'name': b'Andrew Nesbitt',
                 'fullname': b'Andrew Nesbitt <andrewnez@gmail.com>',
                 'email': b'andrewnez@gmail.com'
             },
             'synthetic': False,
             'date': {
                 'negative_utc': False,
                 'timestamp': {
                     'seconds': 1487596456,
                     'microseconds': 0
                 },
                 'offset': 0
             },
             'directory': b'10'
         }]
 
     def directory_ls(self, directory, recursive=False, cur=None):
         # with directory: b'\x9d',
         return [{
             'sha1_git': b'abc',
             'name': b'index.js',
             'target': b'abc',
             'length': 897,
             'status': 'visible',
             'type': 'file',
             'perms': 33188,
             'dir_id': b'10',
             'sha1': b'bcd'
         }, {
             'sha1_git': b'aab',
             'name': b'package.json',
             'target': b'aab',
             'length': 712,
             'status': 'visible',
             'type': 'file',
             'perms': 33188,
             'dir_id': b'10',
             'sha1': b'cde'
         }, {
             'dir_id': b'10',
             'target': b'11',
             'type': 'dir',
             'length': None,
             'name': b'.github',
             'sha1': None,
             'perms': 16384,
             'sha1_git': None,
             'status': None,
             'sha256': None
         }]
+
+
+class BasicMockStorage():
+    """In memory implementation to fake the content_get_range api.
+
+    FIXME: To remove when the actual in-memory lands.
+
+    """
+    contents = []
+
+    def __init__(self, contents):
+        self.contents = contents
+
+    def content_get_range(self, start, end, limit=1000):
+        # normalize bounds to hex so the test data stays consistent with
+        # the actual runtime types; doing this properly would mean
+        # rewriting all tests (that's another task entirely, so not now)
+        if isinstance(start, bytes):
+            start = hashutil.hash_to_hex(start)
+        if isinstance(end, bytes):
+            end = hashutil.hash_to_hex(end)
+        results = []
+        _next_id = None
+        counter = 0
+        for c in self.contents:
+            _id = c['sha1']
+            if start <= _id and _id <= end:
+                results.append(c)
+                if counter >= limit:
+                    break
+                counter += 1
+
+        return {
+            'contents': results,
+            'next': _next_id
+        }
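
Both mock classes normalize bytes bounds to hex before comparing ids as plain strings. That is safe because lexicographic order on lowercase hex strings agrees with the order of the bytes they encode. A tiny self-contained illustration of that invariant (plain Python, no swh dependency):

import os

# Order on raw 20-byte sha1s and on their lowercase hex encodings agree,
# so the mocks may compare hex ids with plain <= operators.
a, b = os.urandom(20), os.urandom(20)
assert (a <= b) == (a.hex() <= b.hex())
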
+ + """ + # to make input test data consilient with actual runtime the + # other way of doing properly things would be to rewrite all + # tests (that's another task entirely so not right now) + if isinstance(start, bytes): + start = hashutil.hash_to_hex(start) + if isinstance(end, bytes): + end = hashutil.hash_to_hex(end) + results = [] + _next = None + counter = 0 + for m in self.state: + _id = m['id'] + _tool_id = m['indexer_configuration_id'] + if (start <= _id and _id <= end and + _tool_id == indexer_configuration_id): + results.append(_id) + if counter >= limit: + break + counter += 1 + + return { + 'ids': results, + 'next': _next + } + + def content_mimetype_get_range( + self, start, end, indexer_configuration_id, limit=1000): + return self._internal_get_range( + start, end, indexer_configuration_id, limit=limit) + + def content_fossology_license_get_range( + self, start, end, indexer_configuration_id, limit=1000): + return self._internal_get_range( + start, end, indexer_configuration_id, limit=limit) + + def indexer_configuration_add(self, tools): + return [{ + 'id': 10, + }] + + +class IndexerRangeTest: + """Allows to factorize tests on range indexer. + + """ + def assert_results_ok(self, start, end, actual_results, + expected_results=None): + if expected_results is None: + expected_results = self.expected_results + + for indexed_data in actual_results: + _id = indexed_data['id'] + self.assertEqual(indexed_data, expected_results[_id]) + self.assertTrue(start <= _id and _id <= end) + _tool_id = indexed_data['indexer_configuration_id'] + self.assertEqual(_tool_id, self.indexer.tool['id']) + + def test__index_contents(self): + """Indexing contents without existing data results in indexed data + + """ + start, end = [self.contents[0], self.contents[2]] # output hex ids + # given + actual_results = list(self.indexer._index_contents( + start, end, indexed={})) + + self.assert_results_ok(start, end, actual_results) + + def test__index_contents_with_indexed_data(self): + """Indexing contents with existing data results in less indexed data + + """ + start, end = [self.contents[0], self.contents[2]] # output hex ids + data_indexed = [self.id0, self.id2] + + # given + actual_results = self.indexer._index_contents( + start, end, indexed=set(data_indexed)) + + # craft the expected results + expected_results = self.expected_results.copy() + for already_indexed_key in data_indexed: + expected_results.pop(already_indexed_key) + + self.assert_results_ok( + start, end, actual_results, expected_results) + + def test_generate_content_mimetype_get(self): + """Optimal indexing should result in indexed data + + """ + start, end = [self.contents[0], self.contents[2]] # output hex ids + # given + actual_results = self.indexer.run(start, end) + + # then + self.assertTrue(actual_results) + + def test_generate_content_mimetype_get_input_as_bytes(self): + """Optimal indexing should result in indexed data + + Input are in bytes here. 
+ + """ + _start, _end = [self.contents[0], self.contents[2]] # output hex ids + start, end = map(hashutil.hash_to_bytes, (_start, _end)) + + # given + actual_results = self.indexer.run( # checks the bytes input this time + start, end, skip_existing=False) # no data so same result + + # then + self.assertTrue(actual_results) + + def test_generate_content_mimetype_get_no_result(self): + """No result indexed returns False""" + start, end = ['0000000000000000000000000000000000000000', + '0000000000000000000000000000000000000001'] + # given + actual_results = self.indexer.run( + start, end, incremental=False) + + # then + self.assertFalse(actual_results)