Page MenuHomeSoftware Heritage

D670.diff
No OneTemporary

D670.diff

diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py
--- a/swh/indexer/fossology_license.py
+++ b/swh/indexer/fossology_license.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2017 The Software Heritage developers
+# Copyright (C) 2016-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -8,50 +8,14 @@
from swh.model import hashutil
-from .indexer import ContentIndexer, DiskIndexer
+from .indexer import ContentIndexer, ContentRangeIndexer, DiskIndexer
-def compute_license(path, log=None):
- """Determine license from file at path.
+class MixinFossologyLicenseIndexer:
+ """Mixin fossology license indexer.
- Args:
- path: filepath to determine the license
-
- Returns:
- A dict with the following keys:
- - licenses ([str]): associated detected licenses to path
- - path (bytes): content filepath
- - tool (str): tool used to compute the output
-
- """
- try:
- properties = subprocess.check_output(['nomossa', path],
- universal_newlines=True)
- if properties:
- res = properties.rstrip().split(' contains license(s) ')
- licenses = res[1].split(',')
-
- return {
- 'licenses': licenses,
- 'path': path,
- }
- except subprocess.CalledProcessError:
- if log:
- from os import path as __path
- log.exception('Problem during license detection for sha1 %s' %
- __path.basename(path))
- return {
- 'licenses': [],
- 'path': path,
- }
-
-
-class ContentFossologyLicenseIndexer(ContentIndexer, DiskIndexer):
- """Indexer in charge of:
- - filtering out content already indexed
- - reading content from objstorage per the content's id (sha1)
- - computing {license, encoding} from that content
- - store result in storage
+ See :class:`ContentFossologyLicenseIndexer` and
+ :class:`FossologyLicenseRangeIndexer`
"""
ADDITIONAL_CONFIG = {
@@ -72,22 +36,45 @@
self.working_directory = self.config['workdir']
self.tool = self.tools[0]
- def filter(self, ids):
- """Filter out known sha1s and return only missing ones.
+ def compute_license(self, path, log=None):
+ """Determine license from file at path.
+
+ Args:
+ path: filepath to determine the license
+
+ Returns:
+ A dict with the following keys:
+ - licenses ([str]): associated detected licenses to path
+ - path (bytes): content filepath
+ - tool (str): tool used to compute the output
"""
- yield from self.idx_storage.content_fossology_license_missing((
- {
- 'id': sha1,
- 'indexer_configuration_id': self.tool['id'],
- } for sha1 in ids
- ))
+ try:
+ properties = subprocess.check_output(['nomossa', path],
+ universal_newlines=True)
+ if properties:
+ res = properties.rstrip().split(' contains license(s) ')
+ licenses = res[1].split(',')
+
+ return {
+ 'licenses': licenses,
+ 'path': path,
+ }
+ except subprocess.CalledProcessError:
+ if log:
+ from os import path as __path
+ log.exception('Problem during license detection for sha1 %s' %
+ __path.basename(path))
+ return {
+ 'licenses': [],
+ 'path': path,
+ }
def index(self, id, data):
"""Index sha1s' content and store result.
Args:
- sha1 (bytes): content's identifier
+ id (bytes): content's identifier
raw_content (bytes): raw content in bytes
Returns:
@@ -97,13 +84,14 @@
- path (bytes): path
"""
- filename = hashutil.hash_to_hex(id)
+ if isinstance(id, str):
+ id = hashutil.hash_to_hex(id)
content_path = self.write_to_temp(
- filename=filename,
+ filename=id,
data=data)
try:
- properties = compute_license(path=content_path, log=self.log)
+ properties = self.compute_license(path=content_path, log=self.log)
properties.update({
'id': id,
'indexer_configuration_id': self.tool['id'],
@@ -130,11 +118,67 @@
results, conflict_update=(policy_update == 'update-dups'))
+class ContentFossologyLicenseIndexer(
+ MixinFossologyLicenseIndexer, DiskIndexer, ContentIndexer):
+ """Indexer in charge of:
+ - filtering out content already indexed
+ - reading content from objstorage per the content's id (sha1)
+    - computing {license} from that content
+ - store result in storage
+
+ """
+ def filter(self, ids):
+ """Filter out known sha1s and return only missing ones.
+
+ """
+ yield from self.idx_storage.content_fossology_license_missing((
+ {
+ 'id': sha1,
+ 'indexer_configuration_id': self.tool['id'],
+ } for sha1 in ids
+ ))
+
+
+class FossologyLicenseRangeIndexer(
+ MixinFossologyLicenseIndexer, DiskIndexer, ContentRangeIndexer):
+ """FossologyLicense Range Indexer working on range of content identifiers.
+
+ It:
+ - filters out the non textual content
+ - (optionally) filters out content already indexed (cf :callable:`range`)
+ - reads content from objstorage per the content's id (sha1)
+    - computes {license} from that content
+ - stores result in storage
+
+ """
+ def indexed_contents_in_range(self, start, end):
+ """Retrieve indexed content id within range [start, end].
+
+        Args:
+ **start** (bytes): Starting bound from range identifier
+ **end** (bytes): End range identifier
+
+ Yields:
+ Content identifier (bytes) present in the range [start, end]
+
+ """
+ while start:
+ result = self.idx_storage.content_fossology_license_get_range(
+ start, end, self.tool['id'])
+ contents = result['ids']
+ for _id in contents:
+ yield _id
+ start = result['next']
+ if start is None:
+ break
+
+
@click.command(help='Compute license for path using tool')
@click.option('--tool', default='nomossa', help="Path to tool")
@click.option('--path', required=1, help="Path to execute index on")
def main(tool, path):
- print(compute_license(tool, path))
+ indexer = ContentFossologyLicenseIndexer()
+ print(indexer.compute_license(tool, path))
if __name__ == '__main__':
diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/tests/test_fossology_license.py
@@ -0,0 +1,227 @@
+# Copyright (C) 2017-2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import unittest
+import logging
+
+from swh.indexer.fossology_license import (
+ ContentFossologyLicenseIndexer, FossologyLicenseRangeIndexer
+)
+
+from swh.indexer.tests.test_utils import (
+ MockObjStorage, BasicMockStorage, BasicMockIndexerStorage,
+ SHA1_TO_LICENSES, IndexerRangeTest
+)
+
+
+class NoDiskIndexer:
+ """Mixin to override the DiskIndexer behavior avoiding side-effects in
+ tests.
+
+ """
+
+ def write_to_temp(self, filename, data): # noop
+ return filename
+
+ def cleanup(self, content_path): # noop
+ return None
+
+
+class InjectLicenseIndexer:
+ """Override license computations.
+
+ """
+ def compute_license(self, path, log=None):
+ """path is the content identifier
+
+ """
+ return {
+ 'licenses': SHA1_TO_LICENSES.get(path)
+ }
+
+
+class FossologyLicenseTestIndexer(
+ NoDiskIndexer, InjectLicenseIndexer, ContentFossologyLicenseIndexer):
+    """Specific fossology license indexer whose configuration is enough
+    to satisfy the indexing tests.
+
+ """
+ def prepare(self):
+ self.config = {
+ 'tools': {
+ 'name': 'nomos',
+ 'version': '3.1.0rc2-31-ga2cbb8c',
+ 'configuration': {
+ 'command_line': 'nomossa <filepath>',
+ },
+ },
+ }
+ self.idx_storage = BasicMockIndexerStorage()
+ self.log = logging.getLogger('swh.indexer')
+ self.objstorage = MockObjStorage()
+ self.tools = self.register_tools(self.config['tools'])
+ self.tool = self.tools[0]
+
+
+class FossologyLicenseIndexerUnknownToolTestStorage(
+ FossologyLicenseTestIndexer):
+ """Specific fossology license indexer whose configuration is not
+ enough to satisfy the indexing checks
+
+ """
+ def prepare(self):
+ super().prepare()
+ self.tools = None
+
+
+class TestFossologyLicenseIndexerWithErrors(unittest.TestCase):
+ def test_wrong_unknown_configuration_tool(self):
+ """Indexer with unknown configuration tool should fail the check"""
+ with self.assertRaisesRegex(ValueError, 'Tools None is unknown'):
+ FossologyLicenseIndexerUnknownToolTestStorage()
+
+
+class TestFossologyLicenseIndexer(unittest.TestCase):
+ """Fossology license tests.
+
+ """
+ def setUp(self):
+ self.indexer = FossologyLicenseTestIndexer()
+
+ def test_index_no_update(self):
+        """Indexing sha1s results in new computed licenses
+
+ """
+ id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
+ id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15'
+ sha1s = [id0, id1]
+
+ # when
+ self.indexer.run(sha1s, policy_update='ignore-dups')
+
+ # then
+ expected_results = [{
+ 'id': id0,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[id0],
+ }, {
+ 'id': id1,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[id1],
+ }]
+
+ self.assertFalse(self.indexer.idx_storage.conflict_update)
+ self.assertEqual(expected_results, self.indexer.idx_storage.state)
+
+ def test_index_update(self):
+        """Indexing sha1s results in new computed licenses
+
+ """
+ id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
+ id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15'
+ id2 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709' # empty content
+ sha1s = [id0, id1, id2]
+
+ # when
+ self.indexer.run(sha1s, policy_update='update-dups')
+
+ # then
+ expected_results = [{
+ 'id': id0,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[id0],
+ }, {
+ 'id': id1,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[id1],
+ }, {
+ 'id': id2,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[id2],
+ }]
+
+ self.assertTrue(self.indexer.idx_storage.conflict_update)
+ self.assertEqual(expected_results, self.indexer.idx_storage.state)
+
+ def test_index_one_unknown_sha1(self):
+ """Only existing contents are indexed
+
+ """
+ # given
+ id0 = '688a5ef812c53907562fe379d4b3851e69c7cb15'
+ sha1s = [id0,
+ '799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown
+ '800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown
+
+ # when
+ self.indexer.run(sha1s, policy_update='update-dups')
+
+ # then
+ expected_results = [{
+ 'id': id0,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[id0],
+ }]
+
+ self.assertTrue(self.indexer.idx_storage.conflict_update)
+ self.assertEqual(expected_results, self.indexer.idx_storage.state)
+
+
+class FossologyLicenseRangeIndexerTest(
+ NoDiskIndexer, InjectLicenseIndexer, FossologyLicenseRangeIndexer):
+ """Testing the range indexer on fossology license.
+
+ """
+ def prepare(self):
+ self.config = {
+ 'tools': {
+ 'name': 'nomos',
+ 'version': '3.1.0rc2-31-ga2cbb8c',
+ 'configuration': {
+ 'command_line': 'nomossa <filepath>',
+ },
+ },
+ 'write_batch_size': 100,
+ }
+ self.idx_storage = BasicMockIndexerStorage()
+ self.log = logging.getLogger('swh.indexer')
+ # this hardcodes some contents, will use this to setup the storage
+ self.objstorage = MockObjStorage()
+ # sync objstorage and storage
+ contents = [{'sha1': c_id} for c_id in self.objstorage]
+ self.storage = BasicMockStorage(contents)
+ self.tools = self.register_tools(self.config['tools'])
+ self.tool = self.tools[0]
+
+
+class TestFossologyLicenseRangeIndexer(IndexerRangeTest, unittest.TestCase):
+ def setUp(self):
+ self.indexer = FossologyLicenseRangeIndexerTest()
+ # will play along with the objstorage's mocked contents for now
+ self.contents = sorted(self.indexer.objstorage)
+ # FIXME: leverage swh.objstorage.in_memory_storage's
+ # InMemoryObjStorage, swh.storage.tests's gen_contents, and
+ # hypothesis to generate data to actually run indexer on those
+
+ self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
+ self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
+ self.id2 = '103bc087db1d26afc3a0283f38663d081e9b01e6'
+ self.expected_results = {
+ self.id0: {
+ 'id': self.id0,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[self.id0]
+ },
+ self.id1: {
+ 'id': self.id1,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[self.id1]
+ },
+ self.id2: {
+ 'id': self.id2,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[self.id2]
+ }
+ }
diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py
--- a/swh/indexer/tests/test_mimetype.py
+++ b/swh/indexer/tests/test_mimetype.py
@@ -10,90 +10,9 @@
ContentMimetypeIndexer, MimetypeRangeIndexer
)
-from swh.indexer.tests.test_utils import MockObjStorage
-from swh.model import hashutil
-
-
-class _MockStorage():
- """In memory implementation to fake the content_get_range api.
-
- FIXME: To remove when the actual in-memory lands.
-
- """
- contents = []
-
- def __init__(self, contents):
- self.contents = contents
-
- def content_get_range(self, start, end, limit=1000):
- # to make input test data consilient with actual runtime the
- # other way of doing properly things would be to rewrite all
- # tests (that's another task entirely so not right now)
- if isinstance(start, bytes):
- start = hashutil.hash_to_hex(start)
- if isinstance(end, bytes):
- end = hashutil.hash_to_hex(end)
- results = []
- _next_id = None
- counter = 0
- for c in self.contents:
- _id = c['sha1']
- if start <= _id and _id <= end:
- results.append(c)
- if counter >= limit:
- break
- counter += 1
-
- return {
- 'contents': results,
- 'next': _next_id
- }
-
-
-class _MockIndexerStorage():
- """Mock storage to simplify reading indexers' outputs.
-
- """
- state = []
-
- def content_mimetype_add(self, mimetypes, conflict_update=None):
- self.state = mimetypes
- self.conflict_update = conflict_update
-
- def content_mimetype_get_range(self, start, end, indexer_configuration_id,
- limit=1000):
- """Basic in-memory implementation (limit is unused).
-
- """
- # to make input test data consilient with actual runtime the
- # other way of doing properly things would be to rewrite all
- # tests (that's another task entirely so not right now)
- if isinstance(start, bytes):
- start = hashutil.hash_to_hex(start)
- if isinstance(end, bytes):
- end = hashutil.hash_to_hex(end)
- results = []
- _next = None
- counter = 0
- for m in self.state:
- _id = m['id']
- _tool_id = m['indexer_configuration_id']
- if (start <= _id and _id <= end and
- _tool_id == indexer_configuration_id):
- results.append(_id)
- if counter >= limit:
- break
- counter += 1
-
- return {
- 'ids': results,
- 'next': _next
- }
-
- def indexer_configuration_add(self, tools):
- return [{
- 'id': 10,
- }]
+from swh.indexer.tests.test_utils import (
+ MockObjStorage, BasicMockStorage, BasicMockIndexerStorage, IndexerRangeTest
+)
class MimetypeTestIndexer(ContentMimetypeIndexer):
@@ -112,7 +31,7 @@
},
},
}
- self.idx_storage = _MockIndexerStorage()
+ self.idx_storage = BasicMockIndexerStorage()
self.log = logging.getLogger('swh.indexer')
self.objstorage = MockObjStorage()
self.tools = self.register_tools(self.config['tools'])
@@ -236,18 +155,19 @@
},
'write_batch_size': 100,
}
- self.idx_storage = _MockIndexerStorage()
+ self.idx_storage = BasicMockIndexerStorage()
self.log = logging.getLogger('swh.indexer')
# this hardcodes some contents, will use this to setup the storage
self.objstorage = MockObjStorage()
# sync objstorage and storage
contents = [{'sha1': c_id} for c_id in self.objstorage]
- self.storage = _MockStorage(contents)
+ self.storage = BasicMockStorage(contents)
self.tools = self.register_tools(self.config['tools'])
self.tool = self.tools[0]
-class TestMimetypeRangeIndexer(unittest.TestCase):
+class TestMimetypeRangeIndexer(IndexerRangeTest, unittest.TestCase):
+    """Range Mimetype Indexer tests."""
def setUp(self):
self.indexer = MimetypeRangeIndexerTest()
# will play along with the objstorage's mocked contents for now
@@ -256,103 +176,23 @@
# InMemoryObjStorage, swh.storage.tests's gen_contents, and
# hypothesis to generate data to actually run indexer on those
+ self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
+ self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
+ self.id2 = '103bc087db1d26afc3a0283f38663d081e9b01e6'
self.expected_results = {
- '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': {
+ self.id0: {
'encoding': b'us-ascii',
- 'id': '01c9379dfc33803963d07c1ccc748d3fe4c96bb5',
+ 'id': self.id0,
'indexer_configuration_id': 10,
'mimetype': b'text/plain'},
- '02fb2c89e14f7fab46701478c83779c7beb7b069': {
+ self.id1: {
'encoding': b'us-ascii',
- 'id': '02fb2c89e14f7fab46701478c83779c7beb7b069',
+ 'id': self.id1,
'indexer_configuration_id': 10,
'mimetype': b'text/x-python'},
- '103bc087db1d26afc3a0283f38663d081e9b01e6': {
+ self.id2: {
'encoding': b'us-ascii',
- 'id': '103bc087db1d26afc3a0283f38663d081e9b01e6',
+ 'id': self.id2,
'indexer_configuration_id': 10,
'mimetype': b'text/plain'}
}
-
- def assert_mimetypes_ok(self, start, end, actual_results,
- expected_results=None):
- if expected_results is None:
- expected_results = self.expected_results
-
- for mimetype in actual_results:
- _id = mimetype['id']
- self.assertEqual(mimetype, expected_results[_id])
- self.assertTrue(start <= _id and _id <= end)
- _tool_id = mimetype['indexer_configuration_id']
- self.assertEqual(_tool_id, self.indexer.tool['id'])
-
- def test__index_contents(self):
- """Indexing contents without existing data results in indexed data
-
- """
- start, end = [self.contents[0], self.contents[2]] # output hex ids
- # given
- actual_results = list(self.indexer._index_contents(
- start, end, indexed={}))
-
- self.assert_mimetypes_ok(start, end, actual_results)
-
- def test__index_contents_with_indexed_data(self):
- """Indexing contents with existing data results in less indexed data
-
- """
- start, end = [self.contents[0], self.contents[2]] # output hex ids
- data_indexed = [
- '01c9379dfc33803963d07c1ccc748d3fe4c96bb5',
- '103bc087db1d26afc3a0283f38663d081e9b01e6'
- ]
-
- # given
- actual_results = self.indexer._index_contents(
- start, end, indexed=set(data_indexed))
-
- # craft the expected results
- expected_results = self.expected_results.copy()
- for already_indexed_key in data_indexed:
- expected_results.pop(already_indexed_key)
-
- self.assert_mimetypes_ok(
- start, end, actual_results, expected_results)
-
- def test_generate_content_mimetype_get(self):
- """Optimal indexing should result in indexed data
-
- """
- start, end = [self.contents[0], self.contents[2]] # output hex ids
- # given
- actual_results = self.indexer.run(start, end)
-
- # then
- self.assertTrue(actual_results)
-
- def test_generate_content_mimetype_get_input_as_bytes(self):
- """Optimal indexing should result in indexed data
-
- Input are in bytes here.
-
- """
- _start, _end = [self.contents[0], self.contents[2]] # output hex ids
- start, end = map(hashutil.hash_to_bytes, (_start, _end))
-
- # given
- actual_results = self.indexer.run( # checks the bytes input this time
- start, end, skip_existing=False) # no data so same result
-
- # then
- self.assertTrue(actual_results)
-
- def test_generate_content_mimetype_get_no_result(self):
- """No result indexed returns False"""
- start, end = ['0000000000000000000000000000000000000000',
- '0000000000000000000000000000000000000001']
- # given
- actual_results = self.indexer.run(
- start, end, incremental=False)
-
- # then
- self.assertFalse(actual_results)
diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py
--- a/swh/indexer/tests/test_utils.py
+++ b/swh/indexer/tests/test_utils.py
@@ -1,10 +1,11 @@
-# Copyright (C) 2017 The Software Heritage developers
+# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.objstorage.exc import ObjNotFoundError
+from swh.model import hashutil
ORIGINS = [
{
@@ -124,6 +125,15 @@
}
+SHA1_TO_LICENSES = {
+ '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': ['GPL'],
+ '02fb2c89e14f7fab46701478c83779c7beb7b069': ['Apache2.0'],
+ '103bc087db1d26afc3a0283f38663d081e9b01e6': ['MIT'],
+ '688a5ef812c53907562fe379d4b3851e69c7cb15': ['AGPL'],
+ 'da39a3ee5e6b4b0d3255bfef95601890afd80709': [],
+}
+
+
class MockObjStorage:
"""Mock an swh-objstorage objstorage with predefined contents.
@@ -398,3 +408,193 @@
'status': None,
'sha256': None
}]
+
+
+class BasicMockStorage():
+ """In memory implementation to fake the content_get_range api.
+
+ FIXME: To remove when the actual in-memory lands.
+
+ """
+ contents = []
+
+ def __init__(self, contents):
+ self.contents = contents
+
+ def content_get_range(self, start, end, limit=1000):
+        # To make input test data consistent with actual runtime; the
+        # proper way of doing things would be to rewrite all
+        # tests (that's another task entirely, so not right now)
+ if isinstance(start, bytes):
+ start = hashutil.hash_to_hex(start)
+ if isinstance(end, bytes):
+ end = hashutil.hash_to_hex(end)
+ results = []
+ _next_id = None
+ counter = 0
+ for c in self.contents:
+ _id = c['sha1']
+ if start <= _id and _id <= end:
+ results.append(c)
+ if counter >= limit:
+ break
+ counter += 1
+
+ return {
+ 'contents': results,
+ 'next': _next_id
+ }
+
+
+class BasicMockIndexerStorage():
+ """Mock Indexer storage to simplify reading indexers' outputs.
+
+ """
+ state = []
+
+ def _internal_add(self, data, conflict_update=None):
+ """All content indexer have the same structure. So reuse `data` as the
+ same data. It's either mimetype, language,
+ fossology_license, etc...
+
+ """
+ self.state = data
+ self.conflict_update = conflict_update
+
+ def content_mimetype_add(self, data, conflict_update=None):
+ self._internal_add(data, conflict_update=conflict_update)
+
+ def content_fossology_license_add(self, data, conflict_update=None):
+ self._internal_add(data, conflict_update=conflict_update)
+
+ def _internal_get_range(self, start, end,
+ indexer_configuration_id, limit=1000):
+ """Same logic as _internal_add, we retrieve indexed data given an
+ identifier. So the code here does not change even though
+ the underlying data does.
+
+ """
+        # To make input test data consistent with actual runtime; the
+        # proper way of doing things would be to rewrite all
+        # tests (that's another task entirely, so not right now)
+ if isinstance(start, bytes):
+ start = hashutil.hash_to_hex(start)
+ if isinstance(end, bytes):
+ end = hashutil.hash_to_hex(end)
+ results = []
+ _next = None
+ counter = 0
+ for m in self.state:
+ _id = m['id']
+ _tool_id = m['indexer_configuration_id']
+ if (start <= _id and _id <= end and
+ _tool_id == indexer_configuration_id):
+ results.append(_id)
+ if counter >= limit:
+ break
+ counter += 1
+
+ return {
+ 'ids': results,
+ 'next': _next
+ }
+
+ def content_mimetype_get_range(
+ self, start, end, indexer_configuration_id, limit=1000):
+ return self._internal_get_range(
+ start, end, indexer_configuration_id, limit=limit)
+
+ def content_fossology_license_get_range(
+ self, start, end, indexer_configuration_id, limit=1000):
+ return self._internal_get_range(
+ start, end, indexer_configuration_id, limit=limit)
+
+ def indexer_configuration_add(self, tools):
+ return [{
+ 'id': 10,
+ }]
+
+
+class IndexerRangeTest:
+ """Allows to factorize tests on range indexer.
+
+ """
+ def assert_results_ok(self, start, end, actual_results,
+ expected_results=None):
+ if expected_results is None:
+ expected_results = self.expected_results
+
+ for indexed_data in actual_results:
+ _id = indexed_data['id']
+ self.assertEqual(indexed_data, expected_results[_id])
+ self.assertTrue(start <= _id and _id <= end)
+ _tool_id = indexed_data['indexer_configuration_id']
+ self.assertEqual(_tool_id, self.indexer.tool['id'])
+
+ def test__index_contents(self):
+ """Indexing contents without existing data results in indexed data
+
+ """
+ start, end = [self.contents[0], self.contents[2]] # output hex ids
+ # given
+ actual_results = list(self.indexer._index_contents(
+ start, end, indexed={}))
+
+ self.assert_results_ok(start, end, actual_results)
+
+ def test__index_contents_with_indexed_data(self):
+ """Indexing contents with existing data results in less indexed data
+
+ """
+ start, end = [self.contents[0], self.contents[2]] # output hex ids
+ data_indexed = [self.id0, self.id2]
+
+ # given
+ actual_results = self.indexer._index_contents(
+ start, end, indexed=set(data_indexed))
+
+ # craft the expected results
+ expected_results = self.expected_results.copy()
+ for already_indexed_key in data_indexed:
+ expected_results.pop(already_indexed_key)
+
+ self.assert_results_ok(
+ start, end, actual_results, expected_results)
+
+ def test_generate_content_mimetype_get(self):
+ """Optimal indexing should result in indexed data
+
+ """
+ start, end = [self.contents[0], self.contents[2]] # output hex ids
+ # given
+ actual_results = self.indexer.run(start, end)
+
+ # then
+ self.assertTrue(actual_results)
+
+ def test_generate_content_mimetype_get_input_as_bytes(self):
+ """Optimal indexing should result in indexed data
+
+ Input are in bytes here.
+
+ """
+ _start, _end = [self.contents[0], self.contents[2]] # output hex ids
+ start, end = map(hashutil.hash_to_bytes, (_start, _end))
+
+ # given
+ actual_results = self.indexer.run( # checks the bytes input this time
+ start, end, skip_existing=False) # no data so same result
+
+ # then
+ self.assertTrue(actual_results)
+
+ def test_generate_content_mimetype_get_no_result(self):
+ """No result indexed returns False"""
+ start, end = ['0000000000000000000000000000000000000000',
+ '0000000000000000000000000000000000000001']
+ # given
+ actual_results = self.indexer.run(
+ start, end, incremental=False)
+
+ # then
+ self.assertFalse(actual_results)

File Metadata

Mime Type
text/plain
Expires
Thu, Jul 3, 3:44 PM (2 w, 8 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3224997

Event Timeline