Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9346113
D670.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
30 KB
Subscribers
None
D670.diff
View Options
diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py
--- a/swh/indexer/fossology_license.py
+++ b/swh/indexer/fossology_license.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2017 The Software Heritage developers
+# Copyright (C) 2016-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -8,50 +8,14 @@
from swh.model import hashutil
-from .indexer import ContentIndexer, DiskIndexer
+from .indexer import ContentIndexer, ContentRangeIndexer, DiskIndexer
-def compute_license(path, log=None):
- """Determine license from file at path.
+class MixinFossologyLicenseIndexer:
+ """Mixin fossology license indexer.
- Args:
- path: filepath to determine the license
-
- Returns:
- A dict with the following keys:
- - licenses ([str]): associated detected licenses to path
- - path (bytes): content filepath
- - tool (str): tool used to compute the output
-
- """
- try:
- properties = subprocess.check_output(['nomossa', path],
- universal_newlines=True)
- if properties:
- res = properties.rstrip().split(' contains license(s) ')
- licenses = res[1].split(',')
-
- return {
- 'licenses': licenses,
- 'path': path,
- }
- except subprocess.CalledProcessError:
- if log:
- from os import path as __path
- log.exception('Problem during license detection for sha1 %s' %
- __path.basename(path))
- return {
- 'licenses': [],
- 'path': path,
- }
-
-
-class ContentFossologyLicenseIndexer(ContentIndexer, DiskIndexer):
- """Indexer in charge of:
- - filtering out content already indexed
- - reading content from objstorage per the content's id (sha1)
- - computing {license, encoding} from that content
- - store result in storage
+ See :class:`ContentFossologyLicenseIndexer` and
+ :class:`FossologyLicenseRangeIndexer`
"""
ADDITIONAL_CONFIG = {
@@ -72,22 +36,45 @@
self.working_directory = self.config['workdir']
self.tool = self.tools[0]
- def filter(self, ids):
- """Filter out known sha1s and return only missing ones.
+ def compute_license(self, path, log=None):
+ """Determine license from file at path.
+
+ Args:
+ path: filepath to determine the license
+
+ Returns:
+ A dict with the following keys:
+ - licenses ([str]): associated detected licenses to path
+ - path (bytes): content filepath
+ - tool (str): tool used to compute the output
"""
- yield from self.idx_storage.content_fossology_license_missing((
- {
- 'id': sha1,
- 'indexer_configuration_id': self.tool['id'],
- } for sha1 in ids
- ))
+ try:
+ properties = subprocess.check_output(['nomossa', path],
+ universal_newlines=True)
+ if properties:
+ res = properties.rstrip().split(' contains license(s) ')
+ licenses = res[1].split(',')
+
+ return {
+ 'licenses': licenses,
+ 'path': path,
+ }
+ except subprocess.CalledProcessError:
+ if log:
+ from os import path as __path
+ log.exception('Problem during license detection for sha1 %s' %
+ __path.basename(path))
+ return {
+ 'licenses': [],
+ 'path': path,
+ }
def index(self, id, data):
"""Index sha1s' content and store result.
Args:
- sha1 (bytes): content's identifier
+ id (bytes): content's identifier
raw_content (bytes): raw content in bytes
Returns:
@@ -97,13 +84,14 @@
- path (bytes): path
"""
- filename = hashutil.hash_to_hex(id)
+ if isinstance(id, str):
+ id = hashutil.hash_to_hex(id)
content_path = self.write_to_temp(
- filename=filename,
+ filename=id,
data=data)
try:
- properties = compute_license(path=content_path, log=self.log)
+ properties = self.compute_license(path=content_path, log=self.log)
properties.update({
'id': id,
'indexer_configuration_id': self.tool['id'],
@@ -130,11 +118,67 @@
results, conflict_update=(policy_update == 'update-dups'))
+class ContentFossologyLicenseIndexer(
+ MixinFossologyLicenseIndexer, DiskIndexer, ContentIndexer):
+ """Indexer in charge of:
+ - filtering out content already indexed
+ - reading content from objstorage per the content's id (sha1)
+    - computing {license} from that content
+ - store result in storage
+
+ """
+ def filter(self, ids):
+ """Filter out known sha1s and return only missing ones.
+
+ """
+ yield from self.idx_storage.content_fossology_license_missing((
+ {
+ 'id': sha1,
+ 'indexer_configuration_id': self.tool['id'],
+ } for sha1 in ids
+ ))
+
+
+class FossologyLicenseRangeIndexer(
+ MixinFossologyLicenseIndexer, DiskIndexer, ContentRangeIndexer):
+ """FossologyLicense Range Indexer working on range of content identifiers.
+
+ It:
+ - filters out the non textual content
+    - (optionally) filters out content already indexed (cf :meth:`indexed_contents_in_range`)
+ - reads content from objstorage per the content's id (sha1)
+    - computes {license} from that content
+ - stores result in storage
+
+ """
+ def indexed_contents_in_range(self, start, end):
+ """Retrieve indexed content id within range [start, end].
+
+        Args:
+ **start** (bytes): Starting bound from range identifier
+ **end** (bytes): End range identifier
+
+ Yields:
+ Content identifier (bytes) present in the range [start, end]
+
+ """
+ while start:
+ result = self.idx_storage.content_fossology_license_get_range(
+ start, end, self.tool['id'])
+ contents = result['ids']
+ for _id in contents:
+ yield _id
+ start = result['next']
+ if start is None:
+ break
+
+
@click.command(help='Compute license for path using tool')
@click.option('--tool', default='nomossa', help="Path to tool")
@click.option('--path', required=1, help="Path to execute index on")
def main(tool, path):
- print(compute_license(tool, path))
+ indexer = ContentFossologyLicenseIndexer()
+ print(indexer.compute_license(tool, path))
if __name__ == '__main__':
diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/tests/test_fossology_license.py
@@ -0,0 +1,227 @@
+# Copyright (C) 2017-2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import unittest
+import logging
+
+from swh.indexer.fossology_license import (
+ ContentFossologyLicenseIndexer, FossologyLicenseRangeIndexer
+)
+
+from swh.indexer.tests.test_utils import (
+ MockObjStorage, BasicMockStorage, BasicMockIndexerStorage,
+ SHA1_TO_LICENSES, IndexerRangeTest
+)
+
+
+class NoDiskIndexer:
+ """Mixin to override the DiskIndexer behavior avoiding side-effects in
+ tests.
+
+ """
+
+ def write_to_temp(self, filename, data): # noop
+ return filename
+
+ def cleanup(self, content_path): # noop
+ return None
+
+
+class InjectLicenseIndexer:
+ """Override license computations.
+
+ """
+ def compute_license(self, path, log=None):
+ """path is the content identifier
+
+ """
+ return {
+ 'licenses': SHA1_TO_LICENSES.get(path)
+ }
+
+
+class FossologyLicenseTestIndexer(
+ NoDiskIndexer, InjectLicenseIndexer, ContentFossologyLicenseIndexer):
+ """Specific mimetype whose configuration is enough to satisfy the
+ indexing tests.
+
+ """
+ def prepare(self):
+ self.config = {
+ 'tools': {
+ 'name': 'nomos',
+ 'version': '3.1.0rc2-31-ga2cbb8c',
+ 'configuration': {
+ 'command_line': 'nomossa <filepath>',
+ },
+ },
+ }
+ self.idx_storage = BasicMockIndexerStorage()
+ self.log = logging.getLogger('swh.indexer')
+ self.objstorage = MockObjStorage()
+ self.tools = self.register_tools(self.config['tools'])
+ self.tool = self.tools[0]
+
+
+class FossologyLicenseIndexerUnknownToolTestStorage(
+ FossologyLicenseTestIndexer):
+ """Specific fossology license indexer whose configuration is not
+ enough to satisfy the indexing checks
+
+ """
+ def prepare(self):
+ super().prepare()
+ self.tools = None
+
+
+class TestFossologyLicenseIndexerWithErrors(unittest.TestCase):
+ def test_wrong_unknown_configuration_tool(self):
+ """Indexer with unknown configuration tool should fail the check"""
+ with self.assertRaisesRegex(ValueError, 'Tools None is unknown'):
+ FossologyLicenseIndexerUnknownToolTestStorage()
+
+
+class TestFossologyLicenseIndexer(unittest.TestCase):
+ """Fossology license tests.
+
+ """
+ def setUp(self):
+ self.indexer = FossologyLicenseTestIndexer()
+
+ def test_index_no_update(self):
+ """Index sha1s results in new computed licenses
+
+ """
+ id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
+ id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15'
+ sha1s = [id0, id1]
+
+ # when
+ self.indexer.run(sha1s, policy_update='ignore-dups')
+
+ # then
+ expected_results = [{
+ 'id': id0,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[id0],
+ }, {
+ 'id': id1,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[id1],
+ }]
+
+ self.assertFalse(self.indexer.idx_storage.conflict_update)
+ self.assertEqual(expected_results, self.indexer.idx_storage.state)
+
+ def test_index_update(self):
+ """Index sha1s results in new computed licenses
+
+ """
+ id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
+ id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15'
+ id2 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709' # empty content
+ sha1s = [id0, id1, id2]
+
+ # when
+ self.indexer.run(sha1s, policy_update='update-dups')
+
+ # then
+ expected_results = [{
+ 'id': id0,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[id0],
+ }, {
+ 'id': id1,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[id1],
+ }, {
+ 'id': id2,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[id2],
+ }]
+
+ self.assertTrue(self.indexer.idx_storage.conflict_update)
+ self.assertEqual(expected_results, self.indexer.idx_storage.state)
+
+ def test_index_one_unknown_sha1(self):
+ """Only existing contents are indexed
+
+ """
+ # given
+ id0 = '688a5ef812c53907562fe379d4b3851e69c7cb15'
+ sha1s = [id0,
+ '799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown
+ '800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown
+
+ # when
+ self.indexer.run(sha1s, policy_update='update-dups')
+
+ # then
+ expected_results = [{
+ 'id': id0,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[id0],
+ }]
+
+ self.assertTrue(self.indexer.idx_storage.conflict_update)
+ self.assertEqual(expected_results, self.indexer.idx_storage.state)
+
+
+class FossologyLicenseRangeIndexerTest(
+ NoDiskIndexer, InjectLicenseIndexer, FossologyLicenseRangeIndexer):
+ """Testing the range indexer on fossology license.
+
+ """
+ def prepare(self):
+ self.config = {
+ 'tools': {
+ 'name': 'nomos',
+ 'version': '3.1.0rc2-31-ga2cbb8c',
+ 'configuration': {
+ 'command_line': 'nomossa <filepath>',
+ },
+ },
+ 'write_batch_size': 100,
+ }
+ self.idx_storage = BasicMockIndexerStorage()
+ self.log = logging.getLogger('swh.indexer')
+ # this hardcodes some contents, will use this to setup the storage
+ self.objstorage = MockObjStorage()
+ # sync objstorage and storage
+ contents = [{'sha1': c_id} for c_id in self.objstorage]
+ self.storage = BasicMockStorage(contents)
+ self.tools = self.register_tools(self.config['tools'])
+ self.tool = self.tools[0]
+
+
+class TestFossologyLicenseRangeIndexer(IndexerRangeTest, unittest.TestCase):
+ def setUp(self):
+ self.indexer = FossologyLicenseRangeIndexerTest()
+ # will play along with the objstorage's mocked contents for now
+ self.contents = sorted(self.indexer.objstorage)
+ # FIXME: leverage swh.objstorage.in_memory_storage's
+ # InMemoryObjStorage, swh.storage.tests's gen_contents, and
+ # hypothesis to generate data to actually run indexer on those
+
+ self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
+ self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
+ self.id2 = '103bc087db1d26afc3a0283f38663d081e9b01e6'
+ self.expected_results = {
+ self.id0: {
+ 'id': self.id0,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[self.id0]
+ },
+ self.id1: {
+ 'id': self.id1,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[self.id1]
+ },
+ self.id2: {
+ 'id': self.id2,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[self.id2]
+ }
+ }
diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py
--- a/swh/indexer/tests/test_mimetype.py
+++ b/swh/indexer/tests/test_mimetype.py
@@ -10,90 +10,9 @@
ContentMimetypeIndexer, MimetypeRangeIndexer
)
-from swh.indexer.tests.test_utils import MockObjStorage
-from swh.model import hashutil
-
-
-class _MockStorage():
- """In memory implementation to fake the content_get_range api.
-
- FIXME: To remove when the actual in-memory lands.
-
- """
- contents = []
-
- def __init__(self, contents):
- self.contents = contents
-
- def content_get_range(self, start, end, limit=1000):
- # to make input test data consilient with actual runtime the
- # other way of doing properly things would be to rewrite all
- # tests (that's another task entirely so not right now)
- if isinstance(start, bytes):
- start = hashutil.hash_to_hex(start)
- if isinstance(end, bytes):
- end = hashutil.hash_to_hex(end)
- results = []
- _next_id = None
- counter = 0
- for c in self.contents:
- _id = c['sha1']
- if start <= _id and _id <= end:
- results.append(c)
- if counter >= limit:
- break
- counter += 1
-
- return {
- 'contents': results,
- 'next': _next_id
- }
-
-
-class _MockIndexerStorage():
- """Mock storage to simplify reading indexers' outputs.
-
- """
- state = []
-
- def content_mimetype_add(self, mimetypes, conflict_update=None):
- self.state = mimetypes
- self.conflict_update = conflict_update
-
- def content_mimetype_get_range(self, start, end, indexer_configuration_id,
- limit=1000):
- """Basic in-memory implementation (limit is unused).
-
- """
- # to make input test data consilient with actual runtime the
- # other way of doing properly things would be to rewrite all
- # tests (that's another task entirely so not right now)
- if isinstance(start, bytes):
- start = hashutil.hash_to_hex(start)
- if isinstance(end, bytes):
- end = hashutil.hash_to_hex(end)
- results = []
- _next = None
- counter = 0
- for m in self.state:
- _id = m['id']
- _tool_id = m['indexer_configuration_id']
- if (start <= _id and _id <= end and
- _tool_id == indexer_configuration_id):
- results.append(_id)
- if counter >= limit:
- break
- counter += 1
-
- return {
- 'ids': results,
- 'next': _next
- }
-
- def indexer_configuration_add(self, tools):
- return [{
- 'id': 10,
- }]
+from swh.indexer.tests.test_utils import (
+ MockObjStorage, BasicMockStorage, BasicMockIndexerStorage, IndexerRangeTest
+)
class MimetypeTestIndexer(ContentMimetypeIndexer):
@@ -112,7 +31,7 @@
},
},
}
- self.idx_storage = _MockIndexerStorage()
+ self.idx_storage = BasicMockIndexerStorage()
self.log = logging.getLogger('swh.indexer')
self.objstorage = MockObjStorage()
self.tools = self.register_tools(self.config['tools'])
@@ -236,18 +155,19 @@
},
'write_batch_size': 100,
}
- self.idx_storage = _MockIndexerStorage()
+ self.idx_storage = BasicMockIndexerStorage()
self.log = logging.getLogger('swh.indexer')
# this hardcodes some contents, will use this to setup the storage
self.objstorage = MockObjStorage()
# sync objstorage and storage
contents = [{'sha1': c_id} for c_id in self.objstorage]
- self.storage = _MockStorage(contents)
+ self.storage = BasicMockStorage(contents)
self.tools = self.register_tools(self.config['tools'])
self.tool = self.tools[0]
-class TestMimetypeRangeIndexer(unittest.TestCase):
+class TestMimetypeRangeIndexer(IndexerRangeTest, unittest.TestCase):
+ """Range Mimetype Indexer tests on """
def setUp(self):
self.indexer = MimetypeRangeIndexerTest()
# will play along with the objstorage's mocked contents for now
@@ -256,103 +176,23 @@
# InMemoryObjStorage, swh.storage.tests's gen_contents, and
# hypothesis to generate data to actually run indexer on those
+ self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
+ self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
+ self.id2 = '103bc087db1d26afc3a0283f38663d081e9b01e6'
self.expected_results = {
- '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': {
+ self.id0: {
'encoding': b'us-ascii',
- 'id': '01c9379dfc33803963d07c1ccc748d3fe4c96bb5',
+ 'id': self.id0,
'indexer_configuration_id': 10,
'mimetype': b'text/plain'},
- '02fb2c89e14f7fab46701478c83779c7beb7b069': {
+ self.id1: {
'encoding': b'us-ascii',
- 'id': '02fb2c89e14f7fab46701478c83779c7beb7b069',
+ 'id': self.id1,
'indexer_configuration_id': 10,
'mimetype': b'text/x-python'},
- '103bc087db1d26afc3a0283f38663d081e9b01e6': {
+ self.id2: {
'encoding': b'us-ascii',
- 'id': '103bc087db1d26afc3a0283f38663d081e9b01e6',
+ 'id': self.id2,
'indexer_configuration_id': 10,
'mimetype': b'text/plain'}
}
-
- def assert_mimetypes_ok(self, start, end, actual_results,
- expected_results=None):
- if expected_results is None:
- expected_results = self.expected_results
-
- for mimetype in actual_results:
- _id = mimetype['id']
- self.assertEqual(mimetype, expected_results[_id])
- self.assertTrue(start <= _id and _id <= end)
- _tool_id = mimetype['indexer_configuration_id']
- self.assertEqual(_tool_id, self.indexer.tool['id'])
-
- def test__index_contents(self):
- """Indexing contents without existing data results in indexed data
-
- """
- start, end = [self.contents[0], self.contents[2]] # output hex ids
- # given
- actual_results = list(self.indexer._index_contents(
- start, end, indexed={}))
-
- self.assert_mimetypes_ok(start, end, actual_results)
-
- def test__index_contents_with_indexed_data(self):
- """Indexing contents with existing data results in less indexed data
-
- """
- start, end = [self.contents[0], self.contents[2]] # output hex ids
- data_indexed = [
- '01c9379dfc33803963d07c1ccc748d3fe4c96bb5',
- '103bc087db1d26afc3a0283f38663d081e9b01e6'
- ]
-
- # given
- actual_results = self.indexer._index_contents(
- start, end, indexed=set(data_indexed))
-
- # craft the expected results
- expected_results = self.expected_results.copy()
- for already_indexed_key in data_indexed:
- expected_results.pop(already_indexed_key)
-
- self.assert_mimetypes_ok(
- start, end, actual_results, expected_results)
-
- def test_generate_content_mimetype_get(self):
- """Optimal indexing should result in indexed data
-
- """
- start, end = [self.contents[0], self.contents[2]] # output hex ids
- # given
- actual_results = self.indexer.run(start, end)
-
- # then
- self.assertTrue(actual_results)
-
- def test_generate_content_mimetype_get_input_as_bytes(self):
- """Optimal indexing should result in indexed data
-
- Input are in bytes here.
-
- """
- _start, _end = [self.contents[0], self.contents[2]] # output hex ids
- start, end = map(hashutil.hash_to_bytes, (_start, _end))
-
- # given
- actual_results = self.indexer.run( # checks the bytes input this time
- start, end, skip_existing=False) # no data so same result
-
- # then
- self.assertTrue(actual_results)
-
- def test_generate_content_mimetype_get_no_result(self):
- """No result indexed returns False"""
- start, end = ['0000000000000000000000000000000000000000',
- '0000000000000000000000000000000000000001']
- # given
- actual_results = self.indexer.run(
- start, end, incremental=False)
-
- # then
- self.assertFalse(actual_results)
diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py
--- a/swh/indexer/tests/test_utils.py
+++ b/swh/indexer/tests/test_utils.py
@@ -1,10 +1,11 @@
-# Copyright (C) 2017 The Software Heritage developers
+# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.objstorage.exc import ObjNotFoundError
+from swh.model import hashutil
ORIGINS = [
{
@@ -124,6 +125,15 @@
}
+SHA1_TO_LICENSES = {
+ '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': ['GPL'],
+ '02fb2c89e14f7fab46701478c83779c7beb7b069': ['Apache2.0'],
+ '103bc087db1d26afc3a0283f38663d081e9b01e6': ['MIT'],
+ '688a5ef812c53907562fe379d4b3851e69c7cb15': ['AGPL'],
+ 'da39a3ee5e6b4b0d3255bfef95601890afd80709': [],
+}
+
+
class MockObjStorage:
"""Mock an swh-objstorage objstorage with predefined contents.
@@ -398,3 +408,193 @@
'status': None,
'sha256': None
}]
+
+
+class BasicMockStorage():
+ """In memory implementation to fake the content_get_range api.
+
+ FIXME: To remove when the actual in-memory lands.
+
+ """
+ contents = []
+
+ def __init__(self, contents):
+ self.contents = contents
+
+ def content_get_range(self, start, end, limit=1000):
+        # to make input test data consistent with actual runtime; the
+        # proper way of doing things would be to rewrite all
+        # tests (that's another task entirely, so not right now)
+ if isinstance(start, bytes):
+ start = hashutil.hash_to_hex(start)
+ if isinstance(end, bytes):
+ end = hashutil.hash_to_hex(end)
+ results = []
+ _next_id = None
+ counter = 0
+ for c in self.contents:
+ _id = c['sha1']
+ if start <= _id and _id <= end:
+ results.append(c)
+ if counter >= limit:
+ break
+ counter += 1
+
+ return {
+ 'contents': results,
+ 'next': _next_id
+ }
+
+
+class BasicMockIndexerStorage():
+ """Mock Indexer storage to simplify reading indexers' outputs.
+
+ """
+ state = []
+
+ def _internal_add(self, data, conflict_update=None):
+ """All content indexer have the same structure. So reuse `data` as the
+ same data. It's either mimetype, language,
+ fossology_license, etc...
+
+ """
+ self.state = data
+ self.conflict_update = conflict_update
+
+ def content_mimetype_add(self, data, conflict_update=None):
+ self._internal_add(data, conflict_update=conflict_update)
+
+ def content_fossology_license_add(self, data, conflict_update=None):
+ self._internal_add(data, conflict_update=conflict_update)
+
+ def _internal_get_range(self, start, end,
+ indexer_configuration_id, limit=1000):
+ """Same logic as _internal_add, we retrieve indexed data given an
+ identifier. So the code here does not change even though
+ the underlying data does.
+
+ """
+        # to make input test data consistent with actual runtime; the
+        # proper way of doing things would be to rewrite all
+        # tests (that's another task entirely, so not right now)
+ if isinstance(start, bytes):
+ start = hashutil.hash_to_hex(start)
+ if isinstance(end, bytes):
+ end = hashutil.hash_to_hex(end)
+ results = []
+ _next = None
+ counter = 0
+ for m in self.state:
+ _id = m['id']
+ _tool_id = m['indexer_configuration_id']
+ if (start <= _id and _id <= end and
+ _tool_id == indexer_configuration_id):
+ results.append(_id)
+ if counter >= limit:
+ break
+ counter += 1
+
+ return {
+ 'ids': results,
+ 'next': _next
+ }
+
+ def content_mimetype_get_range(
+ self, start, end, indexer_configuration_id, limit=1000):
+ return self._internal_get_range(
+ start, end, indexer_configuration_id, limit=limit)
+
+ def content_fossology_license_get_range(
+ self, start, end, indexer_configuration_id, limit=1000):
+ return self._internal_get_range(
+ start, end, indexer_configuration_id, limit=limit)
+
+ def indexer_configuration_add(self, tools):
+ return [{
+ 'id': 10,
+ }]
+
+
+class IndexerRangeTest:
+ """Allows to factorize tests on range indexer.
+
+ """
+ def assert_results_ok(self, start, end, actual_results,
+ expected_results=None):
+ if expected_results is None:
+ expected_results = self.expected_results
+
+ for indexed_data in actual_results:
+ _id = indexed_data['id']
+ self.assertEqual(indexed_data, expected_results[_id])
+ self.assertTrue(start <= _id and _id <= end)
+ _tool_id = indexed_data['indexer_configuration_id']
+ self.assertEqual(_tool_id, self.indexer.tool['id'])
+
+ def test__index_contents(self):
+ """Indexing contents without existing data results in indexed data
+
+ """
+ start, end = [self.contents[0], self.contents[2]] # output hex ids
+ # given
+ actual_results = list(self.indexer._index_contents(
+ start, end, indexed={}))
+
+ self.assert_results_ok(start, end, actual_results)
+
+ def test__index_contents_with_indexed_data(self):
+ """Indexing contents with existing data results in less indexed data
+
+ """
+ start, end = [self.contents[0], self.contents[2]] # output hex ids
+ data_indexed = [self.id0, self.id2]
+
+ # given
+ actual_results = self.indexer._index_contents(
+ start, end, indexed=set(data_indexed))
+
+ # craft the expected results
+ expected_results = self.expected_results.copy()
+ for already_indexed_key in data_indexed:
+ expected_results.pop(already_indexed_key)
+
+ self.assert_results_ok(
+ start, end, actual_results, expected_results)
+
+ def test_generate_content_mimetype_get(self):
+ """Optimal indexing should result in indexed data
+
+ """
+ start, end = [self.contents[0], self.contents[2]] # output hex ids
+ # given
+ actual_results = self.indexer.run(start, end)
+
+ # then
+ self.assertTrue(actual_results)
+
+ def test_generate_content_mimetype_get_input_as_bytes(self):
+ """Optimal indexing should result in indexed data
+
+ Input are in bytes here.
+
+ """
+ _start, _end = [self.contents[0], self.contents[2]] # output hex ids
+ start, end = map(hashutil.hash_to_bytes, (_start, _end))
+
+ # given
+ actual_results = self.indexer.run( # checks the bytes input this time
+ start, end, skip_existing=False) # no data so same result
+
+ # then
+ self.assertTrue(actual_results)
+
+ def test_generate_content_mimetype_get_no_result(self):
+ """No result indexed returns False"""
+ start, end = ['0000000000000000000000000000000000000000',
+ '0000000000000000000000000000000000000001']
+ # given
+ actual_results = self.indexer.run(
+ start, end, incremental=False)
+
+ # then
+ self.assertFalse(actual_results)
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jul 3, 3:44 PM (2 w, 8 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3224997
Attached To
D670: fossology_license: Open new range indexer
Event Timeline
Log In to Comment