Page MenuHomeSoftware Heritage

D670.diff
No OneTemporary

D670.diff

diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py
--- a/swh/indexer/fossology_license.py
+++ b/swh/indexer/fossology_license.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2017 The Software Heritage developers
+# Copyright (C) 2016-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -8,50 +8,14 @@
from swh.model import hashutil
-from .indexer import ContentIndexer, DiskIndexer
+from .indexer import ContentIndexer, ContentRangeIndexer, DiskIndexer
-def compute_license(path, log=None):
- """Determine license from file at path.
+class MixinFossologyLicenseIndexer:
+ """Mixin fossology license indexer.
- Args:
- path: filepath to determine the license
-
- Returns:
- A dict with the following keys:
- - licenses ([str]): associated detected licenses to path
- - path (bytes): content filepath
- - tool (str): tool used to compute the output
-
- """
- try:
- properties = subprocess.check_output(['nomossa', path],
- universal_newlines=True)
- if properties:
- res = properties.rstrip().split(' contains license(s) ')
- licenses = res[1].split(',')
-
- return {
- 'licenses': licenses,
- 'path': path,
- }
- except subprocess.CalledProcessError:
- if log:
- from os import path as __path
- log.exception('Problem during license detection for sha1 %s' %
- __path.basename(path))
- return {
- 'licenses': [],
- 'path': path,
- }
-
-
-class ContentFossologyLicenseIndexer(ContentIndexer, DiskIndexer):
- """Indexer in charge of:
- - filtering out content already indexed
- - reading content from objstorage per the content's id (sha1)
- - computing {license, encoding} from that content
- - store result in storage
+ See :class:`ContentFossologyLicenseIndexer` and
+ :class:`FossologyLicenseRangeIndexer`
"""
ADDITIONAL_CONFIG = {
@@ -72,22 +36,45 @@
self.working_directory = self.config['workdir']
self.tool = self.tools[0]
- def filter(self, ids):
- """Filter out known sha1s and return only missing ones.
+ def compute_license(self, path, log=None):
+ """Determine license from file at path.
+
+ Args:
+ path: filepath to determine the license
+
+ Returns:
+ A dict with the following keys:
+ - licenses ([str]): associated detected licenses to path
+ - path (bytes): content filepath
+ - tool (str): tool used to compute the output
"""
- yield from self.idx_storage.content_fossology_license_missing((
- {
- 'id': sha1,
- 'indexer_configuration_id': self.tool['id'],
- } for sha1 in ids
- ))
+ try:
+ properties = subprocess.check_output(['nomossa', path],
+ universal_newlines=True)
+ if properties:
+ res = properties.rstrip().split(' contains license(s) ')
+ licenses = res[1].split(',')
+
+ return {
+ 'licenses': licenses,
+ 'path': path,
+ }
+ except subprocess.CalledProcessError:
+ if log:
+ from os import path as __path
+ log.exception('Problem during license detection for sha1 %s' %
+ __path.basename(path))
+ return {
+ 'licenses': [],
+ 'path': path,
+ }
def index(self, id, data):
"""Index sha1s' content and store result.
Args:
- sha1 (bytes): content's identifier
+ id (bytes): content's identifier
raw_content (bytes): raw content in bytes
Returns:
@@ -97,13 +84,14 @@
- path (bytes): path
"""
- filename = hashutil.hash_to_hex(id)
+ if isinstance(id, str):
+ id = hashutil.hash_to_hex(id)
content_path = self.write_to_temp(
- filename=filename,
+ filename=id,
data=data)
try:
- properties = compute_license(path=content_path, log=self.log)
+ properties = self.compute_license(path=content_path, log=self.log)
properties.update({
'id': id,
'indexer_configuration_id': self.tool['id'],
@@ -130,11 +118,67 @@
results, conflict_update=(policy_update == 'update-dups'))
+class ContentFossologyLicenseIndexer(
+ MixinFossologyLicenseIndexer, DiskIndexer, ContentIndexer):
+ """Indexer in charge of:
+ - filtering out content already indexed
+ - reading content from objstorage per the content's id (sha1)
+    - computing {license} from that content
+ - store result in storage
+
+ """
+ def filter(self, ids):
+ """Filter out known sha1s and return only missing ones.
+
+ """
+ yield from self.idx_storage.content_fossology_license_missing((
+ {
+ 'id': sha1,
+ 'indexer_configuration_id': self.tool['id'],
+ } for sha1 in ids
+ ))
+
+
+class FossologyLicenseRangeIndexer(
+ MixinFossologyLicenseIndexer, DiskIndexer, ContentRangeIndexer):
+ """FossologyLicense Range Indexer working on range of content identifiers.
+
+ It:
+ - filters out the non textual content
+ - (optionally) filters out content already indexed (cf :callable:`range`)
+ - reads content from objstorage per the content's id (sha1)
+    - computes {license} from that content
+ - stores result in storage
+
+ """
+ def indexed_contents_in_range(self, start, end):
+ """Retrieve indexed content id within range [start, end].
+
+        Args:
+ **start** (bytes): Starting bound from range identifier
+ **end** (bytes): End range identifier
+
+ Yields:
+ Content identifier (bytes) present in the range [start, end]
+
+ """
+ while start:
+ result = self.idx_storage.content_fossology_license_get_range(
+ start, end, self.tool['id'])
+ contents = result['ids']
+ for _id in contents:
+ yield _id
+ start = result['next']
+ if start is None:
+ break
+
+
@click.command(help='Compute license for path using tool')
@click.option('--tool', default='nomossa', help="Path to tool")
@click.option('--path', required=1, help="Path to execute index on")
def main(tool, path):
- print(compute_license(tool, path))
+ indexer = ContentFossologyLicenseIndexer()
+ print(indexer.compute_license(tool, path))
if __name__ == '__main__':
diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py
new file mode 100644
--- /dev/null
+++ b/swh/indexer/tests/test_fossology_license.py
@@ -0,0 +1,227 @@
+# Copyright (C) 2017-2018 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import unittest
+import logging
+
+from swh.indexer.fossology_license import (
+ ContentFossologyLicenseIndexer, FossologyLicenseRangeIndexer
+)
+
+from swh.indexer.tests.test_utils import (
+ MockObjStorage, BasicMockStorage, BasicMockIndexerStorage,
+ SHA1_TO_LICENSES, IndexerRangeTest
+)
+
+
+class NoDiskIndexer:
+ """Mixin to override the DiskIndexer behavior avoiding side-effects in
+ tests.
+
+ """
+
+ def write_to_temp(self, filename, data): # noop
+ return filename
+
+ def cleanup(self, content_path): # noop
+ return None
+
+
+class InjectLicenseIndexer:
+ """Override license computations.
+
+ """
+ def compute_license(self, path, log=None):
+ """path is the content identifier
+
+ """
+ return {
+ 'licenses': SHA1_TO_LICENSES.get(path)
+ }
+
+
+class FossologyLicenseTestIndexer(
+ NoDiskIndexer, InjectLicenseIndexer, ContentFossologyLicenseIndexer):
+    """Specific fossology license indexer whose configuration is enough
+    to satisfy the indexing tests.
+
+ """
+ def prepare(self):
+ self.config = {
+ 'tools': {
+ 'name': 'nomos',
+ 'version': '3.1.0rc2-31-ga2cbb8c',
+ 'configuration': {
+ 'command_line': 'nomossa <filepath>',
+ },
+ },
+ }
+ self.idx_storage = BasicMockIndexerStorage()
+ self.log = logging.getLogger('swh.indexer')
+ self.objstorage = MockObjStorage()
+ self.tools = self.register_tools(self.config['tools'])
+ self.tool = self.tools[0]
+
+
+class FossologyLicenseIndexerUnknownToolTestStorage(
+ FossologyLicenseTestIndexer):
+ """Specific fossology license indexer whose configuration is not
+ enough to satisfy the indexing checks
+
+ """
+ def prepare(self):
+ super().prepare()
+ self.tools = None
+
+
+class TestFossologyLicenseIndexerWithErrors(unittest.TestCase):
+ def test_wrong_unknown_configuration_tool(self):
+ """Indexer with unknown configuration tool should fail the check"""
+ with self.assertRaisesRegex(ValueError, 'Tools None is unknown'):
+ FossologyLicenseIndexerUnknownToolTestStorage()
+
+
+class TestFossologyLicenseIndexer(unittest.TestCase):
+ """Fossology license tests.
+
+ """
+ def setUp(self):
+ self.indexer = FossologyLicenseTestIndexer()
+
+ def test_index_no_update(self):
+        """Indexing sha1s results in new computed licenses
+
+ """
+ id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
+ id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15'
+ sha1s = [id0, id1]
+
+ # when
+ self.indexer.run(sha1s, policy_update='ignore-dups')
+
+ # then
+ expected_results = [{
+ 'id': id0,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[id0],
+ }, {
+ 'id': id1,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[id1],
+ }]
+
+ self.assertFalse(self.indexer.idx_storage.conflict_update)
+ self.assertEqual(expected_results, self.indexer.idx_storage.state)
+
+ def test_index_update(self):
+        """Indexing sha1s results in new computed licenses
+
+ """
+ id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
+ id1 = '688a5ef812c53907562fe379d4b3851e69c7cb15'
+ id2 = 'da39a3ee5e6b4b0d3255bfef95601890afd80709' # empty content
+ sha1s = [id0, id1, id2]
+
+ # when
+ self.indexer.run(sha1s, policy_update='update-dups')
+
+ # then
+ expected_results = [{
+ 'id': id0,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[id0],
+ }, {
+ 'id': id1,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[id1],
+ }, {
+ 'id': id2,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[id2],
+ }]
+
+ self.assertTrue(self.indexer.idx_storage.conflict_update)
+ self.assertEqual(expected_results, self.indexer.idx_storage.state)
+
+ def test_index_one_unknown_sha1(self):
+ """Only existing contents are indexed
+
+ """
+ # given
+ id0 = '688a5ef812c53907562fe379d4b3851e69c7cb15'
+ sha1s = [id0,
+ '799a5ef812c53907562fe379d4b3851e69c7cb15', # unknown
+ '800a5ef812c53907562fe379d4b3851e69c7cb15'] # unknown
+
+ # when
+ self.indexer.run(sha1s, policy_update='update-dups')
+
+ # then
+ expected_results = [{
+ 'id': id0,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[id0],
+ }]
+
+ self.assertTrue(self.indexer.idx_storage.conflict_update)
+ self.assertEqual(expected_results, self.indexer.idx_storage.state)
+
+
+class FossologyLicenseRangeIndexerTest(
+ NoDiskIndexer, InjectLicenseIndexer, FossologyLicenseRangeIndexer):
+ """Testing the range indexer on fossology license.
+
+ """
+ def prepare(self):
+ self.config = {
+ 'tools': {
+ 'name': 'nomos',
+ 'version': '3.1.0rc2-31-ga2cbb8c',
+ 'configuration': {
+ 'command_line': 'nomossa <filepath>',
+ },
+ },
+ 'write_batch_size': 100,
+ }
+ self.idx_storage = BasicMockIndexerStorage()
+ self.log = logging.getLogger('swh.indexer')
+ # this hardcodes some contents, will use this to setup the storage
+ self.objstorage = MockObjStorage()
+ # sync objstorage and storage
+ contents = [{'sha1': c_id} for c_id in self.objstorage]
+ self.storage = BasicMockStorage(contents)
+ self.tools = self.register_tools(self.config['tools'])
+ self.tool = self.tools[0]
+
+
+class TestFossologyLicenseRangeIndexer(IndexerRangeTest, unittest.TestCase):
+ def setUp(self):
+ self.indexer = FossologyLicenseRangeIndexerTest()
+ # will play along with the objstorage's mocked contents for now
+ self.contents = sorted(self.indexer.objstorage)
+ # FIXME: leverage swh.objstorage.in_memory_storage's
+ # InMemoryObjStorage, swh.storage.tests's gen_contents, and
+ # hypothesis to generate data to actually run indexer on those
+
+ self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
+ self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
+ self.id2 = '103bc087db1d26afc3a0283f38663d081e9b01e6'
+ self.expected_results = {
+ self.id0: {
+ 'id': self.id0,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[self.id0]
+ },
+ self.id1: {
+ 'id': self.id1,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[self.id1]
+ },
+ self.id2: {
+ 'id': self.id2,
+ 'indexer_configuration_id': 10,
+ 'licenses': SHA1_TO_LICENSES[self.id2]
+ }
+ }
diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py
--- a/swh/indexer/tests/test_mimetype.py
+++ b/swh/indexer/tests/test_mimetype.py
@@ -10,90 +10,9 @@
ContentMimetypeIndexer, MimetypeRangeIndexer
)
-from swh.indexer.tests.test_utils import MockObjStorage
-from swh.model import hashutil
-
-
-class _MockStorage():
- """In memory implementation to fake the content_get_range api.
-
- FIXME: To remove when the actual in-memory lands.
-
- """
- contents = []
-
- def __init__(self, contents):
- self.contents = contents
-
- def content_get_range(self, start, end, limit=1000):
- # to make input test data consilient with actual runtime the
- # other way of doing properly things would be to rewrite all
- # tests (that's another task entirely so not right now)
- if isinstance(start, bytes):
- start = hashutil.hash_to_hex(start)
- if isinstance(end, bytes):
- end = hashutil.hash_to_hex(end)
- results = []
- _next_id = None
- counter = 0
- for c in self.contents:
- _id = c['sha1']
- if start <= _id and _id <= end:
- results.append(c)
- if counter >= limit:
- break
- counter += 1
-
- return {
- 'contents': results,
- 'next': _next_id
- }
-
-
-class _MockIndexerStorage():
- """Mock storage to simplify reading indexers' outputs.
-
- """
- state = []
-
- def content_mimetype_add(self, mimetypes, conflict_update=None):
- self.state = mimetypes
- self.conflict_update = conflict_update
-
- def content_mimetype_get_range(self, start, end, indexer_configuration_id,
- limit=1000):
- """Basic in-memory implementation (limit is unused).
-
- """
- # to make input test data consilient with actual runtime the
- # other way of doing properly things would be to rewrite all
- # tests (that's another task entirely so not right now)
- if isinstance(start, bytes):
- start = hashutil.hash_to_hex(start)
- if isinstance(end, bytes):
- end = hashutil.hash_to_hex(end)
- results = []
- _next = None
- counter = 0
- for m in self.state:
- _id = m['id']
- _tool_id = m['indexer_configuration_id']
- if (start <= _id and _id <= end and
- _tool_id == indexer_configuration_id):
- results.append(_id)
- if counter >= limit:
- break
- counter += 1
-
- return {
- 'ids': results,
- 'next': _next
- }
-
- def indexer_configuration_add(self, tools):
- return [{
- 'id': 10,
- }]
+from swh.indexer.tests.test_utils import (
+ MockObjStorage, BasicMockStorage, BasicMockIndexerStorage, IndexerRangeTest
+)
class MimetypeTestIndexer(ContentMimetypeIndexer):
@@ -112,7 +31,7 @@
},
},
}
- self.idx_storage = _MockIndexerStorage()
+ self.idx_storage = BasicMockIndexerStorage()
self.log = logging.getLogger('swh.indexer')
self.objstorage = MockObjStorage()
self.tools = self.register_tools(self.config['tools'])
@@ -236,18 +155,19 @@
},
'write_batch_size': 100,
}
- self.idx_storage = _MockIndexerStorage()
+ self.idx_storage = BasicMockIndexerStorage()
self.log = logging.getLogger('swh.indexer')
# this hardcodes some contents, will use this to setup the storage
self.objstorage = MockObjStorage()
# sync objstorage and storage
contents = [{'sha1': c_id} for c_id in self.objstorage]
- self.storage = _MockStorage(contents)
+ self.storage = BasicMockStorage(contents)
self.tools = self.register_tools(self.config['tools'])
self.tool = self.tools[0]
-class TestMimetypeRangeIndexer(unittest.TestCase):
+class TestMimetypeRangeIndexer(IndexerRangeTest, unittest.TestCase):
+    """Range Mimetype Indexer tests."""
def setUp(self):
self.indexer = MimetypeRangeIndexerTest()
# will play along with the objstorage's mocked contents for now
@@ -256,103 +176,23 @@
# InMemoryObjStorage, swh.storage.tests's gen_contents, and
# hypothesis to generate data to actually run indexer on those
+ self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
+ self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
+ self.id2 = '103bc087db1d26afc3a0283f38663d081e9b01e6'
self.expected_results = {
- '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': {
+ self.id0: {
'encoding': b'us-ascii',
- 'id': '01c9379dfc33803963d07c1ccc748d3fe4c96bb5',
+ 'id': self.id0,
'indexer_configuration_id': 10,
'mimetype': b'text/plain'},
- '02fb2c89e14f7fab46701478c83779c7beb7b069': {
+ self.id1: {
'encoding': b'us-ascii',
- 'id': '02fb2c89e14f7fab46701478c83779c7beb7b069',
+ 'id': self.id1,
'indexer_configuration_id': 10,
'mimetype': b'text/x-python'},
- '103bc087db1d26afc3a0283f38663d081e9b01e6': {
+ self.id2: {
'encoding': b'us-ascii',
- 'id': '103bc087db1d26afc3a0283f38663d081e9b01e6',
+ 'id': self.id2,
'indexer_configuration_id': 10,
'mimetype': b'text/plain'}
}
-
- def assert_mimetypes_ok(self, start, end, actual_results,
- expected_results=None):
- if expected_results is None:
- expected_results = self.expected_results
-
- for mimetype in actual_results:
- _id = mimetype['id']
- self.assertEqual(mimetype, expected_results[_id])
- self.assertTrue(start <= _id and _id <= end)
- _tool_id = mimetype['indexer_configuration_id']
- self.assertEqual(_tool_id, self.indexer.tool['id'])
-
- def test__index_contents(self):
- """Indexing contents without existing data results in indexed data
-
- """
- start, end = [self.contents[0], self.contents[2]] # output hex ids
- # given
- actual_results = list(self.indexer._index_contents(
- start, end, indexed={}))
-
- self.assert_mimetypes_ok(start, end, actual_results)
-
- def test__index_contents_with_indexed_data(self):
- """Indexing contents with existing data results in less indexed data
-
- """
- start, end = [self.contents[0], self.contents[2]] # output hex ids
- data_indexed = [
- '01c9379dfc33803963d07c1ccc748d3fe4c96bb5',
- '103bc087db1d26afc3a0283f38663d081e9b01e6'
- ]
-
- # given
- actual_results = self.indexer._index_contents(
- start, end, indexed=set(data_indexed))
-
- # craft the expected results
- expected_results = self.expected_results.copy()
- for already_indexed_key in data_indexed:
- expected_results.pop(already_indexed_key)
-
- self.assert_mimetypes_ok(
- start, end, actual_results, expected_results)
-
- def test_generate_content_mimetype_get(self):
- """Optimal indexing should result in indexed data
-
- """
- start, end = [self.contents[0], self.contents[2]] # output hex ids
- # given
- actual_results = self.indexer.run(start, end)
-
- # then
- self.assertTrue(actual_results)
-
- def test_generate_content_mimetype_get_input_as_bytes(self):
- """Optimal indexing should result in indexed data
-
- Input are in bytes here.
-
- """
- _start, _end = [self.contents[0], self.contents[2]] # output hex ids
- start, end = map(hashutil.hash_to_bytes, (_start, _end))
-
- # given
- actual_results = self.indexer.run( # checks the bytes input this time
- start, end, skip_existing=False) # no data so same result
-
- # then
- self.assertTrue(actual_results)
-
- def test_generate_content_mimetype_get_no_result(self):
- """No result indexed returns False"""
- start, end = ['0000000000000000000000000000000000000000',
- '0000000000000000000000000000000000000001']
- # given
- actual_results = self.indexer.run(
- start, end, incremental=False)
-
- # then
- self.assertFalse(actual_results)
diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py
--- a/swh/indexer/tests/test_utils.py
+++ b/swh/indexer/tests/test_utils.py
@@ -1,10 +1,11 @@
-# Copyright (C) 2017 The Software Heritage developers
+# Copyright (C) 2017-2018 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.objstorage.exc import ObjNotFoundError
+from swh.model import hashutil
ORIGINS = [
{
@@ -124,6 +125,15 @@
}
+SHA1_TO_LICENSES = {
+ '01c9379dfc33803963d07c1ccc748d3fe4c96bb5': ['GPL'],
+ '02fb2c89e14f7fab46701478c83779c7beb7b069': ['Apache2.0'],
+ '103bc087db1d26afc3a0283f38663d081e9b01e6': ['MIT'],
+ '688a5ef812c53907562fe379d4b3851e69c7cb15': ['AGPL'],
+ 'da39a3ee5e6b4b0d3255bfef95601890afd80709': [],
+}
+
+
class MockObjStorage:
"""Mock an swh-objstorage objstorage with predefined contents.
@@ -398,3 +408,193 @@
'status': None,
'sha256': None
}]
+
+
+class BasicMockStorage():
+ """In memory implementation to fake the content_get_range api.
+
+ FIXME: To remove when the actual in-memory lands.
+
+ """
+ contents = []
+
+ def __init__(self, contents):
+ self.contents = contents
+
+ def content_get_range(self, start, end, limit=1000):
+        # To make input test data consistent with actual runtime; the
+        # proper way of doing things would be to rewrite all
+        # tests (that's another task entirely, so not right now)
+ if isinstance(start, bytes):
+ start = hashutil.hash_to_hex(start)
+ if isinstance(end, bytes):
+ end = hashutil.hash_to_hex(end)
+ results = []
+ _next_id = None
+ counter = 0
+ for c in self.contents:
+ _id = c['sha1']
+ if start <= _id and _id <= end:
+ results.append(c)
+ if counter >= limit:
+ break
+ counter += 1
+
+ return {
+ 'contents': results,
+ 'next': _next_id
+ }
+
+
+class BasicMockIndexerStorage():
+ """Mock Indexer storage to simplify reading indexers' outputs.
+
+ """
+ state = []
+
+ def _internal_add(self, data, conflict_update=None):
+ """All content indexer have the same structure. So reuse `data` as the
+ same data. It's either mimetype, language,
+ fossology_license, etc...
+
+ """
+ self.state = data
+ self.conflict_update = conflict_update
+
+ def content_mimetype_add(self, data, conflict_update=None):
+ self._internal_add(data, conflict_update=conflict_update)
+
+ def content_fossology_license_add(self, data, conflict_update=None):
+ self._internal_add(data, conflict_update=conflict_update)
+
+ def _internal_get_range(self, start, end,
+ indexer_configuration_id, limit=1000):
+ """Same logic as _internal_add, we retrieve indexed data given an
+ identifier. So the code here does not change even though
+ the underlying data does.
+
+ """
+        # To make input test data consistent with actual runtime; the
+        # proper way of doing things would be to rewrite all
+        # tests (that's another task entirely, so not right now)
+ if isinstance(start, bytes):
+ start = hashutil.hash_to_hex(start)
+ if isinstance(end, bytes):
+ end = hashutil.hash_to_hex(end)
+ results = []
+ _next = None
+ counter = 0
+ for m in self.state:
+ _id = m['id']
+ _tool_id = m['indexer_configuration_id']
+ if (start <= _id and _id <= end and
+ _tool_id == indexer_configuration_id):
+ results.append(_id)
+ if counter >= limit:
+ break
+ counter += 1
+
+ return {
+ 'ids': results,
+ 'next': _next
+ }
+
+ def content_mimetype_get_range(
+ self, start, end, indexer_configuration_id, limit=1000):
+ return self._internal_get_range(
+ start, end, indexer_configuration_id, limit=limit)
+
+ def content_fossology_license_get_range(
+ self, start, end, indexer_configuration_id, limit=1000):
+ return self._internal_get_range(
+ start, end, indexer_configuration_id, limit=limit)
+
+ def indexer_configuration_add(self, tools):
+ return [{
+ 'id': 10,
+ }]
+
+
+class IndexerRangeTest:
+ """Allows to factorize tests on range indexer.
+
+ """
+ def assert_results_ok(self, start, end, actual_results,
+ expected_results=None):
+ if expected_results is None:
+ expected_results = self.expected_results
+
+ for indexed_data in actual_results:
+ _id = indexed_data['id']
+ self.assertEqual(indexed_data, expected_results[_id])
+ self.assertTrue(start <= _id and _id <= end)
+ _tool_id = indexed_data['indexer_configuration_id']
+ self.assertEqual(_tool_id, self.indexer.tool['id'])
+
+ def test__index_contents(self):
+ """Indexing contents without existing data results in indexed data
+
+ """
+ start, end = [self.contents[0], self.contents[2]] # output hex ids
+ # given
+ actual_results = list(self.indexer._index_contents(
+ start, end, indexed={}))
+
+ self.assert_results_ok(start, end, actual_results)
+
+ def test__index_contents_with_indexed_data(self):
+ """Indexing contents with existing data results in less indexed data
+
+ """
+ start, end = [self.contents[0], self.contents[2]] # output hex ids
+ data_indexed = [self.id0, self.id2]
+
+ # given
+ actual_results = self.indexer._index_contents(
+ start, end, indexed=set(data_indexed))
+
+ # craft the expected results
+ expected_results = self.expected_results.copy()
+ for already_indexed_key in data_indexed:
+ expected_results.pop(already_indexed_key)
+
+ self.assert_results_ok(
+ start, end, actual_results, expected_results)
+
+ def test_generate_content_mimetype_get(self):
+ """Optimal indexing should result in indexed data
+
+ """
+ start, end = [self.contents[0], self.contents[2]] # output hex ids
+ # given
+ actual_results = self.indexer.run(start, end)
+
+ # then
+ self.assertTrue(actual_results)
+
+ def test_generate_content_mimetype_get_input_as_bytes(self):
+ """Optimal indexing should result in indexed data
+
+ Input are in bytes here.
+
+ """
+ _start, _end = [self.contents[0], self.contents[2]] # output hex ids
+ start, end = map(hashutil.hash_to_bytes, (_start, _end))
+
+ # given
+ actual_results = self.indexer.run( # checks the bytes input this time
+ start, end, skip_existing=False) # no data so same result
+
+ # then
+ self.assertTrue(actual_results)
+
+ def test_generate_content_mimetype_get_no_result(self):
+ """No result indexed returns False"""
+ start, end = ['0000000000000000000000000000000000000000',
+ '0000000000000000000000000000000000000001']
+ # given
+ actual_results = self.indexer.run(
+ start, end, incremental=False)
+
+ # then
+ self.assertFalse(actual_results)

File Metadata

Mime Type
text/plain
Expires
Thu, Jul 3, 3:44 PM (2 w, 8 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3224997

Event Timeline