D788.id2496.diff
diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py
--- a/swh/indexer/fossology_license.py
+++ b/swh/indexer/fossology_license.py
@@ -104,6 +104,7 @@
- indexer_configuration_id (int): tool used to compute the output
"""
+ assert isinstance(id, bytes)
content_path = self.write_to_temp(
filename=hashutil.hash_to_hex(id), # use the id as pathname
data=data)
diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -405,11 +405,14 @@
bytes: Identifier of contents to index.
"""
+ if not isinstance(start, bytes) or not isinstance(end, bytes):
+ raise TypeError('identifiers must be bytes, not %r and %r.' %
+ (start, end))
while start:
result = self.storage.content_get_range(start, end)
contents = result['contents']
for c in contents:
- _id = c['sha1']
+ _id = hashutil.hash_to_bytes(c['sha1'])
if _id in indexed:
continue
yield _id
@@ -435,6 +438,10 @@
hashutil.hash_to_hex(sha1))
continue
res = self.index(sha1, raw_content, **kwargs)
+ if not isinstance(res['id'], bytes):
+ raise TypeError(
+ '%r.index should return ids as bytes, not %r' %
+ (self.__class__.__name__, res['id']))
if res:
yield res
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -191,6 +191,31 @@
"""
yield from self._mimetypes.missing(mimetypes)
+ def content_mimetype_get_range(
+ self, start, end, indexer_configuration_id, limit=1000):
+ """Retrieve mimetypes within range [start, end] bound by limit.
+
+ Args:
+ **start** (bytes): Starting identifier range (expected smaller
+ than end)
+ **end** (bytes): Ending identifier range (expected larger
+ than start)
+ **indexer_configuration_id** (int): The tool used to index data
+ **limit** (int): Limit result (default to 1000)
+
+ Raises:
+ ValueError for limit to None
+
+ Returns:
+ a dict with keys:
+ - **ids** [bytes]: iterable of content ids within the range.
+ - **next** (Optional[bytes]): The next range of sha1 starts at
+ this sha1 if any
+
+ """
+ return self._mimetypes.get_range(
+ start, end, indexer_configuration_id, limit)
+
def content_mimetype_add(self, mimetypes, conflict_update=False):
"""Add mimetypes not present in storage.
@@ -207,6 +232,8 @@
default)
"""
+ if not all(isinstance(x['id'], bytes) for x in mimetypes):
+ raise TypeError('identifiers must be bytes.')
self._mimetypes.add(mimetypes, conflict_update)
def content_mimetype_get(self, ids, db=None, cur=None):
@@ -281,6 +308,8 @@
results
"""
+ if not all(isinstance(x['id'], bytes) for x in ctags):
+ raise TypeError('identifiers must be bytes.')
self._content_ctags.add_merge(ctags, conflict_update, 'ctags')
def content_ctags_search(self, expression,
@@ -351,6 +380,8 @@
list: content_license entries which failed due to unknown licenses
"""
+ if not all(isinstance(x['id'], bytes) for x in licenses):
+ raise TypeError('identifiers must be bytes.')
self._licenses.add_merge(licenses, conflict_update, 'licenses')
def content_fossology_license_get_range(
@@ -425,6 +456,8 @@
or skip duplicates (false, the default)
"""
+ if not all(isinstance(x['id'], bytes) for x in metadata):
+ raise TypeError('identifiers must be bytes.')
self._content_metadata.add(metadata, conflict_update)
def revision_metadata_missing(self, metadata):
@@ -473,6 +506,8 @@
or skip duplicates (false, the default)
"""
+ if not all(isinstance(x['id'], bytes) for x in metadata):
+ raise TypeError('identifiers must be bytes.')
self._revision_metadata.add(metadata, conflict_update)
def indexer_configuration_add(self, tools):
diff --git a/swh/indexer/tests/storage/test_in_memory.py b/swh/indexer/tests/storage/test_in_memory.py
--- a/swh/indexer/tests/storage/test_in_memory.py
+++ b/swh/indexer/tests/storage/test_in_memory.py
@@ -63,15 +63,3 @@
@pytest.mark.xfail
def test_indexer_configuration_metadata_get(self):
pass
-
- @pytest.mark.xfail
- def test_generate_content_mimetype_get_range_limit_none(self):
- pass
-
- @pytest.mark.xfail
- def test_generate_content_mimetype_get_range_no_limit(self, mimetypes):
- pass
-
- @pytest.mark.xfail
- def test_generate_content_mimetype_get_range_limit(self, mimetypes):
- pass
diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py
--- a/swh/indexer/tests/test_fossology_license.py
+++ b/swh/indexer/tests/test_fossology_license.py
@@ -4,7 +4,6 @@
# See top-level LICENSE file for more information
import unittest
-import logging
from unittest.mock import patch
@@ -14,10 +13,9 @@
)
from swh.indexer.tests.test_utils import (
- MockObjStorage, BasicMockStorage, BasicMockIndexerStorage,
SHA1_TO_LICENSES, CommonContentIndexerTest, CommonContentIndexerRangeTest,
CommonIndexerWithErrorsTest, CommonIndexerNoTool, NoDiskIndexer,
- BASE_TEST_CONFIG
+ BASE_TEST_CONFIG, fill_storage, fill_obj_storage
)
@@ -91,6 +89,7 @@
yield from self.idx_storage.content_ctags_get(ids)
def setUp(self):
+ super().setUp()
self.indexer = FossologyLicenseTestIndexer()
self.idx_storage = self.indexer.idx_storage
@@ -137,15 +136,6 @@
'write_batch_size': 100,
}
- def prepare(self):
- super().prepare()
- self.idx_storage = BasicMockIndexerStorage()
- self.log = logging.getLogger('swh.indexer')
- # this hardcodes some contents, will use this to setup the storage
- self.objstorage = MockObjStorage()
- contents = [{'sha1': c_id} for c_id in self.objstorage]
- self.storage = BasicMockStorage(contents)
-
class TestFossologyLicenseRangeIndexer(
CommonContentIndexerRangeTest, unittest.TestCase):
@@ -158,12 +148,10 @@
"""
def setUp(self):
+ super().setUp()
self.indexer = FossologyLicenseRangeIndexerTest()
- # will play along with the objstorage's mocked contents for now
- self.contents = sorted(self.indexer.objstorage)
- # FIXME: leverage swh.objstorage.in_memory_storage's
- # InMemoryObjStorage, swh.storage.tests's gen_contents, and
- # hypothesis to generate data to actually run indexer on those
+ fill_storage(self.indexer.storage)
+ fill_obj_storage(self.indexer.objstorage)
self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py
--- a/swh/indexer/tests/test_mimetype.py
+++ b/swh/indexer/tests/test_mimetype.py
@@ -12,10 +12,9 @@
)
from swh.indexer.tests.test_utils import (
- MockObjStorage, BasicMockStorage, BasicMockIndexerStorage,
CommonContentIndexerTest, CommonContentIndexerRangeTest,
CommonIndexerWithErrorsTest, CommonIndexerNoTool,
- BASE_TEST_CONFIG
+ BASE_TEST_CONFIG, fill_storage, fill_obj_storage
)
@@ -121,15 +120,6 @@
'write_batch_size': 100,
}
- def prepare(self):
- super().prepare()
- self.idx_storage = BasicMockIndexerStorage()
- # this hardcodes some contents, will use this to setup the storage
- self.objstorage = MockObjStorage()
- # sync objstorage and storage
- contents = [{'sha1': c_id} for c_id in self.objstorage]
- self.storage = BasicMockStorage(contents)
-
class TestMimetypeRangeIndexer(
CommonContentIndexerRangeTest, unittest.TestCase):
@@ -142,12 +132,10 @@
"""
def setUp(self):
+ super().setUp()
self.indexer = MimetypeRangeIndexerTest()
- # will play along with the objstorage's mocked contents for now
- self.contents = sorted(self.indexer.objstorage)
- # FIXME: leverage swh.objstorage.in_memory_storage's
- # InMemoryObjStorage, swh.storage.tests's gen_contents, and
- # hypothesis to generate data to actually run indexer on those
+ fill_storage(self.indexer.storage)
+ fill_obj_storage(self.indexer.objstorage)
self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py
--- a/swh/indexer/tests/test_utils.py
+++ b/swh/indexer/tests/test_utils.py
@@ -4,10 +4,12 @@
# See top-level LICENSE file for more information
import datetime
+import hashlib
+import random
from swh.objstorage.exc import ObjNotFoundError
from swh.model import hashutil
-from swh.model.hashutil import hash_to_bytes
+from swh.model.hashutil import hash_to_bytes, hash_to_hex
from swh.indexer.storage import INDEXER_CFG_KEY
@@ -488,6 +490,21 @@
'id': DIRECTORY_ID,
'entries': DIRECTORY,
}])
+ for (obj_id, content) in OBJ_STORAGE_DATA.items():
+ if hasattr(hashlib, 'blake2s'):
+ blake2s256 = hashlib.blake2s(content, digest_size=32).digest()
+ else:
+ # fallback for Python <3.6
+ blake2s256 = bytes([random.randint(0, 255) for _ in range(32)])
+ storage.content_add([{
+ 'data': content,
+ 'length': len(content),
+ 'status': 'visible',
+ 'sha1': hash_to_bytes(obj_id),
+ 'sha1_git': hash_to_bytes(obj_id),
+ 'sha256': hashlib.sha256(content).digest(),
+ 'blake2s256': blake2s256
+ }])
class MockStorage():
@@ -664,6 +681,8 @@
return self.indexer.idx_storage.state
def assert_results_ok(self, sha1s, expected_results=None):
+ sha1s = [sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1)
+ for sha1 in sha1s]
actual_results = self.get_indexer_results(sha1s)
if expected_results is None:
@@ -712,15 +731,22 @@
"""Allows to factorize tests on range indexer.
"""
+ def setUp(self):
+ self.contents = sorted(OBJ_STORAGE_DATA)
+
def assert_results_ok(self, start, end, actual_results,
expected_results=None):
if expected_results is None:
expected_results = self.expected_results
+ actual_results = list(actual_results)
for indexed_data in actual_results:
_id = indexed_data['id']
- self.assertEqual(indexed_data, expected_results[_id])
- self.assertTrue(start <= _id and _id <= end)
+ assert isinstance(_id, bytes)
+ indexed_data = indexed_data.copy()
+ indexed_data['id'] = hash_to_hex(indexed_data['id'])
+ self.assertEqual(indexed_data, expected_results[hash_to_hex(_id)])
+ self.assertTrue(start <= _id <= end)
_tool_id = indexed_data['indexer_configuration_id']
self.assertEqual(_tool_id, self.indexer.tool['id'])
@@ -728,7 +754,8 @@
"""Indexing contents without existing data results in indexed data
"""
- start, end = [self.contents[0], self.contents[2]] # output hex ids
+ _start, _end = [self.contents[0], self.contents[2]] # output hex ids
+ start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = list(self.indexer._index_contents(
start, end, indexed={}))
@@ -739,12 +766,13 @@
"""Indexing contents with existing data results in less indexed data
"""
- start, end = [self.contents[0], self.contents[2]] # output hex ids
+ _start, _end = [self.contents[0], self.contents[2]] # output hex ids
+ start, end = map(hashutil.hash_to_bytes, (_start, _end))
data_indexed = [self.id0, self.id2]
# given
actual_results = self.indexer._index_contents(
- start, end, indexed=set(data_indexed))
+ start, end, indexed=set(map(hash_to_bytes, data_indexed)))
# craft the expected results
expected_results = self.expected_results.copy()
@@ -758,7 +786,8 @@
"""Optimal indexing should result in indexed data
"""
- start, end = [self.contents[0], self.contents[2]] # output hex ids
+ _start, _end = [self.contents[0], self.contents[2]] # output hex ids
+ start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run(start, end)
@@ -785,8 +814,9 @@
def test_generate_content_get_no_result(self):
"""No result indexed returns False"""
- start, end = ['0000000000000000000000000000000000000000',
- '0000000000000000000000000000000000000001']
+ _start, _end = ['0000000000000000000000000000000000000000',
+ '0000000000000000000000000000000000000001']
+ start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run(
start, end, incremental=False)
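For context on the cursor-based range API that the in_memory.py hunk above introduces: the sketch below is not part of the diff; the helper name and the hex-in/hex-out convention are illustrative assumptions. It shows how a caller might page through content_mimetype_get_range, passing bytes identifiers as the new type checks require and following the returned 'next' cursor.

# Illustrative only -- not part of D788. Assumes an indexer storage object
# ("idx_storage") exposing content_mimetype_get_range as documented above.
from swh.model.hashutil import hash_to_bytes, hash_to_hex


def iter_mimetype_ids(idx_storage, start_hex, end_hex, tool_id, page_size=1000):
    """Yield hex content ids whose mimetype rows lie in [start_hex, end_hex]."""
    # Range bounds are bytes sha1s, matching the type checks added in this diff.
    start = hash_to_bytes(start_hex)
    end = hash_to_bytes(end_hex)
    while start:
        result = idx_storage.content_mimetype_get_range(
            start, end, indexer_configuration_id=tool_id, limit=page_size)
        for _id in result['ids']:
            yield hash_to_hex(_id)
        # 'next' is the sha1 the following page starts at, or None when done.
        start = result['next']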
Attached to D788: Remove mocks from range tests of mimetype and license indexers.