D788.id2496.diff
diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py
--- a/swh/indexer/fossology_license.py
+++ b/swh/indexer/fossology_license.py
@@ -104,6 +104,7 @@
- indexer_configuration_id (int): tool used to compute the output
"""
+ assert isinstance(id, bytes)
content_path = self.write_to_temp(
filename=hashutil.hash_to_hex(id), # use the id as pathname
data=data)
diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -405,11 +405,14 @@
bytes: Identifier of contents to index.
"""
+ if not isinstance(start, bytes) or not isinstance(end, bytes):
+ raise TypeError('identifiers must be bytes, not %r and %r.' %
+ (start, end))
while start:
result = self.storage.content_get_range(start, end)
contents = result['contents']
for c in contents:
- _id = c['sha1']
+ _id = hashutil.hash_to_bytes(c['sha1'])
if _id in indexed:
continue
yield _id
@@ -435,6 +438,10 @@
hashutil.hash_to_hex(sha1))
continue
res = self.index(sha1, raw_content, **kwargs)
+ if not isinstance(res['id'], bytes):
+ raise TypeError(
+ '%r.index should return ids as bytes, not %r' %
+ (self.__class__.__name__, res['id']))
if res:
yield res
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -191,6 +191,31 @@
"""
yield from self._mimetypes.missing(mimetypes)
+ def content_mimetype_get_range(
+ self, start, end, indexer_configuration_id, limit=1000):
+ """Retrieve mimetypes within range [start, end] bound by limit.
+
+ Args:
+ **start** (bytes): Starting identifier range (expected smaller
+ than end)
+ **end** (bytes): Ending identifier range (expected larger
+ than start)
+ **indexer_configuration_id** (int): The tool used to index data
+ **limit** (int): Limit result (default to 1000)
+
+ Raises:
+ ValueError for limit to None
+
+ Returns:
+ a dict with keys:
+ - **ids** [bytes]: iterable of content ids within the range.
+ - **next** (Optional[bytes]): The next range of sha1 starts at
+ this sha1 if any
+
+ """
+ return self._mimetypes.get_range(
+ start, end, indexer_configuration_id, limit)
+
def content_mimetype_add(self, mimetypes, conflict_update=False):
"""Add mimetypes not present in storage.
@@ -207,6 +232,8 @@
default)
"""
+ if not all(isinstance(x['id'], bytes) for x in mimetypes):
+ raise TypeError('identifiers must be bytes.')
self._mimetypes.add(mimetypes, conflict_update)
def content_mimetype_get(self, ids, db=None, cur=None):
@@ -281,6 +308,8 @@
results
"""
+ if not all(isinstance(x['id'], bytes) for x in ctags):
+ raise TypeError('identifiers must be bytes.')
self._content_ctags.add_merge(ctags, conflict_update, 'ctags')
def content_ctags_search(self, expression,
@@ -351,6 +380,8 @@
list: content_license entries which failed due to unknown licenses
"""
+ if not all(isinstance(x['id'], bytes) for x in licenses):
+ raise TypeError('identifiers must be bytes.')
self._licenses.add_merge(licenses, conflict_update, 'licenses')
def content_fossology_license_get_range(
@@ -425,6 +456,8 @@
or skip duplicates (false, the default)
"""
+ if not all(isinstance(x['id'], bytes) for x in metadata):
+ raise TypeError('identifiers must be bytes.')
self._content_metadata.add(metadata, conflict_update)
def revision_metadata_missing(self, metadata):
@@ -473,6 +506,8 @@
or skip duplicates (false, the default)
"""
+ if not all(isinstance(x['id'], bytes) for x in metadata):
+ raise TypeError('identifiers must be bytes.')
self._revision_metadata.add(metadata, conflict_update)
def indexer_configuration_add(self, tools):
diff --git a/swh/indexer/tests/storage/test_in_memory.py b/swh/indexer/tests/storage/test_in_memory.py
--- a/swh/indexer/tests/storage/test_in_memory.py
+++ b/swh/indexer/tests/storage/test_in_memory.py
@@ -63,15 +63,3 @@
@pytest.mark.xfail
def test_indexer_configuration_metadata_get(self):
pass
-
- @pytest.mark.xfail
- def test_generate_content_mimetype_get_range_limit_none(self):
- pass
-
- @pytest.mark.xfail
- def test_generate_content_mimetype_get_range_no_limit(self, mimetypes):
- pass
-
- @pytest.mark.xfail
- def test_generate_content_mimetype_get_range_limit(self, mimetypes):
- pass
diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py
--- a/swh/indexer/tests/test_fossology_license.py
+++ b/swh/indexer/tests/test_fossology_license.py
@@ -4,7 +4,6 @@
# See top-level LICENSE file for more information
import unittest
-import logging
from unittest.mock import patch
@@ -14,10 +13,9 @@
)
from swh.indexer.tests.test_utils import (
- MockObjStorage, BasicMockStorage, BasicMockIndexerStorage,
SHA1_TO_LICENSES, CommonContentIndexerTest, CommonContentIndexerRangeTest,
CommonIndexerWithErrorsTest, CommonIndexerNoTool, NoDiskIndexer,
- BASE_TEST_CONFIG
+ BASE_TEST_CONFIG, fill_storage, fill_obj_storage
)
@@ -91,6 +89,7 @@
yield from self.idx_storage.content_ctags_get(ids)
def setUp(self):
+ super().setUp()
self.indexer = FossologyLicenseTestIndexer()
self.idx_storage = self.indexer.idx_storage
@@ -137,15 +136,6 @@
'write_batch_size': 100,
}
- def prepare(self):
- super().prepare()
- self.idx_storage = BasicMockIndexerStorage()
- self.log = logging.getLogger('swh.indexer')
- # this hardcodes some contents, will use this to setup the storage
- self.objstorage = MockObjStorage()
- contents = [{'sha1': c_id} for c_id in self.objstorage]
- self.storage = BasicMockStorage(contents)
-
class TestFossologyLicenseRangeIndexer(
CommonContentIndexerRangeTest, unittest.TestCase):
@@ -158,12 +148,10 @@
"""
def setUp(self):
+ super().setUp()
self.indexer = FossologyLicenseRangeIndexerTest()
- # will play along with the objstorage's mocked contents for now
- self.contents = sorted(self.indexer.objstorage)
- # FIXME: leverage swh.objstorage.in_memory_storage's
- # InMemoryObjStorage, swh.storage.tests's gen_contents, and
- # hypothesis to generate data to actually run indexer on those
+ fill_storage(self.indexer.storage)
+ fill_obj_storage(self.indexer.objstorage)
self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py
--- a/swh/indexer/tests/test_mimetype.py
+++ b/swh/indexer/tests/test_mimetype.py
@@ -12,10 +12,9 @@
)
from swh.indexer.tests.test_utils import (
- MockObjStorage, BasicMockStorage, BasicMockIndexerStorage,
CommonContentIndexerTest, CommonContentIndexerRangeTest,
CommonIndexerWithErrorsTest, CommonIndexerNoTool,
- BASE_TEST_CONFIG
+ BASE_TEST_CONFIG, fill_storage, fill_obj_storage
)
@@ -121,15 +120,6 @@
'write_batch_size': 100,
}
- def prepare(self):
- super().prepare()
- self.idx_storage = BasicMockIndexerStorage()
- # this hardcodes some contents, will use this to setup the storage
- self.objstorage = MockObjStorage()
- # sync objstorage and storage
- contents = [{'sha1': c_id} for c_id in self.objstorage]
- self.storage = BasicMockStorage(contents)
-
class TestMimetypeRangeIndexer(
CommonContentIndexerRangeTest, unittest.TestCase):
@@ -142,12 +132,10 @@
"""
def setUp(self):
+ super().setUp()
self.indexer = MimetypeRangeIndexerTest()
- # will play along with the objstorage's mocked contents for now
- self.contents = sorted(self.indexer.objstorage)
- # FIXME: leverage swh.objstorage.in_memory_storage's
- # InMemoryObjStorage, swh.storage.tests's gen_contents, and
- # hypothesis to generate data to actually run indexer on those
+ fill_storage(self.indexer.storage)
+ fill_obj_storage(self.indexer.objstorage)
self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py
--- a/swh/indexer/tests/test_utils.py
+++ b/swh/indexer/tests/test_utils.py
@@ -4,10 +4,12 @@
# See top-level LICENSE file for more information
import datetime
+import hashlib
+import random
from swh.objstorage.exc import ObjNotFoundError
from swh.model import hashutil
-from swh.model.hashutil import hash_to_bytes
+from swh.model.hashutil import hash_to_bytes, hash_to_hex
from swh.indexer.storage import INDEXER_CFG_KEY
@@ -488,6 +490,21 @@
'id': DIRECTORY_ID,
'entries': DIRECTORY,
}])
+ for (obj_id, content) in OBJ_STORAGE_DATA.items():
+ if hasattr(hashlib, 'blake2s'):
+ blake2s256 = hashlib.blake2s(content, digest_size=32).digest()
+ else:
+ # fallback for Python <3.6
+ blake2s256 = bytes([random.randint(0, 255) for _ in range(32)])
+ storage.content_add([{
+ 'data': content,
+ 'length': len(content),
+ 'status': 'visible',
+ 'sha1': hash_to_bytes(obj_id),
+ 'sha1_git': hash_to_bytes(obj_id),
+ 'sha256': hashlib.sha256(content).digest(),
+ 'blake2s256': blake2s256
+ }])
class MockStorage():
@@ -664,6 +681,8 @@
return self.indexer.idx_storage.state
def assert_results_ok(self, sha1s, expected_results=None):
+ sha1s = [sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1)
+ for sha1 in sha1s]
actual_results = self.get_indexer_results(sha1s)
if expected_results is None:
@@ -712,15 +731,22 @@
"""Allows to factorize tests on range indexer.
"""
+ def setUp(self):
+ self.contents = sorted(OBJ_STORAGE_DATA)
+
def assert_results_ok(self, start, end, actual_results,
expected_results=None):
if expected_results is None:
expected_results = self.expected_results
+ actual_results = list(actual_results)
for indexed_data in actual_results:
_id = indexed_data['id']
- self.assertEqual(indexed_data, expected_results[_id])
- self.assertTrue(start <= _id and _id <= end)
+ assert isinstance(_id, bytes)
+ indexed_data = indexed_data.copy()
+ indexed_data['id'] = hash_to_hex(indexed_data['id'])
+ self.assertEqual(indexed_data, expected_results[hash_to_hex(_id)])
+ self.assertTrue(start <= _id <= end)
_tool_id = indexed_data['indexer_configuration_id']
self.assertEqual(_tool_id, self.indexer.tool['id'])
@@ -728,7 +754,8 @@
"""Indexing contents without existing data results in indexed data
"""
- start, end = [self.contents[0], self.contents[2]] # output hex ids
+ _start, _end = [self.contents[0], self.contents[2]] # output hex ids
+ start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = list(self.indexer._index_contents(
start, end, indexed={}))
@@ -739,12 +766,13 @@
"""Indexing contents with existing data results in less indexed data
"""
- start, end = [self.contents[0], self.contents[2]] # output hex ids
+ _start, _end = [self.contents[0], self.contents[2]] # output hex ids
+ start, end = map(hashutil.hash_to_bytes, (_start, _end))
data_indexed = [self.id0, self.id2]
# given
actual_results = self.indexer._index_contents(
- start, end, indexed=set(data_indexed))
+ start, end, indexed=set(map(hash_to_bytes, data_indexed)))
# craft the expected results
expected_results = self.expected_results.copy()
@@ -758,7 +786,8 @@
"""Optimal indexing should result in indexed data
"""
- start, end = [self.contents[0], self.contents[2]] # output hex ids
+ _start, _end = [self.contents[0], self.contents[2]] # output hex ids
+ start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run(start, end)
@@ -785,8 +814,9 @@
def test_generate_content_get_no_result(self):
"""No result indexed returns False"""
- start, end = ['0000000000000000000000000000000000000000',
- '0000000000000000000000000000000000000001']
+ _start, _end = ['0000000000000000000000000000000000000000',
+ '0000000000000000000000000000000000000001']
+ start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run(
start, end, incremental=False)
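For context on the cursor-based range API that the in_memory.py hunk above introduces: the sketch below is not part of the diff; the helper name and the hex-in/hex-out convention are illustrative assumptions. It shows how a caller might page through content_mimetype_get_range, passing bytes identifiers as the new type checks require and following the returned 'next' cursor.

# Illustrative only -- not part of D788. Assumes an indexer storage object
# ("idx_storage") exposing content_mimetype_get_range as documented above.
from swh.model.hashutil import hash_to_bytes, hash_to_hex


def iter_mimetype_ids(idx_storage, start_hex, end_hex, tool_id, page_size=1000):
    """Yield hex content ids whose mimetype rows lie in [start_hex, end_hex]."""
    # Range bounds are bytes sha1s, matching the type checks added in this diff.
    start = hash_to_bytes(start_hex)
    end = hash_to_bytes(end_hex)
    while start:
        result = idx_storage.content_mimetype_get_range(
            start, end, indexer_configuration_id=tool_id, limit=page_size)
        for _id in result['ids']:
            yield hash_to_hex(_id)
        # 'next' is the sha1 the following page starts at, or None when done.
        start = result['next']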
Attached to D788: Remove mocks from range tests of mimetype and license indexers.