D788.id2496.diff

diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py
--- a/swh/indexer/fossology_license.py
+++ b/swh/indexer/fossology_license.py
@@ -104,6 +104,7 @@
- indexer_configuration_id (int): tool used to compute the output
"""
+ assert isinstance(id, bytes)
content_path = self.write_to_temp(
filename=hashutil.hash_to_hex(id), # use the id as pathname
data=data)
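
(Aside, not part of the patch: a minimal sketch of the bytes/hex convention the new assertion enforces, using swh.model.hashutil; the sha1 value is only an illustrative id taken from the tests.)

from swh.model import hashutil

# Indexers receive content ids as raw 20-byte sha1 digests; the hex form is
# only derived when a printable name is needed (temp file names, log messages).
sha1_hex = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'  # illustrative id
sha1 = hashutil.hash_to_bytes(sha1_hex)

assert isinstance(sha1, bytes)
assert hashutil.hash_to_hex(sha1) == sha1_hex
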
diff --git a/swh/indexer/indexer.py b/swh/indexer/indexer.py
--- a/swh/indexer/indexer.py
+++ b/swh/indexer/indexer.py
@@ -405,11 +405,14 @@
bytes: Identifier of contents to index.
"""
+ if not isinstance(start, bytes) or not isinstance(end, bytes):
+ raise TypeError('identifiers must be bytes, not %r and %r.' %
+ (start, end))
while start:
result = self.storage.content_get_range(start, end)
contents = result['contents']
for c in contents:
- _id = c['sha1']
+ _id = hashutil.hash_to_bytes(c['sha1'])
if _id in indexed:
continue
yield _id
@@ -435,6 +438,10 @@
hashutil.hash_to_hex(sha1))
continue
res = self.index(sha1, raw_content, **kwargs)
+ if res and not isinstance(res['id'], bytes):
+ raise TypeError(
+ '%r.index should return ids as bytes, not %r' %
+ (self.__class__.__name__, res['id']))
if res:
yield res
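
(Aside, not part of the patch: a self-contained sketch of the validation pattern these hunks add; the helper names below are hypothetical, not swh.indexer API.)

from swh.model import hashutil

def iter_unindexed_ids(rows, indexed):
    """Yield content ids as bytes, skipping ids already in `indexed`.

    `rows` stands for the 'contents' entries returned by content_get_range;
    each row carries a 'sha1' that may come back as bytes or as a hex string.
    """
    for row in rows:
        _id = hashutil.hash_to_bytes(row['sha1'])  # normalize to bytes
        if _id not in indexed:
            yield _id

def check_index_result(indexer_name, res):
    """Same type check as the one added to _index_contents."""
    if res and not isinstance(res['id'], bytes):
        raise TypeError('%r.index should return ids as bytes, not %r'
                        % (indexer_name, res['id']))
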
diff --git a/swh/indexer/storage/in_memory.py b/swh/indexer/storage/in_memory.py
--- a/swh/indexer/storage/in_memory.py
+++ b/swh/indexer/storage/in_memory.py
@@ -191,6 +191,31 @@
"""
yield from self._mimetypes.missing(mimetypes)
+ def content_mimetype_get_range(
+ self, start, end, indexer_configuration_id, limit=1000):
+ """Retrieve mimetypes within range [start, end] bound by limit.
+
+ Args:
+ **start** (bytes): Starting identifier range (expected smaller
+ than end)
+ **end** (bytes): Ending identifier range (expected larger
+ than start)
+ **indexer_configuration_id** (int): The tool used to index data
+ **limit** (int): Limit result (defaults to 1000)
+
+ Raises:
+ ValueError: if limit is None
+
+ Returns:
+ a dict with keys:
+ - **ids** [bytes]: iterable of content ids within the range.
+ - **next** (Optional[bytes]): The next range of sha1s starts at
+ this sha1, if any
+
+ """
+ return self._mimetypes.get_range(
+ start, end, indexer_configuration_id, limit)
+
def content_mimetype_add(self, mimetypes, conflict_update=False):
"""Add mimetypes not present in storage.
@@ -207,6 +232,8 @@
default)
"""
+ if not all(isinstance(x['id'], bytes) for x in mimetypes):
+ raise TypeError('identifiers must be bytes.')
self._mimetypes.add(mimetypes, conflict_update)
def content_mimetype_get(self, ids, db=None, cur=None):
@@ -281,6 +308,8 @@
results
"""
+ if not all(isinstance(x['id'], bytes) for x in ctags):
+ raise TypeError('identifiers must be bytes.')
self._content_ctags.add_merge(ctags, conflict_update, 'ctags')
def content_ctags_search(self, expression,
@@ -351,6 +380,8 @@
list: content_license entries which failed due to unknown licenses
"""
+ if not all(isinstance(x['id'], bytes) for x in licenses):
+ raise TypeError('identifiers must be bytes.')
self._licenses.add_merge(licenses, conflict_update, 'licenses')
def content_fossology_license_get_range(
@@ -425,6 +456,8 @@
or skip duplicates (false, the default)
"""
+ if not all(isinstance(x['id'], bytes) for x in metadata):
+ raise TypeError('identifiers must be bytes.')
self._content_metadata.add(metadata, conflict_update)
def revision_metadata_missing(self, metadata):
@@ -473,6 +506,8 @@
or skip duplicates (false, the default)
"""
+ if not all(isinstance(x['id'], bytes) for x in metadata):
+ raise TypeError('identifiers must be bytes.')
self._revision_metadata.add(metadata, conflict_update)
def indexer_configuration_add(self, tools):
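
(Aside, not part of the patch: the bytes guard above is repeated verbatim for mimetypes, ctags, licenses and both metadata endpoints, and content_mimetype_get_range documents a paginated return shape. Below is a standalone sketch of both, independent of the actual in-memory backend.)

def check_ids_are_bytes(rows):
    # Precondition now shared by all the in-memory add() endpoints.
    if not all(isinstance(row['id'], bytes) for row in rows):
        raise TypeError('identifiers must be bytes.')

def get_range(sorted_ids, start, end, limit=1000):
    """Toy version of the documented get_range contract: returns a dict with
    'ids' (content ids within [start, end]) and 'next' (the sha1 the next
    page would start at, or None)."""
    if limit is None:
        raise ValueError('limit should not be None')
    in_range = [i for i in sorted_ids if start <= i <= end]
    return {'ids': in_range[:limit],
            'next': in_range[limit] if len(in_range) > limit else None}
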
diff --git a/swh/indexer/tests/storage/test_in_memory.py b/swh/indexer/tests/storage/test_in_memory.py
--- a/swh/indexer/tests/storage/test_in_memory.py
+++ b/swh/indexer/tests/storage/test_in_memory.py
@@ -63,15 +63,3 @@
@pytest.mark.xfail
def test_indexer_configuration_metadata_get(self):
pass
-
- @pytest.mark.xfail
- def test_generate_content_mimetype_get_range_limit_none(self):
- pass
-
- @pytest.mark.xfail
- def test_generate_content_mimetype_get_range_no_limit(self, mimetypes):
- pass
-
- @pytest.mark.xfail
- def test_generate_content_mimetype_get_range_limit(self, mimetypes):
- pass
diff --git a/swh/indexer/tests/test_fossology_license.py b/swh/indexer/tests/test_fossology_license.py
--- a/swh/indexer/tests/test_fossology_license.py
+++ b/swh/indexer/tests/test_fossology_license.py
@@ -4,7 +4,6 @@
# See top-level LICENSE file for more information
import unittest
-import logging
from unittest.mock import patch
@@ -14,10 +13,9 @@
)
from swh.indexer.tests.test_utils import (
- MockObjStorage, BasicMockStorage, BasicMockIndexerStorage,
SHA1_TO_LICENSES, CommonContentIndexerTest, CommonContentIndexerRangeTest,
CommonIndexerWithErrorsTest, CommonIndexerNoTool, NoDiskIndexer,
- BASE_TEST_CONFIG
+ BASE_TEST_CONFIG, fill_storage, fill_obj_storage
)
@@ -91,6 +89,7 @@
yield from self.idx_storage.content_ctags_get(ids)
def setUp(self):
+ super().setUp()
self.indexer = FossologyLicenseTestIndexer()
self.idx_storage = self.indexer.idx_storage
@@ -137,15 +136,6 @@
'write_batch_size': 100,
}
- def prepare(self):
- super().prepare()
- self.idx_storage = BasicMockIndexerStorage()
- self.log = logging.getLogger('swh.indexer')
- # this hardcodes some contents, will use this to setup the storage
- self.objstorage = MockObjStorage()
- contents = [{'sha1': c_id} for c_id in self.objstorage]
- self.storage = BasicMockStorage(contents)
-
class TestFossologyLicenseRangeIndexer(
CommonContentIndexerRangeTest, unittest.TestCase):
@@ -158,12 +148,10 @@
"""
def setUp(self):
+ super().setUp()
self.indexer = FossologyLicenseRangeIndexerTest()
- # will play along with the objstorage's mocked contents for now
- self.contents = sorted(self.indexer.objstorage)
- # FIXME: leverage swh.objstorage.in_memory_storage's
- # InMemoryObjStorage, swh.storage.tests's gen_contents, and
- # hypothesis to generate data to actually run indexer on those
+ fill_storage(self.indexer.storage)
+ fill_obj_storage(self.indexer.objstorage)
self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
diff --git a/swh/indexer/tests/test_mimetype.py b/swh/indexer/tests/test_mimetype.py
--- a/swh/indexer/tests/test_mimetype.py
+++ b/swh/indexer/tests/test_mimetype.py
@@ -12,10 +12,9 @@
)
from swh.indexer.tests.test_utils import (
- MockObjStorage, BasicMockStorage, BasicMockIndexerStorage,
CommonContentIndexerTest, CommonContentIndexerRangeTest,
CommonIndexerWithErrorsTest, CommonIndexerNoTool,
- BASE_TEST_CONFIG
+ BASE_TEST_CONFIG, fill_storage, fill_obj_storage
)
@@ -121,15 +120,6 @@
'write_batch_size': 100,
}
- def prepare(self):
- super().prepare()
- self.idx_storage = BasicMockIndexerStorage()
- # this hardcodes some contents, will use this to setup the storage
- self.objstorage = MockObjStorage()
- # sync objstorage and storage
- contents = [{'sha1': c_id} for c_id in self.objstorage]
- self.storage = BasicMockStorage(contents)
-
class TestMimetypeRangeIndexer(
CommonContentIndexerRangeTest, unittest.TestCase):
@@ -142,12 +132,10 @@
"""
def setUp(self):
+ super().setUp()
self.indexer = MimetypeRangeIndexerTest()
- # will play along with the objstorage's mocked contents for now
- self.contents = sorted(self.indexer.objstorage)
- # FIXME: leverage swh.objstorage.in_memory_storage's
- # InMemoryObjStorage, swh.storage.tests's gen_contents, and
- # hypothesis to generate data to actually run indexer on those
+ fill_storage(self.indexer.storage)
+ fill_obj_storage(self.indexer.objstorage)
self.id0 = '01c9379dfc33803963d07c1ccc748d3fe4c96bb5'
self.id1 = '02fb2c89e14f7fab46701478c83779c7beb7b069'
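
(Aside, not part of the patch: the shared range tests updated in test_utils.py below keep hex ids on the test class and convert them to bytes bounds before calling the indexer. A standalone sketch of that conversion; the third id is illustrative.)

from swh.model import hashutil

contents = sorted([
    '01c9379dfc33803963d07c1ccc748d3fe4c96bb5',
    '02fb2c89e14f7fab46701478c83779c7beb7b069',
    '103bc087db1d26afc3a0283f38663d081e9b01e6',  # illustrative third id
])

_start, _end = contents[0], contents[2]                    # hex ids
start, end = map(hashutil.hash_to_bytes, (_start, _end))   # bytes bounds for run()
assert isinstance(start, bytes) and start <= end
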
diff --git a/swh/indexer/tests/test_utils.py b/swh/indexer/tests/test_utils.py
--- a/swh/indexer/tests/test_utils.py
+++ b/swh/indexer/tests/test_utils.py
@@ -4,10 +4,12 @@
# See top-level LICENSE file for more information
import datetime
+import hashlib
+import random
from swh.objstorage.exc import ObjNotFoundError
from swh.model import hashutil
-from swh.model.hashutil import hash_to_bytes
+from swh.model.hashutil import hash_to_bytes, hash_to_hex
from swh.indexer.storage import INDEXER_CFG_KEY
@@ -488,6 +490,21 @@
'id': DIRECTORY_ID,
'entries': DIRECTORY,
}])
+ for (obj_id, content) in OBJ_STORAGE_DATA.items():
+ if hasattr(hashlib, 'blake2s'):
+ blake2s256 = hashlib.blake2s(content, digest_size=32).digest()
+ else:
+ # fallback for Python <3.6
+ blake2s256 = bytes([random.randint(0, 255) for _ in range(32)])
+ storage.content_add([{
+ 'data': content,
+ 'length': len(content),
+ 'status': 'visible',
+ 'sha1': hash_to_bytes(obj_id),
+ 'sha1_git': hash_to_bytes(obj_id),
+ 'sha256': hashlib.sha256(content).digest(),
+ 'blake2s256': blake2s256
+ }])
class MockStorage():
@@ -664,6 +681,8 @@
return self.indexer.idx_storage.state
def assert_results_ok(self, sha1s, expected_results=None):
+ sha1s = [sha1 if isinstance(sha1, bytes) else hash_to_bytes(sha1)
+ for sha1 in sha1s]
actual_results = self.get_indexer_results(sha1s)
if expected_results is None:
@@ -712,15 +731,22 @@
"""Allows to factorize tests on range indexer.
"""
+ def setUp(self):
+ self.contents = sorted(OBJ_STORAGE_DATA)
+
def assert_results_ok(self, start, end, actual_results,
expected_results=None):
if expected_results is None:
expected_results = self.expected_results
+ actual_results = list(actual_results)
for indexed_data in actual_results:
_id = indexed_data['id']
- self.assertEqual(indexed_data, expected_results[_id])
- self.assertTrue(start <= _id and _id <= end)
+ assert isinstance(_id, bytes)
+ indexed_data = indexed_data.copy()
+ indexed_data['id'] = hash_to_hex(indexed_data['id'])
+ self.assertEqual(indexed_data, expected_results[hash_to_hex(_id)])
+ self.assertTrue(start <= _id <= end)
_tool_id = indexed_data['indexer_configuration_id']
self.assertEqual(_tool_id, self.indexer.tool['id'])
@@ -728,7 +754,8 @@
"""Indexing contents without existing data results in indexed data
"""
- start, end = [self.contents[0], self.contents[2]] # output hex ids
+ _start, _end = [self.contents[0], self.contents[2]] # output hex ids
+ start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = list(self.indexer._index_contents(
start, end, indexed={}))
@@ -739,12 +766,13 @@
"""Indexing contents with existing data results in less indexed data
"""
- start, end = [self.contents[0], self.contents[2]] # output hex ids
+ _start, _end = [self.contents[0], self.contents[2]] # output hex ids
+ start, end = map(hashutil.hash_to_bytes, (_start, _end))
data_indexed = [self.id0, self.id2]
# given
actual_results = self.indexer._index_contents(
- start, end, indexed=set(data_indexed))
+ start, end, indexed=set(map(hash_to_bytes, data_indexed)))
# craft the expected results
expected_results = self.expected_results.copy()
@@ -758,7 +786,8 @@
"""Optimal indexing should result in indexed data
"""
- start, end = [self.contents[0], self.contents[2]] # output hex ids
+ _start, _end = [self.contents[0], self.contents[2]] # output hex ids
+ start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run(start, end)
@@ -785,8 +814,9 @@
def test_generate_content_get_no_result(self):
"""No result indexed returns False"""
- start, end = ['0000000000000000000000000000000000000000',
- '0000000000000000000000000000000000000001']
+ _start, _end = ['0000000000000000000000000000000000000000',
+ '0000000000000000000000000000000000000001']
+ start, end = map(hashutil.hash_to_bytes, (_start, _end))
# given
actual_results = self.indexer.run(
start, end, incremental=False)
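
(Aside, not part of the patch: fill_storage in test_utils.py now computes real checksums for each test content before calling storage.content_add. A standalone sketch of that hashing step.)

import hashlib
import random

def content_entry(obj_id_hex, content):
    """Build the checksum dict fill_storage feeds to storage.content_add."""
    if hasattr(hashlib, 'blake2s'):
        blake2s256 = hashlib.blake2s(content, digest_size=32).digest()
    else:
        # Python < 3.6 lacks blake2s; the tests only need *some* 32-byte value.
        blake2s256 = bytes([random.randint(0, 255) for _ in range(32)])
    return {
        'data': content,
        'length': len(content),
        'status': 'visible',
        'sha1': bytes.fromhex(obj_id_hex),
        'sha1_git': bytes.fromhex(obj_id_hex),  # the tests reuse the object id here
        'sha256': hashlib.sha256(content).digest(),
        'blake2s256': blake2s256,
    }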
