diff --git a/swh/objstorage/backends/azure.py b/swh/objstorage/backends/azure.py --- a/swh/objstorage/backends/azure.py +++ b/swh/objstorage/backends/azure.py @@ -205,12 +205,8 @@ """ return sum(1 for i in self) - def add(self, content, obj_id=None, check_presence=True): + def add(self, content, obj_id, check_presence=True): """Add an obj in storage if it's not there already.""" - if obj_id is None: - # Checksum is missing, compute it on the fly. - obj_id = compute_hash(content) - if check_presence and obj_id in self: return obj_id diff --git a/swh/objstorage/backends/generator.py b/swh/objstorage/backends/generator.py --- a/swh/objstorage/backends/generator.py +++ b/swh/objstorage/backends/generator.py @@ -204,7 +204,7 @@ def get(self, obj_id, *args, **kwargs): return next(self.content_generator) - def add(self, content, obj_id=None, check_presence=True, *args, **kwargs): + def add(self, content, obj_id, check_presence=True, *args, **kwargs): pass def check(self, obj_id, *args, **kwargs): diff --git a/swh/objstorage/backends/http.py b/swh/objstorage/backends/http.py --- a/swh/objstorage/backends/http.py +++ b/swh/objstorage/backends/http.py @@ -53,7 +53,7 @@ def __len__(self): raise exc.NonIterableObjStorage("__len__") - def add(self, content, obj_id=None, check_presence=True): + def add(self, content, obj_id, check_presence=True): raise exc.ReadOnlyObjStorage("add") def delete(self, obj_id): diff --git a/swh/objstorage/backends/in_memory.py b/swh/objstorage/backends/in_memory.py --- a/swh/objstorage/backends/in_memory.py +++ b/swh/objstorage/backends/in_memory.py @@ -30,10 +30,7 @@ def __iter__(self): return iter(sorted(self.state)) - def add(self, content, obj_id=None, check_presence=True): - if obj_id is None: - obj_id = compute_hash(content) - + def add(self, content, obj_id, check_presence=True): if check_presence and obj_id in self: return obj_id diff --git a/swh/objstorage/backends/libcloud.py b/swh/objstorage/backends/libcloud.py --- a/swh/objstorage/backends/libcloud.py +++ b/swh/objstorage/backends/libcloud.py @@ -156,11 +156,7 @@ """ return sum(1 for i in self) - def add(self, content, obj_id=None, check_presence=True): - if obj_id is None: - # Checksum is missing, compute it on the fly. - obj_id = compute_hash(content) - + def add(self, content, obj_id, check_presence=True): if check_presence and obj_id in self: return obj_id diff --git a/swh/objstorage/backends/noop.py b/swh/objstorage/backends/noop.py --- a/swh/objstorage/backends/noop.py +++ b/swh/objstorage/backends/noop.py @@ -22,7 +22,7 @@ def __contains__(self, obj_id, *args, **kwargs): return False - def add(self, content, obj_id=None, check_presence=True, *args, **kwargs): + def add(self, content, obj_id, check_presence=True, *args, **kwargs): pass def get(self, obj_id, *args, **kwargs): diff --git a/swh/objstorage/backends/pathslicing.py b/swh/objstorage/backends/pathslicing.py --- a/swh/objstorage/backends/pathslicing.py +++ b/swh/objstorage/backends/pathslicing.py @@ -20,7 +20,6 @@ ID_HEXDIGEST_LENGTH, ObjStorage, compressors, - compute_hash, decompressors, ) @@ -230,9 +229,7 @@ """ return sum(1 for i in self) - def add(self, content, obj_id=None, check_presence=True): - if obj_id is None: - obj_id = compute_hash(content) + def add(self, content, obj_id, check_presence=True): if check_presence and obj_id in self: # If the object is already present, return immediately. return obj_id diff --git a/swh/objstorage/backends/seaweedfs/objstorage.py b/swh/objstorage/backends/seaweedfs/objstorage.py --- a/swh/objstorage/backends/seaweedfs/objstorage.py +++ b/swh/objstorage/backends/seaweedfs/objstorage.py @@ -72,11 +72,7 @@ """ return sum(1 for i in self) - def add(self, content, obj_id=None, check_presence=True): - if obj_id is None: - # Checksum is missing, compute it on the fly. - obj_id = compute_hash(content) - + def add(self, content, obj_id, check_presence=True): if check_presence and obj_id in self: return obj_id diff --git a/swh/objstorage/backends/winery/objstorage.py b/swh/objstorage/backends/winery/objstorage.py --- a/swh/objstorage/backends/winery/objstorage.py +++ b/swh/objstorage/backends/winery/objstorage.py @@ -6,7 +6,6 @@ import logging from multiprocessing import Process -from swh.model import hashutil from swh.objstorage import exc from swh.objstorage.objstorage import ObjStorage @@ -18,18 +17,6 @@ logger = logging.getLogger(__name__) -def compute_hash(content): - algo = "sha256" - return ( - hashutil.MultiHash.from_data( - content, - hash_names=[algo], - ) - .digest() - .get(algo) - ) - - class WineryObjStorage(ObjStorage): def __init__(self, **kwargs): super().__init__(**kwargs) @@ -50,7 +37,7 @@ def __contains__(self, obj_id): return obj_id in self.winery - def add(self, content, obj_id=None, check_presence=True): + def add(self, content, obj_id, check_presence=True): return self.winery.add(content, obj_id, check_presence) def check(self, obj_id): @@ -153,10 +140,7 @@ self.shard.uninit() super().uninit() - def add(self, content, obj_id=None, check_presence=True): - if obj_id is None: - obj_id = compute_hash(content) - + def add(self, content, obj_id, check_presence=True): if check_presence and obj_id in self: return obj_id diff --git a/swh/objstorage/cli.py b/swh/objstorage/cli.py --- a/swh/objstorage/cli.py +++ b/swh/objstorage/cli.py @@ -94,6 +94,7 @@ def import_directories(ctx, directory): """Import a local directory in an existing objstorage.""" from swh.objstorage.factory import get_objstorage + from swh.objstorage.objstorage import compute_hash objstorage = get_objstorage(**ctx.obj["config"]["objstorage"]) nobj = 0 @@ -104,9 +105,10 @@ for name in files: path = os.path.join(root, name) with open(path, "rb") as f: - objstorage.add(f.read()) - volume += os.stat(path).st_size - nobj += 1 + content = f.read() + objstorage.add(content, obj_id=compute_hash(content)) + volume += os.stat(path).st_size + nobj += 1 click.echo( "Imported %d files for a volume of %s bytes in %d seconds" % (nobj, volume, time.time() - t0) diff --git a/swh/objstorage/interface.py b/swh/objstorage/interface.py --- a/swh/objstorage/interface.py +++ b/swh/objstorage/interface.py @@ -66,15 +66,13 @@ ... @remote_api_endpoint("content/add") - def add(self, content, obj_id=None, check_presence=True): + def add(self, content, obj_id, check_presence=True): """Add a new object to the object storage. Args: content (bytes): object's raw content to add in storage. obj_id (bytes): checksum of [bytes] using [ID_HASH_ALGO] - algorithm. When given, obj_id will be trusted to match - the bytes. If missing, obj_id will be computed on the - fly. + algorithm. It is trusted to match the bytes. check_presence (bool): indicate if the presence of the content should be verified before adding the file. diff --git a/swh/objstorage/multiplexer/filter/filter.py b/swh/objstorage/multiplexer/filter/filter.py --- a/swh/objstorage/multiplexer/filter/filter.py +++ b/swh/objstorage/multiplexer/filter/filter.py @@ -58,7 +58,7 @@ """ return self.storage.__len__() - def add(self, content, obj_id=None, check_presence=True, *args, **kwargs): + def add(self, content, obj_id, check_presence=True, *args, **kwargs): return self.storage.add(content, obj_id, check_presence, *args, **kwargs) def restore(self, content, obj_id=None, *args, **kwargs): diff --git a/swh/objstorage/multiplexer/filter/id_filter.py b/swh/objstorage/multiplexer/filter/id_filter.py --- a/swh/objstorage/multiplexer/filter/id_filter.py +++ b/swh/objstorage/multiplexer/filter/id_filter.py @@ -37,9 +37,7 @@ def __iter__(self): yield from filter(lambda id: self.is_valid(id), iter(self.storage)) - def add(self, content, obj_id=None, check_presence=True, *args, **kwargs): - if obj_id is None: - obj_id = compute_hash(content) + def add(self, content, obj_id, check_presence=True, *args, **kwargs): if self.is_valid(obj_id): return self.storage.add(content, *args, obj_id=obj_id, **kwargs) diff --git a/swh/objstorage/multiplexer/multiplexer_objstorage.py b/swh/objstorage/multiplexer/multiplexer_objstorage.py --- a/swh/objstorage/multiplexer/multiplexer_objstorage.py +++ b/swh/objstorage/multiplexer/multiplexer_objstorage.py @@ -222,7 +222,7 @@ return obj_iterator() - def add(self, content, obj_id=None, check_presence=True): + def add(self, content, obj_id, check_presence=True): """Add a new object to the object storage. If the adding step works in all the storages that accept this content, diff --git a/swh/objstorage/objstorage.py b/swh/objstorage/objstorage.py --- a/swh/objstorage/objstorage.py +++ b/swh/objstorage/objstorage.py @@ -29,7 +29,7 @@ """Default number of results of ``list_content``.""" -def compute_hash(content): +def compute_hash(content, algo=ID_HASH_ALGO): """Compute the content's hash. Args: @@ -43,10 +43,10 @@ return ( hashutil.MultiHash.from_data( content, - hash_names=[ID_HASH_ALGO], + hash_names=[algo], ) .digest() - .get(ID_HASH_ALGO) + .get(algo) ) @@ -99,7 +99,7 @@ pass @abc.abstractmethod - def add(self, content, obj_id=None, check_presence=True): + def add(self, content, obj_id, check_presence=True): pass def add_batch(self, contents, check_presence=True) -> Dict: diff --git a/swh/objstorage/tests/objstorage_testing.py b/swh/objstorage/tests/objstorage_testing.py --- a/swh/objstorage/tests/objstorage_testing.py +++ b/swh/objstorage/tests/objstorage_testing.py @@ -83,12 +83,6 @@ self.assertEqual(obj_id, r) self.assertContentMatch(obj_id, content) - def test_add_get_wo_id(self): - content, obj_id = self.hash_content(b"add_get_wo_id") - r = self.storage.add(content) - self.assertEqual(obj_id, r) - self.assertContentMatch(obj_id, content) - def test_add_get_batch(self): content1, obj_id1 = self.hash_content(b"add_get_batch_1") content2, obj_id2 = self.hash_content(b"add_get_batch_2") diff --git a/swh/objstorage/tests/test_multiplexer_filter.py b/swh/objstorage/tests/test_multiplexer_filter.py --- a/swh/objstorage/tests/test_multiplexer_filter.py +++ b/swh/objstorage/tests/test_multiplexer_filter.py @@ -32,7 +32,6 @@ "slicing": "0:5", } base_storage = get_objstorage(**pstorage) - base_storage.id = compute_hash self.storage = get_objstorage( "filtered", storage_conf=pstorage, filters_conf=[read_only()] ) @@ -41,12 +40,13 @@ self.true_invalid_content = b"Anything that is not correct" self.absent_content = b"non-existent content" # Create a valid content. - self.valid_id = base_storage.add(self.valid_content) + self.valid_id = compute_hash(self.valid_content) + base_storage.add(self.valid_content, obj_id=self.valid_id) # Create an invalid id and add a content with it. - self.invalid_id = base_storage.id(self.true_invalid_content) + self.invalid_id = compute_hash(self.true_invalid_content) base_storage.add(self.invalid_content, obj_id=self.invalid_id) # Compute an id for a non-existing content. - self.absent_id = base_storage.id(self.absent_content) + self.absent_id = compute_hash(self.absent_content) def tearDown(self): super().tearDown() @@ -113,56 +113,54 @@ storage = get_objstorage(**self.sconf) self.base_storage = storage self.storage = self.filter_storage(self.sconf) - # Set the id calculators - storage.id = compute_hash # Present content with valid id self.present_valid_content = self.ensure_valid(b"yroqdtotji") - self.present_valid_id = storage.id(self.present_valid_content) + self.present_valid_id = compute_hash(self.present_valid_content) # Present content with invalid id self.present_invalid_content = self.ensure_invalid(b"glxddlmmzb") - self.present_invalid_id = storage.id(self.present_invalid_content) + self.present_invalid_id = compute_hash(self.present_invalid_content) # Missing content with valid id self.missing_valid_content = self.ensure_valid(b"rmzkdclkez") - self.missing_valid_id = storage.id(self.missing_valid_content) + self.missing_valid_id = compute_hash(self.missing_valid_content) # Missing content with invalid id self.missing_invalid_content = self.ensure_invalid(b"hlejfuginh") - self.missing_invalid_id = storage.id(self.missing_invalid_content) + self.missing_invalid_id = compute_hash(self.missing_invalid_content) # Present corrupted content with valid id self.present_corrupted_valid_content = self.ensure_valid(b"cdsjwnpaij") self.true_present_corrupted_valid_content = self.ensure_valid(b"mgsdpawcrr") - self.present_corrupted_valid_id = storage.id( + self.present_corrupted_valid_id = compute_hash( self.true_present_corrupted_valid_content ) # Present corrupted content with invalid id self.present_corrupted_invalid_content = self.ensure_invalid(b"pspjljnrco") self.true_present_corrupted_invalid_content = self.ensure_invalid(b"rjocbnnbso") - self.present_corrupted_invalid_id = storage.id( + self.present_corrupted_invalid_id = compute_hash( self.true_present_corrupted_invalid_content ) # Missing (potentially) corrupted content with valid id self.missing_corrupted_valid_content = self.ensure_valid(b"zxkokfgtou") self.true_missing_corrupted_valid_content = self.ensure_valid(b"royoncooqa") - self.missing_corrupted_valid_id = storage.id( + self.missing_corrupted_valid_id = compute_hash( self.true_missing_corrupted_valid_content ) # Missing (potentially) corrupted content with invalid id self.missing_corrupted_invalid_content = self.ensure_invalid(b"hxaxnrmnyk") self.true_missing_corrupted_invalid_content = self.ensure_invalid(b"qhbolyuifr") - self.missing_corrupted_invalid_id = storage.id( + self.missing_corrupted_invalid_id = compute_hash( self.true_missing_corrupted_invalid_content ) # Add the content that are supposed to be present - self.storage.add(self.present_valid_content) - self.storage.add(self.present_invalid_content) + self.storage.add(self.present_valid_content, obj_id=self.present_valid_id) + self.storage.add(self.present_invalid_content, obj_id=self.present_invalid_id) self.storage.add( self.present_corrupted_valid_content, obj_id=self.present_corrupted_valid_id ) @@ -183,14 +181,14 @@ def ensure_valid(self, content=None): if content is None: content = get_random_content() - while not self.storage.is_valid(self.base_storage.id(content)): + while not self.storage.is_valid(compute_hash(content)): content = get_random_content() return content def ensure_invalid(self, content=None): if content is None: content = get_random_content() - while self.storage.is_valid(self.base_storage.id(content)): + while self.storage.is_valid(compute_hash(content)): content = get_random_content() return content @@ -274,20 +272,20 @@ # Add valid and invalid contents to the storage and check their # presence with the unfiltered storage. valid_content = self.ensure_valid(b"ulepsrjbgt") - valid_id = self.base_storage.id(valid_content) + valid_id = compute_hash(valid_content) invalid_content = self.ensure_invalid(b"znvghkjked") - invalid_id = self.base_storage.id(invalid_content) - self.storage.add(valid_content) - self.storage.add(invalid_content) + invalid_id = compute_hash(invalid_content) + self.storage.add(valid_content, obj_id=valid_id) + self.storage.add(invalid_content, obj_id=invalid_id) self.assertTrue(valid_id in self.base_storage) self.assertFalse(invalid_id in self.base_storage) def test_restore(self): # Add corrupted content to the storage and the try to restore it valid_content = self.ensure_valid(b"ulepsrjbgt") - valid_id = self.base_storage.id(valid_content) + valid_id = compute_hash(valid_content) corrupted_content = self.ensure_valid(b"ltjkjsloyb") - corrupted_id = self.base_storage.id(corrupted_content) + corrupted_id = compute_hash(corrupted_content) self.storage.add(corrupted_content, obj_id=valid_id) with self.assertRaises(ObjNotFoundError): self.storage.check(corrupted_id) diff --git a/swh/objstorage/tests/test_objstorage_http.py b/swh/objstorage/tests/test_objstorage_http.py --- a/swh/objstorage/tests/test_objstorage_http.py +++ b/swh/objstorage/tests/test_objstorage_http.py @@ -9,6 +9,7 @@ from swh.objstorage import exc from swh.objstorage.factory import get_objstorage +from swh.objstorage.objstorage import compute_hash def build_objstorage(): @@ -24,7 +25,8 @@ sto_back = get_objstorage(cls="memory") objids = [] for i in range(100): - objids.append(sto_back.add(f"some content {i}".encode())) + content = f"some content {i}".encode() + objids.append(sto_back.add(content, obj_id=compute_hash(content))) url = "http://127.0.0.1/content/" sto_front = get_objstorage(cls="http", url=url) @@ -93,8 +95,10 @@ def test_http_objstorage_read_only(): sto_front, sto_back, objids = build_objstorage() + content = b"" + obj_id = compute_hash(content) with pytest.raises(exc.ReadOnlyObjStorage): - sto_front.add(b"") + sto_front.add(content, obj_id=obj_id) with pytest.raises(exc.ReadOnlyObjStorage): sto_front.restore(b"") with pytest.raises(exc.ReadOnlyObjStorage): diff --git a/swh/objstorage/tests/test_objstorage_multiplexer.py b/swh/objstorage/tests/test_objstorage_multiplexer.py --- a/swh/objstorage/tests/test_objstorage_multiplexer.py +++ b/swh/objstorage/tests/test_objstorage_multiplexer.py @@ -55,7 +55,7 @@ def test_get_random_contents(self): content, obj_id = self.hash_content(b"get_random_content") - self.storage.add(content) + self.storage.add(content, obj_id=obj_id) random_contents = list(self.storage.get_random(1)) self.assertEqual(1, len(random_contents)) self.assertIn(obj_id, random_contents) @@ -63,6 +63,6 @@ def test_access_readonly(self): # Add a content to the readonly storage content, obj_id = self.hash_content(b"content in read-only") - self.storage_v1.add(content) + self.storage_v1.add(content, obj_id=obj_id) # Try to retrieve it on the main storage self.assertIn(obj_id, self.storage) diff --git a/swh/objstorage/tests/test_objstorage_striping.py b/swh/objstorage/tests/test_objstorage_striping.py --- a/swh/objstorage/tests/test_objstorage_striping.py +++ b/swh/objstorage/tests/test_objstorage_striping.py @@ -47,9 +47,6 @@ def tearDown(self): shutil.rmtree(self.base_dir) - def test_add_get_wo_id(self): - self.skipTest("can't add without id in the multiplexer storage") - def test_add_striping_behavior(self): exp_storage_counts = [0, 0] storage_counts = [0, 0] diff --git a/swh/objstorage/tests/test_objstorage_winery.py b/swh/objstorage/tests/test_objstorage_winery.py --- a/swh/objstorage/tests/test_objstorage_winery.py +++ b/swh/objstorage/tests/test_objstorage_winery.py @@ -20,6 +20,7 @@ Throttler, ) from swh.objstorage.factory import get_objstorage +from swh.objstorage.objstorage import compute_hash from swh.objstorage.utils import call_async from .winery_benchmark import Bench, work @@ -96,7 +97,7 @@ def test_winery_add_get(winery): shard = winery.base.whoami content = b"SOMETHING" - obj_id = winery.add(content=content) + obj_id = winery.add(content=content, obj_id=compute_hash(content, "sha256")) assert ( obj_id.hex() == "866878b165607851782d8d233edf0c261172ff67926330d3bbd10c705b92d24f" @@ -115,7 +116,7 @@ mocker.patch("swh.objstorage.backends.winery.objstorage.pack", return_value=True) shard = winery.base.whoami content = b"SOMETHING" - obj_id = winery.add(content=content) + obj_id = winery.add(content=content, obj_id=compute_hash(content, "sha256")) assert ( obj_id.hex() == "866878b165607851782d8d233edf0c261172ff67926330d3bbd10c705b92d24f"