diff --git a/swh/perfecthash/__init__.py b/swh/perfecthash/__init__.py --- a/swh/perfecthash/__init__.py +++ b/swh/perfecthash/__init__.py @@ -1,4 +1,4 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -33,6 +33,10 @@ def __del__(self): lib.shard_destroy(self.shard) + @staticmethod + def key_len(): + return lib.shard_key_len + def create(self, objects_count: int) -> "Shard": """Wipe out the content of the Read Shard. It must be followed by **object_count** calls to the **write** method otherwise the content @@ -62,7 +66,7 @@ """Create the perfect hash table the **lookup** method relies on to find the content of the objects. - It must be called after **create** an **write** otherwise the + It must be called after **create** and **write** otherwise the content of the Read Shard will be inconsistent. Returns: @@ -103,4 +107,6 @@ Returns: 0 on success, -1 on error. """ + if len(key) != Shard.key_len(): + raise ValueError(f"key length is {len(key)} instead of {Shard.key_len()}") return lib.shard_object_write(self.shard, key, object, len(object)) diff --git a/swh/perfecthash/build.py b/swh/perfecthash/build.py --- a/swh/perfecthash/build.py +++ b/swh/perfecthash/build.py @@ -1,4 +1,4 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -29,6 +29,7 @@ int shard_lookup_object_size(shard_t *shard, const char *key, uint64_t *object_size); int shard_lookup_object(shard_t *shard, char *object, uint64_t object_size); +extern const int shard_key_len; """ ) diff --git a/swh/perfecthash/hash.h b/swh/perfecthash/hash.h --- a/swh/perfecthash/hash.h +++ b/swh/perfecthash/hash.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021 The Software Heritage developers + * Copyright (C) 2021-2022 The Software Heritage developers * See the AUTHORS file at the top-level directory of this distribution * License: GNU General Public License version 3, or any later version * See top-level LICENSE file for more information @@ -11,7 +11,8 @@ #define SHARD_OFFSET_MAGIC 32 #define SHARD_OFFSET_HEADER 512 -#define SHARD_KEY_LEN 32 +#define SHARD_KEY_LEN 20 +extern const int shard_key_len; #define SHARD_MAGIC "SWHShard" #define SHARD_VERSION 1 diff --git a/swh/perfecthash/hash.c b/swh/perfecthash/hash.c --- a/swh/perfecthash/hash.c +++ b/swh/perfecthash/hash.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021 The Software Heritage developers + * Copyright (C) 2021-2022 The Software Heritage developers * See the AUTHORS file at the top-level directory of this distribution * License: GNU General Public License version 3, or any later version * See top-level LICENSE file for more information @@ -18,6 +18,8 @@ #include "swh/perfecthash/hash.h" +const int shard_key_len = SHARD_KEY_LEN; + #ifdef HASH_DEBUG #define debug(...) printf(__VA_ARGS__) #else diff --git a/swh/perfecthash/test_hash.cpp b/swh/perfecthash/test_hash.cpp --- a/swh/perfecthash/test_hash.cpp +++ b/swh/perfecthash/test_hash.cpp @@ -113,6 +113,10 @@ ASSERT_GE(close(open(tmpfile.c_str(), O_CREAT, 0777)), 0); ASSERT_GE(truncate(tmpfile.c_str(), 10 * 1024 * 1024), 0); + std::random_device dev; + std::mt19937 prng(dev()); + std::uniform_int_distribution rand(0, 80 * 1024); + // // Populate a Read Shard with multiple objects (objects_count) // The object content and their keys are from a random source @@ -125,8 +129,8 @@ int objects_count = 10; ASSERT_GE(shard_create(shard, objects_count), 0); for (int i = 0; i < objects_count; i++) { - std::string key = gen_random(32); - std::string object = gen_random(50); + std::string key = gen_random(SHARD_KEY_LEN); + std::string object = gen_random(rand(prng)); key2object[key] = object; std::cout << key << std::endl; ASSERT_GE(shard_object_write(shard, key.c_str(), object.c_str(), diff --git a/swh/perfecthash/tests/test_hash.py b/swh/perfecthash/tests/test_hash.py --- a/swh/perfecthash/tests/test_hash.py +++ b/swh/perfecthash/tests/test_hash.py @@ -1,4 +1,4 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -18,10 +18,10 @@ os.truncate(f, 10 * 1024 * 1024) s = Shard(f).create(2) - keyA = b"A" * 32 + keyA = b"A" * Shard.key_len() objectA = b"AAAA" s.write(keyA, objectA) - keyB = b"B" * 32 + keyB = b"B" * Shard.key_len() objectB = b"BBBB" s.write(keyB, objectB) s.save() @@ -48,7 +48,7 @@ # def test_build_speed(request, tmpdir, payload): start = time.time() - os.system(f"cp {payload} {tmpdir}/shard ; rm {tmpdir}/shard") + os.system(f"cp {payload} {tmpdir}/shard") baseline = time.time() - start write_duration, build_duration, _ = shard_build(request, tmpdir, payload) duration = write_duration + build_duration @@ -108,8 +108,8 @@ size = 0 with open(payload, "rb") as f: while True: - key = f.read(32) - if len(key) < 32: + key = f.read(Shard.key_len()) + if len(key) < Shard.key_len(): break assert key not in objects object = f.read(random.randrange(512, object_max_size)) @@ -128,8 +128,8 @@ size = 0 with open(payload, "rb") as f: while True: - key = f.read(32) - if len(key) < 32: + key = f.read(Shard.key_len()) + if len(key) < Shard.key_len(): break if key not in objects: break