diff --git a/swh/perfecthash/__init__.py b/swh/perfecthash/__init__.py index f9d8c38..a0ba2a3 100644 --- a/swh/perfecthash/__init__.py +++ b/swh/perfecthash/__init__.py @@ -1,106 +1,112 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import NewType from cffi import FFI from swh.perfecthash._hash_cffi import lib Key = NewType("Key", bytes) HashObject = NewType("HashObject", bytes) class Shard: """Low level management for files indexed with a perfect hash table. This class allows creating a Read Shard by adding key/object pairs and looking up the content of an object when given the key. """ def __init__(self, path: str): """Initialize with an existing Read Shard. Args: path: path to an existing Read Shard file or device """ self.ffi = FFI() self.shard = lib.shard_init(path.encode("utf-8")) def __del__(self): lib.shard_destroy(self.shard) + @staticmethod + def key_len(): + return lib.shard_key_len + def create(self, objects_count: int) -> "Shard": """Wipe out the content of the Read Shard. It must be followed by **object_count** calls to the **write** method otherwise the content of the Read Shard will be inconsistent. When all objects are inserted, the Read Shard must be made persistent by calling the **save** method. Args: objects_count: number of objects in the Read Shard. Returns: self. """ assert lib.shard_create(self.shard, objects_count) != -1 return self def load(self) -> "Shard": """Open the Read Shard file in read-only mode. Returns: self. """ assert lib.shard_load(self.shard) != -1 return self def save(self) -> int: """Create the perfect hash table the **lookup** method relies on to find the content of the objects. - It must be called after **create** an **write** otherwise the + It must be called after **create** and **write** otherwise the content of the Read Shard will be inconsistent. Returns: 0 on success, -1 on error. """ return lib.shard_save(self.shard) def lookup(self, key: Key) -> HashObject: """Fetch the object matching the key in the Read Shard. Fetching an object is O(1): one lookup in the index to obtain the offset of the object in the Read Shard and one read to get the payload. Args: key: the key associated with the object to retrieve. Returns: the object as bytes. """ object_size_pointer = self.ffi.new("uint64_t*") lib.shard_lookup_object_size(self.shard, key, object_size_pointer) object_size = object_size_pointer[0] object_pointer = self.ffi.new("char[]", object_size) lib.shard_lookup_object(self.shard, object_pointer, object_size) return self.ffi.buffer(object_pointer, object_size) def write(self, key: Key, object: HashObject) -> int: """Add the key/object pair to the Read Shard. The **create** method must have been called prior to calling the **write** method. Args: key: the unique key associated with the object. object: the object Returns: 0 on success, -1 on error. """ + if len(key) != Shard.key_len(): + raise ValueError(f"key length is {len(key)} instead of {Shard.key_len()}") return lib.shard_object_write(self.shard, key, object, len(object)) diff --git a/swh/perfecthash/build.py b/swh/perfecthash/build.py index 016db22..ac8e59b 100644 --- a/swh/perfecthash/build.py +++ b/swh/perfecthash/build.py @@ -1,47 +1,48 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from cffi import FFI ffibuilder = FFI() # cdef() expects a single string declaring the C types, functions and # globals needed to use the shared object. It must be in valid C syntax. # # The following is only the necessary part parsed by cffi to generate python bindings. # ffibuilder.cdef( """ typedef struct shard_t shard_t; shard_t* shard_init(const char* path); int shard_destroy(shard_t* shard); int shard_create(shard_t* shard, uint64_t objects_count); int shard_object_write(shard_t* shard, const char* key, const char* object, uint64_t object_size); int shard_save(shard_t* shard); int shard_load(shard_t* shard); int shard_lookup_object_size(shard_t *shard, const char *key, uint64_t *object_size); int shard_lookup_object(shard_t *shard, char *object, uint64_t object_size); +extern const int shard_key_len; """ ) ffibuilder.set_source( "swh.perfecthash._hash_cffi", """ #include "swh/perfecthash/hash.h" """, sources=["swh/perfecthash/hash.c"], include_dirs=["."], libraries=["cmph"], extra_compile_args=["-D_FILE_OFFSET_BITS=64"], ) # library name, for the linker if __name__ == "__main__": ffibuilder.compile(verbose=True) diff --git a/swh/perfecthash/hash.c b/swh/perfecthash/hash.c index 6912ec0..69de9d1 100644 --- a/swh/perfecthash/hash.c +++ b/swh/perfecthash/hash.c @@ -1,436 +1,438 @@ /* - * Copyright (C) 2021 The Software Heritage developers + * Copyright (C) 2021-2022 The Software Heritage developers * See the AUTHORS file at the top-level directory of this distribution * License: GNU General Public License version 3, or any later version * See top-level LICENSE file for more information */ #include #include #include #include #include #include #include #include #include #include #include "swh/perfecthash/hash.h" +const int shard_key_len = SHARD_KEY_LEN; + #ifdef HASH_DEBUG #define debug(...) printf(__VA_ARGS__) #else #define debug(...) #endif #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ uint64_t ntohq(uint64_t v) { return __builtin_bswap64(v); } uint64_t htonq(uint64_t v) { return __builtin_bswap64(v); } #else /* __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ */ uint64_t ntohq(uint64_t v) { return v; } uint64_t htonq(uint64_t v) { return v; } #endif /* __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ */ /*********************************************************** * wrappers around FILE functions that: * - return -1 on error * - print a meaningful message when an error occurs * */ int shard_open(shard_t *shard, const char *mode) { shard->f = fopen(shard->path, mode); if (shard->f == NULL) { printf("shard_open: open(%s, %s): %s\n", shard->path, mode, strerror(errno)); return -1; } else return 0; } int shard_close(shard_t *shard) { if (shard->f == NULL) return 0; int r = fclose(shard->f); if (r < 0) printf("shard_close: fclose(%p): %s\n", shard->f, strerror(errno)); return r; } int shard_seek(shard_t *shard, uint64_t offset, int whence) { if (offset > INT64_MAX) { printf("shard_seek: %ld > %ld (INT64_MAX)", offset, INT64_MAX); return -1; } int r = fseeko(shard->f, offset, whence); if (r < 0) printf("shard_seek: fseeko(%p, %ld, %d): %s\n", shard->f, offset, whence, strerror(errno)); return r; } uint64_t shard_tell(shard_t *shard) { off_t r = ftello(shard->f); if (r < 0) printf("shard_tell: ftello(%p): %s\n", shard->f, strerror(errno)); return r; } int shard_read(shard_t *shard, void *ptr, uint64_t size) { uint64_t read; if ((read = fread(ptr, 1, size, shard->f)) != size) { printf("shard_read: read %ld instead of %ld\n", read, size); return -1; } return 0; } int shard_read_uint64_t(shard_t *shard, uint64_t *ptr) { uint64_t n_size; if (shard_read(shard, &n_size, sizeof(uint64_t)) < 0) { printf("shard_read_uint64_t: shard_read\n"); return -1; } *ptr = ntohq(n_size); return 0; } int shard_write(shard_t *shard, const void *ptr, uint64_t nmemb) { uint64_t wrote; if ((wrote = fwrite(ptr, 1, nmemb, shard->f)) != nmemb) { printf("shard_write: wrote %ld instead of %ld\n", wrote, nmemb); return -1; } return 0; } /*********************************************************** * load or save a SHARD_MAGIC */ int shard_magic_load(shard_t *shard) { if (shard_seek(shard, 0, SEEK_SET) < 0) { printf("shard_magic_load: seek\n"); return -1; } char magic[sizeof(SHARD_MAGIC)]; if (shard_read(shard, (void *)magic, sizeof(SHARD_MAGIC)) < 0) { printf("shard_magic_load: read\n"); return -1; } if (memcmp(magic, SHARD_MAGIC, sizeof(SHARD_MAGIC)) != 0) { printf("shard_magic_load: memcmp(%.*s, %s)\n", (int)sizeof(SHARD_MAGIC), magic, SHARD_MAGIC); return -1; } return 0; } int shard_magic_save(shard_t *shard) { if (shard_seek(shard, 0, SEEK_SET) < 0) { printf("shard_magic_save: seek\n"); return -1; } if (shard_write(shard, (void *)SHARD_MAGIC, sizeof(SHARD_MAGIC)) < 0) { printf("shard_magic_save: write\n"); return -1; } return 0; } /*********************************************************** * load or save a shard_header_t */ int shard_header_print(shard_header_t *header) { #define PRINT(name) debug("shard_header_print: " #name " %ld\n", header->name) PRINT(version); PRINT(objects_count); PRINT(objects_position); PRINT(objects_size); PRINT(index_position); PRINT(index_size); PRINT(hash_position); #undef PRINT return 0; } int shard_header_load(shard_t *shard) { if (shard_seek(shard, SHARD_OFFSET_MAGIC, SEEK_SET) < 0) { printf("shard_header_load\n"); return -1; } shard_header_t header; #define LOAD(name) \ if (shard_read(shard, (void *)&header.name, sizeof(uint64_t)) < 0) { \ printf("shard_header_load: " #name "\n"); \ return -1; \ } \ shard->header.name = ntohq(header.name) LOAD(version); LOAD(objects_count); LOAD(objects_position); LOAD(objects_size); LOAD(index_position); LOAD(index_size); LOAD(hash_position); #undef LOAD shard_header_print(&shard->header); if (shard->header.version != SHARD_VERSION) { printf("shard_header_load: unexpected version, got %ld instead of %d\n", shard->header.version, SHARD_VERSION); return -1; } return 0; } int shard_header_save(shard_t *shard) { if (shard_seek(shard, SHARD_OFFSET_MAGIC, SEEK_SET) < 0) { printf("shard_header_save\n"); return -1; } shard_header_print(&shard->header); shard_header_t header; #define SAVE(name) \ header.name = htonq(shard->header.name); \ if (shard_write(shard, (void *)&header.name, sizeof(uint64_t)) < 0) { \ printf("shard_header_save " #name "\n"); \ return -1; \ } SAVE(version); SAVE(objects_count); SAVE(objects_position); SAVE(objects_size); SAVE(index_position); SAVE(index_size); SAVE(hash_position); #undef SAVE return 0; } int shard_header_reset(shard_header_t *header) { memset((void *)header, '\0', sizeof(shard_header_t)); header->version = SHARD_VERSION; header->objects_position = SHARD_OFFSET_HEADER; return 0; } /*********************************************************** * Create the Read Shard */ int shard_object_write(shard_t *shard, const char *key, const char *object, uint64_t object_size) { // save key & index to later build the hash shard_index_t *index = &shard->index[shard->index_offset]; memcpy((void *)index->key, key, SHARD_KEY_LEN); index->object_offset = shard_tell(shard); shard->index_offset++; // write the object size and the object itself uint64_t n_object_size = htonq(object_size); if (shard_write(shard, (void *)&n_object_size, sizeof(uint64_t)) < 0) { printf("shard_object_write: object_size\n"); return -1; } if (shard_write(shard, (void *)object, object_size) < 0) { printf("shard_object_write: object\n"); return -1; } return 0; } static int io_read(void *data, char **key, cmph_uint32 *keylen) { shard_t *shard = (shard_t *)data; *key = shard->index[shard->index_offset].key; *keylen = SHARD_KEY_LEN; shard->index_offset++; return shard->index_offset >= shard->header.objects_count ? -1 : 0; } static void io_dispose(void *data, char *key, cmph_uint32 keylen) {} static void io_rewind(void *data) { shard_t *shard = (shard_t *)data; shard->index_offset = 0; } static cmph_io_adapter_t *io_adapter(shard_t *shard) { cmph_io_adapter_t *key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t)); if (key_source == NULL) return NULL; key_source->data = (void *)shard; key_source->nkeys = shard->header.objects_count; key_source->read = io_read; key_source->dispose = io_dispose; key_source->rewind = io_rewind; return key_source; } int shard_hash_create(shard_t *shard) { shard->source = io_adapter(shard); shard->config = cmph_config_new(shard->source); cmph_config_set_algo(shard->config, CMPH_CHD_PH); cmph_config_set_keys_per_bin(shard->config, 1); cmph_config_set_b(shard->config, 4); shard->hash = cmph_new(shard->config); return 0; } int shard_index_save(shard_t *shard) { shard->header.index_position = shard->header.objects_position + shard->header.objects_size; debug("shard_index_save: index_position %ld\n", shard->header.index_position); assert(shard->header.index_position == shard_tell(shard)); cmph_uint32 count = cmph_size(shard->hash); debug("shard_index_save: count = %d\n", count); shard->header.index_size = count * sizeof(uint64_t); uint64_t *index = (uint64_t *)calloc(1, shard->header.index_size); for (uint64_t i = 0; i < shard->index_offset; i++) { cmph_uint32 h = cmph_search(shard->hash, shard->index[i].key, SHARD_KEY_LEN); debug("shard_index_save: i = %ld, h = %d, offset = %ld\n", i, h, shard->index[i].object_offset); assert(h < count); index[h] = htonq(shard->index[i].object_offset); } uint64_t index_size = shard->header.index_size; if (shard_write(shard, (void *)index, index_size) < 0) { printf("shard_index_save\n"); return -1; } free(index); return 0; } int shard_hash_save(shard_t *shard) { shard->header.hash_position = shard->header.index_position + shard->header.index_size; debug("shard_hash_save: hash_position %ld\n", shard->header.hash_position); cmph_dump(shard->hash, shard->f); return 0; } int shard_save(shard_t *shard) { shard->header.objects_size = shard_tell(shard) - shard->header.objects_position; return (shard_hash_create(shard) < 0 || shard_index_save(shard) < 0 || shard_hash_save(shard) < 0 || shard_header_save(shard) < 0 || shard_magic_save(shard) < 0) ? -1 : 0; } int shard_reset(shard_t *shard) { if (shard_header_reset(&shard->header) < 0) return -1; return shard_seek(shard, SHARD_OFFSET_HEADER, SEEK_SET); } int shard_create(shard_t *shard, uint64_t objects_count) { if (shard_open(shard, "w+") < 0) return -1; if (shard_reset(shard) < 0) return -1; shard->header.objects_count = objects_count; shard->index = (shard_index_t *)malloc(sizeof(shard_index_t) * objects_count); return 0; } /********************************************************** * Lookup objects from a Read Shard */ int shard_lookup_object_size(shard_t *shard, const char *key, uint64_t *object_size) { debug("shard_lookup_object_size\n"); cmph_uint32 h = cmph_search(shard->hash, key, SHARD_KEY_LEN); debug("shard_lookup_object_size: h = %d\n", h); uint64_t index_offset = shard->header.index_position + h * sizeof(uint64_t); debug("shard_lookup_object_size: index_offset = %ld\n", index_offset); if (shard_seek(shard, index_offset, SEEK_SET) < 0) { printf("shard_lookup_object_size: index_offset\n"); return -1; } uint64_t object_offset; if (shard_read_uint64_t(shard, &object_offset) < 0) { printf("shard_lookup_object_size: object_offset\n"); return -1; } debug("shard_lookup_object_size: object_offset = %ld\n", object_offset); if (shard_seek(shard, object_offset, SEEK_SET) < 0) { printf("shard_lookup_object_size: object_offset\n"); return -1; } if (shard_read_uint64_t(shard, object_size) < 0) { printf("shard_lookup_object_size: object_size\n"); return -1; } debug("shard_lookup_object_size: object_size = %ld\n", *object_size); return 0; } int shard_lookup_object(shard_t *shard, char *object, uint64_t object_size) { if (shard_read(shard, (void *)object, object_size) < 0) { printf("shard_lookup_object: object\n"); return -1; } return 0; } int shard_hash_load(shard_t *shard) { if (shard_seek(shard, shard->header.hash_position, SEEK_SET) < 0) { printf("shard_hash_load\n"); return -1; } debug("shard_hash_load: hash_position %ld\n", shard->header.hash_position); shard->hash = cmph_load(shard->f); if (shard->hash == NULL) { printf("shard_hash_load: cmph_load\n"); return -1; } return 0; } int shard_load(shard_t *shard) { debug("shard_load\n"); if (shard_open(shard, "r") < 0) return -1; if (shard_magic_load(shard) < 0) return -1; if (shard_header_load(shard) < 0) return -1; return shard_hash_load(shard); } /********************************************************** * Initialize and destroy a Read Shard */ shard_t *shard_init(const char *path) { debug("shard_init\n"); shard_t *shard = (shard_t *)malloc(sizeof(shard_t)); if (shard == NULL) return NULL; memset((void *)shard, '\0', sizeof(shard_t)); shard->path = strdup(path); return shard; } int shard_destroy(shard_t *shard) { if (shard->source) free(shard->source); if (shard->config) cmph_config_destroy(shard->config); if (shard->hash) cmph_destroy(shard->hash); if (shard->index) free(shard->index); free(shard->path); int r = shard_close(shard); free(shard); return r; } diff --git a/swh/perfecthash/hash.h b/swh/perfecthash/hash.h index 72c5735..98d7551 100644 --- a/swh/perfecthash/hash.h +++ b/swh/perfecthash/hash.h @@ -1,58 +1,59 @@ /* - * Copyright (C) 2021 The Software Heritage developers + * Copyright (C) 2021-2022 The Software Heritage developers * See the AUTHORS file at the top-level directory of this distribution * License: GNU General Public License version 3, or any later version * See top-level LICENSE file for more information */ #include #include #include #define SHARD_OFFSET_MAGIC 32 #define SHARD_OFFSET_HEADER 512 -#define SHARD_KEY_LEN 32 +#define SHARD_KEY_LEN 20 +extern const int shard_key_len; #define SHARD_MAGIC "SWHShard" #define SHARD_VERSION 1 typedef struct { uint64_t version; uint64_t objects_count; uint64_t objects_position; uint64_t objects_size; uint64_t index_position; uint64_t index_size; uint64_t hash_position; } shard_header_t; typedef struct { char key[SHARD_KEY_LEN]; uint64_t object_offset; } shard_index_t; typedef struct { char *path; FILE *f; shard_header_t header; cmph_t *hash; // The following fields are only used when creating the Read Shard cmph_io_adapter_t *source; cmph_config_t *config; shard_index_t *index; uint64_t index_offset; } shard_t; shard_t *shard_init(const char *path); int shard_destroy(shard_t *shard); int shard_create(shard_t *shard, uint64_t objects_count); int shard_object_write(shard_t *shard, const char *key, const char *object, uint64_t object_size); int shard_save(shard_t *shard); int shard_load(shard_t *shard); int shard_lookup_object_size(shard_t *shard, const char *key, uint64_t *object_size); int shard_lookup_object(shard_t *shard, char *object, uint64_t object_size); diff --git a/swh/perfecthash/test_hash.cpp b/swh/perfecthash/test_hash.cpp index 0a474ac..843557b 100644 --- a/swh/perfecthash/test_hash.cpp +++ b/swh/perfecthash/test_hash.cpp @@ -1,166 +1,170 @@ /* * Copyright (C) 2021 The Software Heritage developers * See the AUTHORS file at the top-level directory of this distribution * License: GNU General Public License version 3, or any later version * See top-level LICENSE file for more information */ #include #include #include #include #include #include extern "C" { #include "hash.h" } using namespace std::experimental; filesystem::path create_temporary_directory(unsigned long long max_tries = 1000) { auto tmp_dir = filesystem::temp_directory_path(); unsigned long long i = 0; std::random_device dev; std::mt19937 prng(dev()); std::uniform_int_distribution rand(0); filesystem::path path; while (true) { std::stringstream ss; ss << std::hex << rand(prng); path = tmp_dir / ss.str(); // true if the directory was created. if (filesystem::create_directory(path)) { break; } if (i == max_tries) { throw std::runtime_error("could not find non-existing directory"); } i++; } return path; } std::string gen_random(const int len) { std::string tmp_s; static const char alphanum[] = "0123456789" "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz"; tmp_s.reserve(len); std::random_device dev; std::mt19937 prng(dev()); std::uniform_int_distribution rand(0, strlen(alphanum) - 1); for (int i = 0; i < len; ++i) tmp_s += alphanum[rand(prng)]; return tmp_s; } TEST(HashTest, One) { auto tmpdir = create_temporary_directory(); filesystem::path tmpfile = tmpdir / std::string("shard"); ASSERT_GE(close(open(tmpfile.c_str(), O_CREAT, 0777)), 0); ASSERT_GE(truncate(tmpfile.c_str(), 10 * 1024 * 1024), 0); // // Create a Read Shard and write a single object // shard_t *shard = shard_init(tmpfile.c_str()); ASSERT_NE(shard, nullptr); ASSERT_GE(shard_create(shard, 1), 0); const char *keyA = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; const char *objectA = "AAAAA"; size_t objectA_size = strlen(objectA); ASSERT_GE(shard_object_write(shard, keyA, objectA, objectA_size), 0); ASSERT_GE(shard_save(shard), 0); size_t found_size = 0; ASSERT_GE(shard_lookup_object_size(shard, keyA, &found_size), 0); ASSERT_EQ(objectA_size, found_size); char *found = (char *)malloc(found_size); ASSERT_GE(shard_lookup_object(shard, found, found_size), 0); ASSERT_EQ(memcmp((const void *)objectA, (const void *)found, found_size), 0); free(found); ASSERT_GE(shard_destroy(shard), 0); // // Open the Read Shard created above and verify the object can be // looked up. // shard = shard_init(tmpfile.c_str()); ASSERT_NE(shard, nullptr); ASSERT_GE(shard_load(shard), 0); found_size = 0; ASSERT_GE(shard_lookup_object_size(shard, keyA, &found_size), 0); ASSERT_EQ(objectA_size, found_size); found = (char *)malloc(found_size); ASSERT_GE(shard_lookup_object(shard, found, found_size), 0); ASSERT_EQ(memcmp((const void *)objectA, (const void *)found, found_size), 0); free(found); ASSERT_GE(shard_destroy(shard), 0); filesystem::remove_all(tmpdir); } TEST(HashTest, Many) { auto tmpdir = create_temporary_directory(); filesystem::path tmpfile = tmpdir / std::string("shard"); ASSERT_GE(close(open(tmpfile.c_str(), O_CREAT, 0777)), 0); ASSERT_GE(truncate(tmpfile.c_str(), 10 * 1024 * 1024), 0); + std::random_device dev; + std::mt19937 prng(dev()); + std::uniform_int_distribution rand(0, 80 * 1024); + // // Populate a Read Shard with multiple objects (objects_count) // The object content and their keys are from a random source // A map is kept in memory in key2object for verification // std::map key2object; shard_t *shard = shard_init(tmpfile.c_str()); ASSERT_NE(shard, nullptr); int objects_count = 10; ASSERT_GE(shard_create(shard, objects_count), 0); for (int i = 0; i < objects_count; i++) { - std::string key = gen_random(32); - std::string object = gen_random(50); + std::string key = gen_random(SHARD_KEY_LEN); + std::string object = gen_random(rand(prng)); key2object[key] = object; std::cout << key << std::endl; ASSERT_GE(shard_object_write(shard, key.c_str(), object.c_str(), object.length()), 0); } ASSERT_GE(shard_save(shard), 0); ASSERT_GE(shard_destroy(shard), 0); // // Open the Read Shard for lookups, lookup every key inserted // and verify the matching object has the expected content. // shard = shard_init(tmpfile.c_str()); ASSERT_NE(shard, nullptr); ASSERT_GE(shard_load(shard), 0); for (std::pair p : key2object) { size_t found_size = 0; ASSERT_GE(shard_lookup_object_size(shard, p.first.c_str(), &found_size), 0); ASSERT_EQ(p.second.length(), found_size); char *found = (char *)malloc(found_size); ASSERT_GE(shard_lookup_object(shard, found, found_size), 0); ASSERT_EQ(memcmp((const void *)p.second.c_str(), (const void *)found, found_size), 0); free(found); } ASSERT_GE(shard_destroy(shard), 0); filesystem::remove_all(tmpdir); } int main(int argc, char **argv) { testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/swh/perfecthash/tests/test_hash.py b/swh/perfecthash/tests/test_hash.py index ad019c0..f09feeb 100644 --- a/swh/perfecthash/tests/test_hash.py +++ b/swh/perfecthash/tests/test_hash.py @@ -1,149 +1,149 @@ -# Copyright (C) 2021 The Software Heritage developers +# Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import os import random import time import pytest from swh.perfecthash import Shard def test_all(tmpdir): f = f"{tmpdir}/shard" open(f, "w").close() os.truncate(f, 10 * 1024 * 1024) s = Shard(f).create(2) - keyA = b"A" * 32 + keyA = b"A" * Shard.key_len() objectA = b"AAAA" s.write(keyA, objectA) - keyB = b"B" * 32 + keyB = b"B" * Shard.key_len() objectB = b"BBBB" s.write(keyB, objectB) s.save() del s s = Shard(f).load() assert s.lookup(keyA) == objectA assert s.lookup(keyB) == objectB del s @pytest.fixture def payload(request): size = request.config.getoption("--shard-size") path = request.config.getoption("--shard-path") if not os.path.exists(path) or os.path.getsize(path) != size * 1024 * 1024: os.system(f"dd if=/dev/urandom of={path} count={size} bs=1024k") return path # # PYTHONMALLOC=malloc valgrind --tool=memcheck .tox/py3/bin/pytest \ # -k test_build_speed swh/perfecthash/tests/test_hash.py |& tee /tmp/v # def test_build_speed(request, tmpdir, payload): start = time.time() - os.system(f"cp {payload} {tmpdir}/shard ; rm {tmpdir}/shard") + os.system(f"cp {payload} {tmpdir}/shard") baseline = time.time() - start write_duration, build_duration, _ = shard_build(request, tmpdir, payload) duration = write_duration + build_duration print( f"baseline {baseline}, " f"write_duration {write_duration}, " f"build_duration {build_duration}, " f"total_duration {duration}" ) # # According to the docs/benchmarks.rst analysis, the duration is # below 5 times the baseline time This assertion is here to ensure # we do not not regress in the future... # assert duration < baseline * 5 def test_lookup_speed(request, tmpdir, payload): _, _, objects = shard_build(request, tmpdir, payload) for i in range(request.config.getoption("--shard-count")): os.system(f"cp {tmpdir}/shard {tmpdir}/shard{i}") start = time.time() shard_lookups(request, tmpdir, objects) duration = time.time() - start lookups = request.config.getoption("--lookups") key_per_sec = lookups / duration print(f"key lookups speed = {key_per_sec:.2f}/s") def shard_lookups(request, tmpdir, objects): shard_path = f"{tmpdir}/shard" shards = [] for i in range(request.config.getoption("--shard-count")): shards.append(Shard(f"{shard_path}{i}").load()) lookups = request.config.getoption("--lookups") count = 0 while True: for key, object_size in objects.items(): if count >= lookups: return for shard in shards: object = shard.lookup(key) assert len(object) == object_size count += 1 def shard_build(request, tmpdir, payload): shard_size = request.config.getoption("--shard-size") * 1024 * 1024 shard_path = f"{tmpdir}/shard" open(shard_path, "w").close() os.truncate(shard_path, shard_size * 2) object_max_size = request.config.getoption("--object-max-size") objects = {} count = 0 size = 0 with open(payload, "rb") as f: while True: - key = f.read(32) - if len(key) < 32: + key = f.read(Shard.key_len()) + if len(key) < Shard.key_len(): break assert key not in objects object = f.read(random.randrange(512, object_max_size)) if len(object) < 512: break objects[key] = len(object) size += len(object) count += 1 print(f"number of objects = {count}, total size = {size}") assert size < shard_size start = time.time() shard = Shard(shard_path).create(len(objects)) count = 0 size = 0 with open(payload, "rb") as f: while True: - key = f.read(32) - if len(key) < 32: + key = f.read(Shard.key_len()) + if len(key) < Shard.key_len(): break if key not in objects: break object = f.read(objects[key]) assert len(object) == objects[key] count += 1 size += len(object) assert shard.write(key, object) >= 0, ( f"object count {count}/{len(objects)}, " f"size {size}, " f"object size {len(object)}" ) write_duration = time.time() - start start = time.time() shard.save() build_duration = time.time() - start return write_duration, build_duration, objects