diff --git a/.gitignore b/.gitignore --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,9 @@ version.txt .mypy_cache/ .vscode/ +*.o +swh/perfecthash/hash +swh/perfecthash/hash.gcda +swh/perfecthash/hash.gcno +swh/perfecthash/hash.lcov +swh/perfecthash/html diff --git a/AUTHORS b/AUTHORS --- a/AUTHORS +++ b/AUTHORS @@ -1,3 +1,3 @@ -Copyright (C) 2019 The Software Heritage developers +Copyright (C) 2021 The Software Heritage developers See http://www.softwareheritage.org/ for more information. diff --git a/MANIFEST.in b/MANIFEST.in --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,6 @@ include Makefile include requirements*.txt include version.txt -include README.md +include README.rst recursive-include swh py.typed +include swh/perfecthash/hash.[ch] diff --git a/docs/README.rst b/docs/README.rst deleted file mode 100644 --- a/docs/README.rst +++ /dev/null @@ -1,4 +0,0 @@ -Software Heritage - Python module template -========================================== - -Python module template, used as skeleton to create new modules. diff --git a/docs/index.rst b/docs/index.rst --- a/docs/index.rst +++ b/docs/index.rst @@ -1,15 +1,16 @@ -.. _swh-py-template: +.. _swh-perfecthash: -.. include:: README.rst +Software Heritage - Object storage perfect hash +=============================================== + +Low level management for read-only content-addressable object storage +indexed with a perfect hash table. -.. toctree:: - :maxdepth: 2 - :caption: Contents: +Reference Documentation +----------------------- -Indices and tables ------------------- +.. toctree:: + :maxdepth: 2 -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` + /apidoc/swh.perfecthash diff --git a/mypy.ini b/mypy.ini --- a/mypy.ini +++ b/mypy.ini @@ -11,5 +11,8 @@ [mypy-pytest.*] ignore_missing_imports = True -# [mypy-add_your_lib_here.*] -# ignore_missing_imports = True +[mypy-_hash_cffi.*] +ignore_missing_imports = True + +[mypy-cffi.*] +ignore_missing_imports = True diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ # should match https://pypi.python.org/pypi names. For the full spec or # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html +cffi diff --git a/setup.py b/setup.py --- a/setup.py +++ b/setup.py @@ -48,9 +48,10 @@ packages=find_packages(), # packages's modules install_requires=parse_requirements(None, "swh"), tests_require=parse_requirements("test"), - setup_requires=["setuptools-scm"], + setup_requires=["setuptools-scm", "cffi"], use_scm_version=True, extras_require={"testing": parse_requirements("test")}, + cffi_modules=["swh/perfecthash/build.py:ffibuilder"], classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", diff --git a/swh/perfecthash/.clang-format b/swh/perfecthash/.clang-format new file mode 100644 --- /dev/null +++ b/swh/perfecthash/.clang-format @@ -0,0 +1,2 @@ +--- +IndentWidth: 4 diff --git a/swh/perfecthash/Makefile b/swh/perfecthash/Makefile new file mode 100644 --- /dev/null +++ b/swh/perfecthash/Makefile @@ -0,0 +1,20 @@ +CFLAGS=-DHASH_DEBUG -Wall -I../.. -g -std=c++17 -fprofile-arcs -ftest-coverage +LDFLAGS=-lcmph -lgtest -lpthread -lstdc++ -lstdc++fs -fprofile-arcs -ftest-coverage + +hash: hash.o test_hash.o + $(CXX) -o $@ $^ $(LDFLAGS) + +hash.c: hash.h +test_hash.cpp: hash.h + +format: + clang-format -i hash.c hash.h test_hash.cpp + +check: hash + valgrind --leak-check=full --tool=memcheck ./hash + lcov -d . -c -o hash.lcov + rm -fr html ; genhtml -o html hash.lcov + firefox html/perfecthash/hash.c.gcov.html + +clean: + rm -f *.o hash diff --git a/swh/perfecthash/__init__.py b/swh/perfecthash/__init__.py --- a/swh/perfecthash/__init__.py +++ b/swh/perfecthash/__init__.py @@ -0,0 +1,93 @@ +from _hash_cffi import lib +from cffi import FFI + + +class Shard: + """Low level management for files indexed with a perfect hash table. + + This class allows creating a Read Shard by adding key/object pairs + and looking up the content of an object when given the key. + """ + + def __init__(self, path): + """Initialize with an existing Read Shard. + + Args: + path (str): path to an existing Read Shard file or device + + """ + self.ffi = FFI() + self.shard = lib.shard_init(path.encode("utf-8")) + + def __del__(self): + lib.shard_destroy(self.shard) + + def create(self, objects_count): + """Wipe out the content of the Read Shard. It must be followed by + **object_count** calls to the **write** method otherwise the content + of the Read Shard will be inconsistent. When all objects are inserted, + the Read Shard must be made persistent by calling the **save** method. + + Args: + objects_count (int): number of objects in the Read Shard. + + Returns: + self. + + """ + lib.shard_create(self.shard, objects_count) + return self + + def load(self): + """Open the Read Shard file in read-only mode. + + Returns: + self. + """ + lib.shard_load(self.shard) + return self + + def save(self): + """Create the perfect hash table the **lookup** method + relies on to find the content of the objects. + + It must be called after **create** an **write** otherwise the + content of the Read Shard will be inconsistent. + + Returns: + 0 on success, -1 on error. + """ + return lib.shard_save(self.shard) + + def lookup(self, key): + """Fetch the object matching the key in the Read Shard. + + Fetching an object is O(1): one lookup in the index to obtain + the offset of the object in the Read Shard and one read to get + the payload. + + Args: + key (32 bytes): the key associated with the object to retrieve. + + Returns: + the object as bytes. + """ + object_pointer = self.ffi.new("char**") + object_size_pointer = self.ffi.new("size_t*") + lib.shard_lookup(self.shard, key, object_pointer, object_size_pointer) + return self.ffi.buffer(object_pointer[0], object_size_pointer[0]) + + def write(self, key, object): + """Add the key/object pair to the Read Shard. + + The **create** method must have been called prior to calling + the **write** method. + + Args: + key (32 bytes): the unique key associated with the object. + object (bytes): the object + + Returns: + 0 on success, -1 on error. + """ + return lib.shard_object_write(self.shard, key, object, len(object)) diff --git a/swh/perfecthash/build.py b/swh/perfecthash/build.py new file mode 100644 --- /dev/null +++ b/swh/perfecthash/build.py @@ -0,0 +1,36 @@ +from cffi import FFI + +ffibuilder = FFI() + +# cdef() expects a single string declaring the C types, functions and +# globals needed to use the shared object. It must be in valid C syntax. +ffibuilder.cdef( + """ +typedef struct shard_t shard_t; + +shard_t* shard_init(const char* path); +int shard_destroy(shard_t* shard); + +int shard_create(shard_t* shard, size_t objects_count); +int shard_object_write(shard_t* shard, const char* key, + const char* object, size_t object_size); +int shard_save(shard_t* shard); + +int shard_load(shard_t* shard); +int shard_lookup(shard_t* shard, const char* key, char** object, size_t* object_size); + +""" +) + +ffibuilder.set_source( + "_hash_cffi", + """ + #include "swh/perfecthash/hash.h" + """, + sources=["swh/perfecthash/hash.c"], + include_dirs=["."], + libraries=["cmph"], +) # library name, for the linker + +if __name__ == "__main__": + ffibuilder.compile(verbose=True) diff --git a/swh/perfecthash/hash.h b/swh/perfecthash/hash.h new file mode 100644 --- /dev/null +++ b/swh/perfecthash/hash.h @@ -0,0 +1,44 @@ +#include +#include + +#define SHARD_OFFSET_HEADER 512 +#define SHARD_KEY_LEN 32 + +typedef struct { + size_t objects_count; + size_t objects_position; + size_t objects_size; + size_t index_position; + size_t index_size; + size_t hash_position; +} shard_header_t; + +typedef struct { + char key[SHARD_KEY_LEN]; + size_t object_offset; +} shard_index_t; + +typedef struct { + cmph_io_adapter_t *source; + cmph_config_t *config; + cmph_t *hash; + char *addr; + char *path; + shard_index_t *index; + size_t index_offset; + size_t file_size; + shard_header_t header; + size_t object_offset; +} shard_t; + +shard_t *shard_init(const char *path); +int shard_destroy(shard_t *shard); + +int shard_create(shard_t *shard, size_t objects_count); +int shard_object_write(shard_t *shard, const char *key, const char *object, + size_t object_size); +int shard_save(shard_t *shard); + +int shard_load(shard_t *shard); +int shard_lookup(shard_t *shard, const char *key, char **object, + size_t *object_size); diff --git a/swh/perfecthash/hash.c b/swh/perfecthash/hash.c new file mode 100644 --- /dev/null +++ b/swh/perfecthash/hash.c @@ -0,0 +1,258 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "swh/perfecthash/hash.h" + +#ifdef HASH_DEBUG +#define debug(...) printf(__VA_ARGS__) +#else +#define debug(...) +#endif + +size_t ntohq(size_t v) { return __builtin_bswap64(v); } + +size_t htonq(size_t v) { return __builtin_bswap64(v); } + +int shard_size_align(size_t s) { + int r = sizeof(size_t); + if (s % r) + return s + r - (s % r); + else + return s; +} + +int shard_header_load(shard_header_t *header, int fd) { + if (lseek(fd, 0, SEEK_SET) < 0) { + perror("lseek"); + return -1; + } + if (read(fd, (void *)header, sizeof(shard_header_t)) < 0) { + perror("read"); + return -1; + } + return 0; +} + +int shard_header_reset(shard_header_t *header) { + memset((void *)header, '\0', sizeof(shard_header_t)); + header->objects_position = SHARD_OFFSET_HEADER; + return 0; +} + +int shard_header_save(shard_t *shard) { + memcpy((void *)shard->addr, (void *)&(shard->header), + sizeof(shard_header_t)); + return 0; +} + +int shard_map(shard_t *shard, int flags) { + int fd = open(shard->path, flags); + if (fd < 0) { + perror("open"); + return -1; + } + struct stat sb; + if (fstat(fd, &sb) == -1) { + perror("fstat"); + return -1; + } + shard->file_size = sb.st_size; + int prot = flags == O_RDONLY ? PROT_READ : PROT_WRITE; + shard->addr = mmap(NULL, shard->file_size, prot, MAP_SHARED, fd, 0); + if (shard->addr == NULL) { + perror("mmap"); + return -1; + } + if (shard_header_load(&shard->header, fd) < 0) + return -1; + if (close(fd) < 0) { + perror("close"); + return -1; + } + return 0; +} + +int shard_uninit(shard_t *shard) { + if (shard->addr) + return munmap(shard->addr, shard->file_size); + else + return 0; +} + +int shard_object_write(shard_t *shard, const char *key, const char *object, + size_t object_size) { + // save key & index to later build the hash + shard_index_t *index = &shard->index[shard->index_offset]; + memcpy((void *)index->key, key, SHARD_KEY_LEN); + index->object_offset = shard->object_offset; + shard->index_offset++; + // write the object size and the object itself + size_t n_object_size = htonq(object_size); + debug("shard_object_write: object_size = %ld n_object_size = %ld\n", + object_size, n_object_size); + size_t object_offset = + shard->header.objects_position + shard->object_offset; + debug("shard_object_write: object_offset = %ld\n", object_offset); + size_t next_object_offset = shard->object_offset + shard_size_align(sizeof(size_t) + object_size); + if (next_object_offset >= shard->file_size) + return -1; + memcpy((void *)(shard->addr + object_offset), (const void *)&n_object_size, + sizeof(size_t)); + memcpy((void *)(shard->addr + object_offset + sizeof(size_t)), + (const void *)object, object_size); + shard->object_offset = next_object_offset; + return 0; +} + +static int io_read(void *data, char **key, cmph_uint32 *keylen) { + shard_t *shard = (shard_t *)data; + *key = shard->index[shard->index_offset].key; + *keylen = SHARD_KEY_LEN; + shard->index_offset++; + return shard->index_offset >= shard->header.objects_count ? -1 : 0; +} + +static void io_dispose(void *data, char *key, cmph_uint32 keylen) {} + +static void io_rewind(void *data) { + shard_t *shard = (shard_t *)data; + shard->index_offset = 0; +} + +static cmph_io_adapter_t *io_adapter(shard_t *shard) { + cmph_io_adapter_t *key_source = + (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t)); + if (key_source == NULL) + return NULL; + key_source->data = (void *)shard; + key_source->nkeys = shard->header.objects_count; + key_source->read = io_read; + key_source->dispose = io_dispose; + key_source->rewind = io_rewind; + return key_source; +} + +int shard_lookup(shard_t *shard, const char *key, char **object, + size_t *object_size) { + cmph_uint32 h = cmph_search(shard->hash, key, SHARD_KEY_LEN); + debug("shard_lookup: h = %d\n", h); + size_t index_offset = shard->header.index_position + h * sizeof(size_t); + debug("shard_lookup: index_offset = %ld\n", index_offset); + size_t object_offset = shard->header.objects_position + + ntohq(*(size_t *)(shard->addr + index_offset)); + debug("shard_lookup: object_offset = %ld\n", object_offset); + *object_size = ntohq(*(size_t *)(shard->addr + object_offset)); + debug("shard_lookup: object_size = %ld\n", *object_size); + *object = shard->addr + object_offset + sizeof(size_t); + return 0; +} + +int shard_hash_create(shard_t *shard) { + shard->source = io_adapter(shard); + shard->config = cmph_config_new(shard->source); + cmph_config_set_algo(shard->config, CMPH_CHD_PH); + cmph_config_set_keys_per_bin(shard->config, 1); + cmph_config_set_b(shard->config, 4); + shard->hash = cmph_new(shard->config); + return 0; +} + +int shard_index_save(shard_t *shard) { + shard->header.index_position = shard_size_align( + shard->header.objects_position + shard->header.objects_size); + cmph_uint32 count = cmph_size(shard->hash); + debug("shard_index_save: count = %d\n", count); + shard->header.index_size = count * sizeof(size_t); + size_t *index = (size_t *)malloc(shard->header.index_size); + for (size_t i = 0; i < shard->index_offset; i++) { + cmph_uint32 h = + cmph_search(shard->hash, shard->index[i].key, SHARD_KEY_LEN); + debug("shard_index_save: i = %ld, h = %d, offset = %ld\n", i, h, + shard->index[i].object_offset); + index[h] = htonq(shard->index[i].object_offset); + } + memcpy((void *)(shard->addr + shard->header.index_position), index, + shard->header.index_size); + free(index); + return 0; +} + +int shard_hash_save(shard_t *shard) { + shard->header.hash_position = shard_size_align( + shard->header.index_position + shard->header.index_size); + FILE *fd = fopen(shard->path, "r+"); + fseek(fd, shard->header.hash_position, SEEK_SET); + cmph_dump(shard->hash, fd); + fclose(fd); + return 0; +} + +int shard_hash_load(shard_t *shard) { + FILE *fd = fopen(shard->path, "r"); + fseek(fd, shard->header.hash_position, SEEK_SET); + shard->hash = cmph_load(fd); + assert(shard->hash != NULL); + fclose(fd); + return 0; +} + +int shard_save(shard_t *shard) { + shard->header.objects_size = shard->object_offset; + if (shard_hash_create(shard) < 0) + return -1; + if (shard_index_save(shard) < 0) + return -1; + if (shard_hash_save(shard) < 0) + return -1; + if (shard_header_save(shard) < 0) + return -1; + return 0; +} + +int shard_destroy(shard_t *shard) { + if (shard->source) + free(shard->source); + if (shard->config) + cmph_config_destroy(shard->config); + if (shard->hash) + cmph_destroy(shard->hash); + shard_uninit(shard); + if (shard->index) + free(shard->index); + free(shard->path); + free(shard); + return 0; +} + +shard_t *shard_init(const char *path) { + shard_t *shard = (shard_t *)malloc(sizeof(shard_t)); + if (shard == NULL) + return NULL; + memset((void *)shard, '\0', sizeof(shard_t)); + shard->path = strdup(path); + return shard; +} + +int shard_reset(shard_t *shard) { return shard_header_reset(&shard->header); } + +int shard_create(shard_t *shard, size_t objects_count) { + if (shard_map(shard, O_RDWR) < 0) + return -1; + if (shard_reset(shard) < 0) + return -1; + shard->header.objects_count = objects_count; + shard->index = + (shard_index_t *)malloc(sizeof(shard_index_t) * objects_count); + return 0; +} + +int shard_load(shard_t *shard) { + if (shard_map(shard, O_RDONLY) < 0) + return -1; + return shard_hash_load(shard); +} diff --git a/swh/perfecthash/test_hash.cpp b/swh/perfecthash/test_hash.cpp new file mode 100644 --- /dev/null +++ b/swh/perfecthash/test_hash.cpp @@ -0,0 +1,154 @@ +#include +#include +#include +#include +#include +#include + +extern "C" { +#include "hash.h" +} + +using namespace std::experimental; + +filesystem::path +create_temporary_directory(unsigned long long max_tries = 1000) { + auto tmp_dir = filesystem::temp_directory_path(); + unsigned long long i = 0; + std::random_device dev; + std::mt19937 prng(dev()); + std::uniform_int_distribution rand(0); + filesystem::path path; + while (true) { + std::stringstream ss; + ss << std::hex << rand(prng); + path = tmp_dir / ss.str(); + // true if the directory was created. + if (filesystem::create_directory(path)) { + break; + } + if (i == max_tries) { + throw std::runtime_error("could not find non-existing directory"); + } + i++; + } + return path; +} + +std::string gen_random(const int len) { + + std::string tmp_s; + static const char alphanum[] = "0123456789" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz"; + + tmp_s.reserve(len); + std::random_device dev; + std::mt19937 prng(dev()); + std::uniform_int_distribution rand(0, strlen(alphanum) - 1); + + for (int i = 0; i < len; ++i) + tmp_s += alphanum[rand(prng)]; + + return tmp_s; +} + +TEST(HashTest, One) { + auto tmpdir = create_temporary_directory(); + filesystem::path tmpfile = tmpdir / std::string("shard"); + ASSERT_GE(close(open(tmpfile.c_str(), O_CREAT, 0777)), 0); + ASSERT_GE(truncate(tmpfile.c_str(), 10 * 1024 * 1024), 0); + + shard_t *shard = shard_init(tmpfile.c_str()); + ASSERT_NE(shard, nullptr); + ASSERT_GE(shard_create(shard, 1), 0); + const char *keyA = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + const char *objectA = "AAAA"; + size_t objectA_size = strlen(objectA); + ASSERT_GE(shard_object_write(shard, keyA, objectA, objectA_size), 0); + ASSERT_GE(shard_save(shard), 0); + char *found = NULL; + size_t found_size = 0; + ASSERT_GE(shard_lookup(shard, keyA, &found, &found_size), 0); + ASSERT_EQ(objectA_size, found_size); + ASSERT_EQ(memcmp((const void *)objectA, (const void *)found, found_size), + 0); + ASSERT_GE(shard_destroy(shard), 0); + + shard = shard_init(tmpfile.c_str()); + ASSERT_NE(shard, nullptr); + ASSERT_GE(shard_load(shard), 0); + found = NULL; + found_size = 0; + ASSERT_GE(shard_lookup(shard, keyA, &found, &found_size), 0); + ASSERT_EQ(objectA_size, found_size); + ASSERT_EQ(memcmp((const void *)objectA, (const void *)found, found_size), + 0); + ASSERT_GE(shard_destroy(shard), 0); + + filesystem::remove_all(tmpdir); +} + +TEST(HashTest, TooBig) { + auto tmpdir = create_temporary_directory(); + filesystem::path tmpfile = tmpdir / std::string("shard"); + ASSERT_GE(close(open(tmpfile.c_str(), O_CREAT, 0777)), 0); + auto small_size = 10 * 1024; + ASSERT_GE(truncate(tmpfile.c_str(), small_size), 0); + + shard_t *shard = shard_init(tmpfile.c_str()); + ASSERT_NE(shard, nullptr); + ASSERT_GE(shard_create(shard, 1), 0); + const char *keyA = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + std::string objectA = gen_random(small_size * 2); + ASSERT_EQ(shard_object_write(shard, keyA, objectA.c_str(), objectA.length()), -1); + ASSERT_GE(shard_destroy(shard), 0); + + filesystem::remove_all(tmpdir); +} + +TEST(HashTest, Many) { + auto tmpdir = create_temporary_directory(); + filesystem::path tmpfile = tmpdir / std::string("shard"); + ASSERT_GE(close(open(tmpfile.c_str(), O_CREAT, 0777)), 0); + ASSERT_GE(truncate(tmpfile.c_str(), 10 * 1024 * 1024), 0); + + std::map key2object; + + shard_t *shard = shard_init(tmpfile.c_str()); + ASSERT_NE(shard, nullptr); + int objects_count = 10; + ASSERT_GE(shard_create(shard, objects_count), 0); + for (int i = 0; i < objects_count; i++) { + std::string key = gen_random(32); + std::string object = gen_random(50); + key2object[key] = object; + std::cout << key << std::endl; + ASSERT_GE(shard_object_write(shard, key.c_str(), object.c_str(), + object.length()), + 0); + } + ASSERT_GE(shard_save(shard), 0); + ASSERT_GE(shard_destroy(shard), 0); + + shard = shard_init(tmpfile.c_str()); + ASSERT_NE(shard, nullptr); + ASSERT_GE(shard_load(shard), 0); + for (std::pair p : key2object) { + char *found = NULL; + size_t found_size = 0; + ASSERT_GE(shard_lookup(shard, p.first.c_str(), &found, &found_size), 0); + ASSERT_EQ(p.second.length(), found_size); + ASSERT_EQ(memcmp((const void *)p.second.c_str(), (const void *)found, + found_size), + 0); + } + ASSERT_GE(shard_destroy(shard), 0); + + filesystem::remove_all(tmpdir); +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/swh/perfecthash/tests/test_hash.py b/swh/perfecthash/tests/test_hash.py new file mode 100644 --- /dev/null +++ b/swh/perfecthash/tests/test_hash.py @@ -0,0 +1,24 @@ +import os + +from swh.perfecthash import Shard + + +def test_all(tmpdir): + f = f"{tmpdir}/shard" + open(f, "w").close() + os.truncate(f, 10 * 1024 * 1024) + + s = Shard(f).create(2) + keyA = b"A" * 32 + objectA = b"AAAA" + s.write(keyA, objectA) + keyB = b"B" * 32 + objectB = b"BBBB" + s.write(keyB, objectB) + s.save() + del s + + s = Shard(f).load() + assert s.lookup(keyA) == objectA + assert s.lookup(keyB) == objectB + del s