diff --git a/AUTHORS b/AUTHORS --- a/AUTHORS +++ b/AUTHORS @@ -1,3 +1,3 @@ -Copyright (C) 2019 The Software Heritage developers +Copyright (C) 2021 The Software Heritage developers See http://www.softwareheritage.org/ for more information. diff --git a/MANIFEST.in b/MANIFEST.in --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,6 @@ include Makefile include requirements*.txt include version.txt -include README.md +include README.rst recursive-include swh py.typed +include swh/perfecthash/hash.[ch] diff --git a/docs/index.rst b/docs/index.rst --- a/docs/index.rst +++ b/docs/index.rst @@ -1,4 +1,4 @@ -.. _swh-py-template: +.. _swh-perfecthash: .. include:: README.rst diff --git a/mypy.ini b/mypy.ini --- a/mypy.ini +++ b/mypy.ini @@ -11,5 +11,8 @@ [mypy-pytest.*] ignore_missing_imports = True -# [mypy-add_your_lib_here.*] -# ignore_missing_imports = True +[mypy-_hash_cffi.*] +ignore_missing_imports = True + +[mypy-cffi.*] +ignore_missing_imports = True diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ # should match https://pypi.python.org/pypi names. For the full spec or # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html +cffi diff --git a/setup.py b/setup.py --- a/setup.py +++ b/setup.py @@ -48,9 +48,10 @@ packages=find_packages(), # packages's modules install_requires=parse_requirements(None, "swh"), tests_require=parse_requirements("test"), - setup_requires=["setuptools-scm"], + setup_requires=["setuptools-scm", "cffi"], use_scm_version=True, extras_require={"testing": parse_requirements("test")}, + cffi_modules=["swh/perfecthash/build.py:ffibuilder"], classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", diff --git a/swh/perfecthash/build.py b/swh/perfecthash/build.py new file mode 100644 --- /dev/null +++ b/swh/perfecthash/build.py @@ -0,0 +1,32 @@ +from cffi import FFI + +ffibuilder = FFI() + +# cdef() expects a single string declaring the C types, functions and +# globals needed to use the shared object. It must be in valid C syntax. +ffibuilder.cdef( + """ +#include "swh/perfecthash/hash.h" + +shard_t* shard_create(const char* path); +int shard_object_write(shard_t* shard, const char* object, size_t object_size, const char* key); +int shard_save(shard_t* shard); +int shard_destroy(shard_t* shard); + +shard_t* shard_load(const char* path); +const char* shard_lookup(shard_t* shard, const char* key) +""" +) + +ffibuilder.set_source( + "_hash_cffi", + """ + #include "swh/perfecthash/hash.h" + """, + sources=["swh/perfecthash/hash.c"], + include_dirs=["."], + libraries=["cmph"], +) # library name, for the linker + +if __name__ == "__main__": + ffibuilder.compile(verbose=True) diff --git a/swh/perfecthash/hash.h b/swh/perfecthash/hash.h new file mode 100644 --- /dev/null +++ b/swh/perfecthash/hash.h @@ -0,0 +1,30 @@ +#include + +#include "swh/perfecthash/hash.h" + +#define SHARD_OFFSET_HEADER 512 +#define SHARD_KEY_LEN 32 + +typedef struct { + size_t objects_count; + size_t objects_position; + size_t objects_size; + size_t index_position; + size_t index_size; + size_t hash_position; + size_t hash_size; +} shard_header_t; + +typedef struct { + char key[SHARD_KEY_LEN]; + size_t object_offset; +} shard_index_t; + +typedef struct { + char* addr; + shard_index_t* index; + size_t index_offset; + size_t file_size; + shard_header_t header; + size_t object_offset; +} shard_t; diff --git a/swh/perfecthash/hash.c b/swh/perfecthash/hash.c new file mode 100644 --- /dev/null +++ b/swh/perfecthash/hash.c @@ -0,0 +1,208 @@ +#include "swh/perfecthash/hash.h" + +#include +#include + +#include + +int shard_header_load(shard_header_t *header, int fd) { + if (lseek(fd, 0, SEEK_SET) < 0) { + perror("lseek"); + return -1; + } + if (read(fd, (void*)header, sizeof(shard_header_t)) < 0) { + perror("read"); + return -1; + } + return 0; +} + +int shard_header_reset(shard_header_t* header) { + memset((void*)header, '\0', sizeof(shard_header_t)); + return 0; +} + +int shard_header_save(shard_header_t *header, int fd) { + if (lseek(fd, 0, SEEK_SET) < 0) { + perror("seek"); + return -1; + } + if (write(fd, (void*)header, sizeof(shard_header_t)) < 0) { + perror("write"); + return -1; + } + return 0; +} + +int shard_init(shard_t *shard, int fd) +{ + struct stat sb; + if (fstat(fd, &sb) == -1) { + perror("fstat"); + return -1; + } + shard->file_size = sb.st_size; + shard->addr = mmap(NULL, shard->size, PROT_READ, MAP_PRIVATE, fd, 0); + if (shard->addr == NULL) { + perror("mmap"); + return -1; + } + shard_header_load(&shard->header, fd); + return 0; +} + +int shard_uninit(shard_t *shard) { + return munmap(shard->addr, shard->file_size); +} + +int shard_object_write(shard_t *shard, const char* object, size_t object_size, const char* key) { + // save key & index to later build the hash + shard_index_t* index = &shard->index[shard->index_offset]; + memcpy((void*)&index->key, key, SHARD_KEY_LEN); + index->object_offset = shard->object_offset; + shard->index_offset++; + // write the object + memcpy((void*)(shard->addr + shard->object_offset), (const void*)object, object_size); + shard->object_offset += object_size; +} + +static int io_read(void *data, char **key, cmph_uint32 *keylen) { + shard_t *shard = (shard_t *)data; + *key = shard->index[shard->index_offset].key; + *keylen = SHARD_KEY_LEN; + shard->key_offset += SHARD_KEY_LEN; +} + +static void io_dispose(void* data, char* key, cmph_uint32 keylen) { +} + +static void io_rewind(void *data) { + shard_t* shard = (shard_t*)data; + shard->key_offset = 0; +} + +static cmph_io_adapter_t *io_adapter(shard_t* shard) { + cmph_io_adapter_t* key_source = (cmph_io_adapter_t*)malloc(sizeof(cmph_io_adapter_t)); + if (key_source == NULL) + return NULL + key_source->data = (void*)shard; + key_source->nkeys = shard->header.objects_count; + key_source->read = io_read; + key_source->dispose = io_dispose; + key_source->rewind = io_rewind; + return key_source; +} + +int shard_lookup(shard_t* shard, const char* key, char** object, size_t* object_size) { + cmph_uint32 h = cmph_search(shard->hash, key, SHARD_KEY_LEN); +} + +int shard_hash_create(shard_t *shard) { + cmph_io_adapter_t* source = io_adapter(shard); + cmph_config_t* config = cmph_config_new(source); + cmph_config_set_algo(config, CMPH_CHD_PH); + // cmph_config_set_keys_per_bin + // cmph_config_set_b + shard->hash = cmph_new(config); +} + +int shard_index_save(shard_t* shard) { + shard->header.index_position = shard->header.objects_position + shard->header.objects_size; + cmph_uint32 count = cmph_size(shard->hash); + shard->header.index_size = count * sizeof(size_t); + size_t *index = (size_t*)malloc(shard->header.index_size); + for (i = 0; i < shard->index_offset; i++) { + cmph_uint32 h = cmph_search(shard->hash, shard->index[i].key, SHARD_KEY_LEN); + index[h] = shard->index[i].object_offset; + } + memcpy((void*)(shard->addr + shard->header.index_position), index, shard->header.index_size); +} + +int shard_hash_save(shard_t* shard) { + shard->header.hash_position = shard->header.index_position + shard->header.index_size; + FILE* fd = fopen(path, "r+"); + fseek(fd, shard->header.hash_position, SEEK_SET); + cmph_dump(shard->hash, fd); + fclose(fd); +} + +int shard_hash_load(shard_t* shard) { + FILE* fd = fopen(path, "r"); + fseek(fd, shard->header.hash_position, SEEK_SET); + shard->hash = cmph_load(fd); + fclose(fd); +} + +int shard_save(shard_t* shard) { + if (shard_header_save() < 0) + return -1; + if (shard_index_save() < 0) + return -1; + if (shard_hash_save(shard) < 0) + return -1; + return 0; +} + +int shard_destroy(shard_t* shard) { + FILE* mphf_fd = fopen(path, "a"); + fseek(mphf_fd, shard->header.offset_hash); + cmph_config_destroy(config); + cmph_dump(hash, mphf_fd); + cmph_destroy(hash); + fclose(mphf_fd); +} + +int shard_read(shard_t *shard, char **key, cmph_uint32 *keylen) { + *key = (char *)(shard->data); + *keylen = (cmph_uint32)SHARD_KEY_LEN; + size_t size = *(size_t *)(shard->data + SHARD_KEY_LEN); + shard->offset += SHARD_KEY_LEN + sizeof(size_t) + size; +} + +void shard_hash_rewind(shard_t *shard) { + shard->offset = shard->header.hash_offset; +} + +int shard_keys_create(shard_t* shard, size_t keys_count) { + shard->keys = (shard_key_t*)malloc(keys_count * sizeof(shard_key_t)); +} + +int shard_keys_destroy(shard_t* shard, size_t keys_count) { + free((void*)shard->keys); +} + +shard_t* shard_load_base(char *path) { + int fd = open(path, "r"); + if (fd < 0) { + perror("open"); + return -1; + } + shard_t* shard = (shard_t*)malloc(sizeof(shard_t)); + if (shard == NULL) + return NULL; + if (shard_init(shard, fd) < 0) + return NULL; + return shard; +} + +int shard_reset(shard_t* shard) { + return shard_header_reset(&shard->header); +} + +shard_t* shard_create(char *path) { + shard_t* shard = shard_load(path); + if (shard == NULL) + return NULL; + if (shard_reset(shard) < 0) + return NULL; + return shard; +} + +shard_t* shard_load(char *path) { + shard_t* shard = shard_load(path); + if (shard == NULL) + return NULL; + if (shard_hash_load() < 0) + return NULL; + return shard; +} diff --git a/swh/perfecthash/tests/test_hash.py b/swh/perfecthash/tests/test_hash.py new file mode 100644 --- /dev/null +++ b/swh/perfecthash/tests/test_hash.py @@ -0,0 +1,5 @@ +from _hash_cffi import lib + + +def test_build(): + assert lib.build(b"path") == 0