diff --git a/AUTHORS b/AUTHORS --- a/AUTHORS +++ b/AUTHORS @@ -1,3 +1,3 @@ -Copyright (C) 2019 The Software Heritage developers +Copyright (C) 2021 The Software Heritage developers See http://www.softwareheritage.org/ for more information. diff --git a/MANIFEST.in b/MANIFEST.in --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,6 @@ include Makefile include requirements*.txt include version.txt -include README.md +include README.rst recursive-include swh py.typed +include swh/perfecthash/hash.[ch] diff --git a/docs/index.rst b/docs/index.rst --- a/docs/index.rst +++ b/docs/index.rst @@ -1,4 +1,4 @@ -.. _swh-py-template: +.. _swh-perfecthash: .. include:: README.rst diff --git a/mypy.ini b/mypy.ini --- a/mypy.ini +++ b/mypy.ini @@ -11,5 +11,8 @@ [mypy-pytest.*] ignore_missing_imports = True -# [mypy-add_your_lib_here.*] -# ignore_missing_imports = True +[mypy-_hash_cffi.*] +ignore_missing_imports = True + +[mypy-cffi.*] +ignore_missing_imports = True diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ # should match https://pypi.python.org/pypi names. For the full spec or # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html +cffi diff --git a/setup.py b/setup.py --- a/setup.py +++ b/setup.py @@ -48,9 +48,10 @@ packages=find_packages(), # packages's modules install_requires=parse_requirements(None, "swh"), tests_require=parse_requirements("test"), - setup_requires=["setuptools-scm"], + setup_requires=["setuptools-scm", "cffi"], use_scm_version=True, extras_require={"testing": parse_requirements("test")}, + cffi_modules=["swh/perfecthash/build.py:ffibuilder"], classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", diff --git a/swh/perfecthash/Makefile b/swh/perfecthash/Makefile new file mode 100644 --- /dev/null +++ b/swh/perfecthash/Makefile @@ -0,0 +1,8 @@ +CFLAGS=-Wall -I../.. -g +LDFLAGS=-lcmph -lgtest -lpthread + +hash: hash.o test_hash.o + $(CXX) -o $@ $^ $(LDFLAGS) + +hash.c: hash.h +test_hash.cpp: hash.h diff --git a/swh/perfecthash/__init__.py b/swh/perfecthash/__init__.py --- a/swh/perfecthash/__init__.py +++ b/swh/perfecthash/__init__.py @@ -0,0 +1,30 @@ +from _hash_cffi import lib + + +class Shard: + + def __init__(self, path): + self.shard = lib.shard_init(path.encode('utf-8')) + + def __del__(self): + lib.shard_destroy(self.shard) + + def create(self): + lib.shard_create(self.shard) + return self + + def load(self): + lib.shard_load(self.shard) + return self + + def save(self): + lib.shard_save(self.shard) + + def lookup(self, key): + object_pointer = ffi.new("char**") + object_size_pointer = ffi.new("size_t*") + lib.shard_lookup(self.shard, key, object, object_size) + return ffi.buffer(object_pointer[0], object_size_pointer[0]) + + def write(self, key, object): + lib.shard_object_write(self.shard, key, object, len(object)) diff --git a/swh/perfecthash/build.py b/swh/perfecthash/build.py new file mode 100644 --- /dev/null +++ b/swh/perfecthash/build.py @@ -0,0 +1,35 @@ +from cffi import FFI + +ffibuilder = FFI() + +# cdef() expects a single string declaring the C types, functions and +# globals needed to use the shared object. It must be in valid C syntax. +ffibuilder.cdef( + """ +typedef struct shard_t shard_t; + +shard_t* shard_init(const char* path); +int shard_destroy(shard_t* shard); + +int shard_create(shard_t* shard); +int shard_object_write(shard_t* shard, const char* key, const char* object, size_t object_size); +int shard_save(shard_t* shard); + +int shard_load(shard_t* shard); +int shard_lookup(shard_t* shard, const char* key, char** object, size_t* object_size); + +""" +) + +ffibuilder.set_source( + "_hash_cffi", + """ + #include "swh/perfecthash/hash.h" + """, + sources=["swh/perfecthash/hash.c"], + include_dirs=["."], + libraries=["cmph"], +) # library name, for the linker + +if __name__ == "__main__": + ffibuilder.compile(verbose=True) diff --git a/swh/perfecthash/hash.h b/swh/perfecthash/hash.h new file mode 100644 --- /dev/null +++ b/swh/perfecthash/hash.h @@ -0,0 +1,42 @@ +#include +#include + +#define SHARD_OFFSET_HEADER 512 +#define SHARD_KEY_LEN 32 + +typedef struct { + size_t objects_count; + size_t objects_position; + size_t objects_size; + size_t index_position; + size_t index_size; + size_t hash_position; + size_t hash_size; +} shard_header_t; + +typedef struct { + char key[SHARD_KEY_LEN]; + size_t object_offset; +} shard_index_t; + +typedef struct { + cmph_t* hash; + cmph_config_t* config; + char* addr; + char* path; + shard_index_t* index; + size_t index_offset; + size_t file_size; + shard_header_t header; + size_t object_offset; +} shard_t; + +shard_t* shard_init(const char* path); +int shard_destroy(shard_t* shard); + +int shard_create(shard_t* shard); +int shard_object_write(shard_t* shard, const char* key, const char* object, size_t object_size); +int shard_save(shard_t* shard); + +int shard_load(shard_t* shard); +int shard_lookup(shard_t* shard, const char* key, char** object, size_t* object_size); diff --git a/swh/perfecthash/hash.c b/swh/perfecthash/hash.c new file mode 100644 --- /dev/null +++ b/swh/perfecthash/hash.c @@ -0,0 +1,202 @@ +#include +#include +#include +#include +#include +#include + +#include "swh/perfecthash/hash.h" + +size_t ntohq(size_t v) { + return __builtin_bswap64(v); +} + +size_t htonq(size_t v) { + return __builtin_bswap64(v); +} + +int shard_size_align(size_t s) { + int r = sizeof(size_t); + if (s % r) + return s + r - (s % r); + else + return s; +} + +int shard_header_load(shard_header_t *header, int fd) { + if (lseek(fd, 0, SEEK_SET) < 0) { + perror("lseek"); + return -1; + } + if (read(fd, (void*)header, sizeof(shard_header_t)) < 0) { + perror("read"); + return -1; + } + return 0; +} + +int shard_header_reset(shard_header_t* header) { + memset((void*)header, '\0', sizeof(shard_header_t)); + return 0; +} + +int shard_header_save(shard_t *shard) { + memcpy((void*)shard->addr, (void*)&(shard->header), sizeof(shard_header_t)); + return 0; +} + +int shard_map(shard_t *shard, int flags) +{ + int fd = open(shard->path, flags); + if (fd < 0) { + perror("open"); + return -1; + } + struct stat sb; + if (fstat(fd, &sb) == -1) { + perror("fstat"); + return -1; + } + shard->file_size = sb.st_size; + int prot = flags == O_RDONLY ? PROT_READ : PROT_WRITE; + shard->addr = mmap(NULL, shard->file_size, prot, MAP_SHARED, fd, 0); + if (shard->addr == NULL) { + perror("mmap"); + return -1; + } + shard_header_load(&shard->header, fd); + return 0; +} + +int shard_uninit(shard_t *shard) { + return munmap(shard->addr, shard->file_size); +} + +int shard_object_write(shard_t *shard, const char* key, const char* object, size_t object_size) { + // save key & index to later build the hash + shard_index_t* index = &shard->index[shard->index_offset]; + memcpy((void*)&index->key, key, SHARD_KEY_LEN); + index->object_offset = shard->object_offset; + shard->index_offset++; + // write the object size and the object itself + size_t n_object_size = htonq(object_size); + memcpy((void*)(shard->addr + shard->object_offset), (const void*)&n_object_size, sizeof(size_t)); + memcpy((void*)(shard->addr + shard->object_offset + sizeof(size_t)), (const void*)object, object_size); + shard->object_offset += shard_size_align(sizeof(size_t) + object_size); + shard->header.objects_count++; +} + +static int io_read(void *data, char **key, cmph_uint32 *keylen) { + shard_t *shard = (shard_t *)data; + *key = shard->index[shard->index_offset].key; + *keylen = SHARD_KEY_LEN; + shard->index_offset++; + return shard->index_offset >= shard->header.objects_count ? -1 : 0; +} + +static void io_dispose(void* data, char* key, cmph_uint32 keylen) { +} + +static void io_rewind(void *data) { + shard_t* shard = (shard_t*)data; + shard->index_offset = 0; +} + +static cmph_io_adapter_t *io_adapter(shard_t* shard) { + cmph_io_adapter_t* key_source = (cmph_io_adapter_t*)malloc(sizeof(cmph_io_adapter_t)); + if (key_source == NULL) + return NULL; + key_source->data = (void*)shard; + key_source->nkeys = shard->header.objects_count; + key_source->read = io_read; + key_source->dispose = io_dispose; + key_source->rewind = io_rewind; + return key_source; +} + +int shard_lookup(shard_t* shard, const char* key, char** object, size_t* object_size) { + cmph_uint32 h = cmph_search(shard->hash, key, SHARD_KEY_LEN); + size_t index = ntohq(*(size_t*)(shard->addr + shard->header.index_position + h)); + *object_size = ntohq(*(size_t*)(shard->addr + index)); + *object = shard->addr + index + sizeof(size_t); + return 0; +} + +int shard_hash_create(shard_t *shard) { + cmph_io_adapter_t* source = io_adapter(shard); + shard->config = cmph_config_new(source); + cmph_config_set_algo(shard->config, CMPH_CHD_PH); + // cmph_config_set_keys_per_bin + // cmph_config_set_b + shard->hash = cmph_new(shard->config); +} + +int shard_index_save(shard_t* shard) { + shard->header.index_position = shard_size_align(shard->header.objects_position + shard->header.objects_size); + cmph_uint32 count = cmph_size(shard->hash); + shard->header.index_size = count * sizeof(size_t); + size_t *index = (size_t*)malloc(shard->header.index_size); + for (size_t i = 0; i < shard->index_offset; i++) { + cmph_uint32 h = cmph_search(shard->hash, shard->index[i].key, SHARD_KEY_LEN); + index[h] = htonq(shard->index[i].object_offset); + } + memcpy((void*)(shard->addr + shard->header.index_position), index, shard->header.index_size); + free(index); +} + +int shard_hash_save(shard_t* shard) { + shard->header.hash_position = shard_size_align(shard->header.index_position + shard->header.index_size); + FILE* fd = fopen(shard->path, "r+"); + fseek(fd, shard->header.hash_position, SEEK_SET); + cmph_dump(shard->hash, fd); + fclose(fd); +} + +int shard_hash_load(shard_t* shard) { + FILE* fd = fopen(shard->path, "r"); + fseek(fd, shard->header.hash_position, SEEK_SET); + shard->hash = cmph_load(fd); + fclose(fd); +} + +int shard_save(shard_t* shard) { + if (shard_header_save(shard) < 0) + return -1; + if (shard_index_save(shard) < 0) + return -1; + if (shard_hash_save(shard) < 0) + return -1; + return 0; +} + +int shard_destroy(shard_t* shard) { + cmph_config_destroy(shard->config); + cmph_destroy(shard->hash); + shard_uninit(shard); + free(shard->path); + free(shard); +} + +shard_t* shard_init(const char *path) { + shard_t* shard = (shard_t*)malloc(sizeof(shard_t)); + if (shard == NULL) + return NULL; + shard->path = strdup(path); + return shard; +} + +int shard_reset(shard_t* shard) { + return shard_header_reset(&shard->header); +} + +int shard_create(shard_t* shard) { + if (shard_map(shard, O_RDWR) < 0) + return -1; + return shard_reset(shard); +} + +int shard_load(shard_t* shard) { + if (shard_map(shard, O_RDONLY) < 0) + return -1; + return shard_hash_load(shard); +} diff --git a/swh/perfecthash/test_hash.cpp b/swh/perfecthash/test_hash.cpp new file mode 100644 --- /dev/null +++ b/swh/perfecthash/test_hash.cpp @@ -0,0 +1,13 @@ +#include +extern "C" { +#include "hash.h" +} + +TEST(HashTest, All) { + ASSERT_EQ(2, 2); +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/swh/perfecthash/tests/test_hash.py b/swh/perfecthash/tests/test_hash.py new file mode 100644 --- /dev/null +++ b/swh/perfecthash/tests/test_hash.py @@ -0,0 +1,24 @@ +import os +from swh.perfecthash import Shard + + +def test_all(tmpdir): + f = f'{tmpdir}/shard' + open(f, 'w').close() + os.truncate(f, 10*1024*1024) + + s = Shard(f).create() + keyA = "A"*32 + objectA = "AAAA" + s.write(keyA, objectA) + keyB = "B"*32 + objectB = "BBBB" + s.write(keyB, objectB) + s.save() + del s + + s = Shard(f).load() + assert s.lookup(keyA) == objectA + assert s.lookup(keyB) == objectB + assert s.lookup("UNKNOWN") is None + del s