Page MenuHomeSoftware Heritage

D6424.id23469.diff
No OneTemporary

D6424.id23469.diff

diff --git a/.gitignore b/.gitignore
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,9 @@
version.txt
.mypy_cache/
.vscode/
+*.o
+swh/perfecthash/hash
+swh/perfecthash/hash.gcda
+swh/perfecthash/hash.gcno
+swh/perfecthash/hash.lcov
+swh/perfecthash/html
diff --git a/AUTHORS b/AUTHORS
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,3 +1,3 @@
-Copyright (C) 2019 The Software Heritage developers
+Copyright (C) 2021 The Software Heritage developers
See http://www.softwareheritage.org/ for more information.
diff --git a/MANIFEST.in b/MANIFEST.in
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,5 +1,6 @@
include Makefile
include requirements*.txt
include version.txt
-include README.md
+include README.rst
recursive-include swh py.typed
+include swh/perfecthash/hash.[ch]
diff --git a/docs/index.rst b/docs/index.rst
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,4 +1,4 @@
-.. _swh-py-template:
+.. _swh-perfecthash:
.. include:: README.rst
diff --git a/mypy.ini b/mypy.ini
--- a/mypy.ini
+++ b/mypy.ini
@@ -11,5 +11,8 @@
[mypy-pytest.*]
ignore_missing_imports = True
-# [mypy-add_your_lib_here.*]
-# ignore_missing_imports = True
+[mypy-_hash_cffi.*]
+ignore_missing_imports = True
+
+[mypy-cffi.*]
+ignore_missing_imports = True
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,4 @@
# should match https://pypi.python.org/pypi names. For the full spec or
# dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html
+cffi
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -48,9 +48,10 @@
packages=find_packages(), # packages's modules
install_requires=parse_requirements(None, "swh"),
tests_require=parse_requirements("test"),
- setup_requires=["setuptools-scm"],
+ setup_requires=["setuptools-scm", "cffi"],
use_scm_version=True,
extras_require={"testing": parse_requirements("test")},
+ cffi_modules=["swh/perfecthash/build.py:ffibuilder"],
classifiers=[
"Programming Language :: Python :: 3",
"Intended Audience :: Developers",
diff --git a/swh/perfecthash/.clang-format b/swh/perfecthash/.clang-format
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/.clang-format
@@ -0,0 +1,2 @@
+---
+IndentWidth: 4
diff --git a/swh/perfecthash/Makefile b/swh/perfecthash/Makefile
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/Makefile
@@ -0,0 +1,20 @@
+CFLAGS=-DHASH_DEBUG -Wall -I../.. -g -std=c++17 -fprofile-arcs -ftest-coverage
+LDFLAGS=-lcmph -lgtest -lpthread -lstdc++ -lstdc++fs -fprofile-arcs -ftest-coverage
+
+hash: hash.o test_hash.o
+ $(CXX) -o $@ $^ $(LDFLAGS)
+
+hash.c: hash.h
+test_hash.cpp: hash.h
+
+format:
+ clang-format -i hash.c hash.h test_hash.cpp
+
+check: hash
+ valgrind --leak-check=full --tool=memcheck ./hash
+ lcov -d . -c -o hash.lcov
+ rm -fr html ; genhtml -o html hash.lcov
+ firefox html/perfecthash/hash.c.gcov.html
+
+clean:
+ rm -f *.o hash
diff --git a/swh/perfecthash/__init__.py b/swh/perfecthash/__init__.py
--- a/swh/perfecthash/__init__.py
+++ b/swh/perfecthash/__init__.py
@@ -0,0 +1,30 @@
+from _hash_cffi import lib
+
+
+class Shard:
+
+ def __init__(self, path):
+ self.shard = lib.shard_init(path.encode('utf-8'))
+
+ def __del__(self):
+ lib.shard_destroy(self.shard)
+
+ def create(self):
+ lib.shard_create(self.shard)
+ return self
+
+ def load(self):
+ lib.shard_load(self.shard)
+ return self
+
+ def save(self):
+ lib.shard_save(self.shard)
+
+ def lookup(self, key):
+ object_pointer = ffi.new("char**")
+ object_size_pointer = ffi.new("size_t*")
+ lib.shard_lookup(self.shard, key, object, object_size)
+ return ffi.buffer(object_pointer[0], object_size_pointer[0])
+
+ def write(self, key, object):
+ lib.shard_object_write(self.shard, key, object, len(object))
diff --git a/swh/perfecthash/build.py b/swh/perfecthash/build.py
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/build.py
@@ -0,0 +1,35 @@
+from cffi import FFI
+
+ffibuilder = FFI()
+
+# cdef() expects a single string declaring the C types, functions and
+# globals needed to use the shared object. It must be in valid C syntax.
+ffibuilder.cdef(
+ """
+typedef struct shard_t shard_t;
+
+shard_t* shard_init(const char* path);
+int shard_destroy(shard_t* shard);
+
+int shard_create(shard_t* shard);
+int shard_object_write(shard_t* shard, const char* key, const char* object, size_t object_size);
+int shard_save(shard_t* shard);
+
+int shard_load(shard_t* shard);
+int shard_lookup(shard_t* shard, const char* key, char** object, size_t* object_size);
+
+"""
+)
+
+ffibuilder.set_source(
+ "_hash_cffi",
+ """
+ #include "swh/perfecthash/hash.h"
+ """,
+ sources=["swh/perfecthash/hash.c"],
+ include_dirs=["."],
+ libraries=["cmph"],
+) # library name, for the linker
+
+if __name__ == "__main__":
+ ffibuilder.compile(verbose=True)
diff --git a/swh/perfecthash/hash.h b/swh/perfecthash/hash.h
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/hash.h
@@ -0,0 +1,44 @@
+#include <cmph.h>
+#include <cmph_types.h>
+
+#define SHARD_OFFSET_HEADER 512
+#define SHARD_KEY_LEN 32
+
+typedef struct {
+ size_t objects_count;
+ size_t objects_position;
+ size_t objects_size;
+ size_t index_position;
+ size_t index_size;
+ size_t hash_position;
+} shard_header_t;
+
+typedef struct {
+ char key[SHARD_KEY_LEN];
+ size_t object_offset;
+} shard_index_t;
+
+typedef struct {
+ cmph_io_adapter_t *source;
+ cmph_config_t *config;
+ cmph_t *hash;
+ char *addr;
+ char *path;
+ shard_index_t *index;
+ size_t index_offset;
+ size_t file_size;
+ shard_header_t header;
+ size_t object_offset;
+} shard_t;
+
+shard_t *shard_init(const char *path);
+int shard_destroy(shard_t *shard);
+
+int shard_create(shard_t *shard, size_t objects_count);
+int shard_object_write(shard_t *shard, const char *key, const char *object,
+ size_t object_size);
+int shard_save(shard_t *shard);
+
+int shard_load(shard_t *shard);
+int shard_lookup(shard_t *shard, const char *key, char **object,
+ size_t *object_size);
diff --git a/swh/perfecthash/hash.c b/swh/perfecthash/hash.c
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/hash.c
@@ -0,0 +1,255 @@
+#include <assert.h>
+#include <fcntl.h>
+#include <memory.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "swh/perfecthash/hash.h"
+
+#ifdef HASH_DEBUG
+#define debug(...) printf(__VA_ARGS__)
+#else
+#define debug(...)
+#endif
+
+size_t ntohq(size_t v) { return __builtin_bswap64(v); }
+
+size_t htonq(size_t v) { return __builtin_bswap64(v); }
+
+int shard_size_align(size_t s) {
+ int r = sizeof(size_t);
+ if (s % r)
+ return s + r - (s % r);
+ else
+ return s;
+}
+
+int shard_header_load(shard_header_t *header, int fd) {
+ if (lseek(fd, 0, SEEK_SET) < 0) {
+ perror("lseek");
+ return -1;
+ }
+ if (read(fd, (void *)header, sizeof(shard_header_t)) < 0) {
+ perror("read");
+ return -1;
+ }
+ return 0;
+}
+
+int shard_header_reset(shard_header_t *header) {
+ memset((void *)header, '\0', sizeof(shard_header_t));
+ header->objects_position = SHARD_OFFSET_HEADER;
+ return 0;
+}
+
+int shard_header_save(shard_t *shard) {
+ memcpy((void *)shard->addr, (void *)&(shard->header),
+ sizeof(shard_header_t));
+ return 0;
+}
+
+int shard_map(shard_t *shard, int flags) {
+ int fd = open(shard->path, flags);
+ if (fd < 0) {
+ perror("open");
+ return -1;
+ }
+ struct stat sb;
+ if (fstat(fd, &sb) == -1) {
+ perror("fstat");
+ return -1;
+ }
+ shard->file_size = sb.st_size;
+ int prot = flags == O_RDONLY ? PROT_READ : PROT_WRITE;
+ shard->addr = mmap(NULL, shard->file_size, prot, MAP_SHARED, fd, 0);
+ if (shard->addr == NULL) {
+ perror("mmap");
+ return -1;
+ }
+ if (shard_header_load(&shard->header, fd) < 0)
+ return -1;
+ if (close(fd) < 0) {
+ perror("close");
+ return -1;
+ }
+ return 0;
+}
+
+int shard_uninit(shard_t *shard) {
+ if (shard->addr)
+ return munmap(shard->addr, shard->file_size);
+ else
+ return 0;
+}
+
+int shard_object_write(shard_t *shard, const char *key, const char *object,
+ size_t object_size) {
+ // save key & index to later build the hash
+ shard_index_t *index = &shard->index[shard->index_offset];
+ memcpy((void *)index->key, key, SHARD_KEY_LEN);
+ index->object_offset = shard->object_offset;
+ shard->index_offset++;
+ // write the object size and the object itself
+ size_t n_object_size = htonq(object_size);
+ debug("shard_object_write: object_size = %ld n_object_size = %ld\n",
+ object_size, n_object_size);
+ size_t object_offset =
+ shard->header.objects_position + shard->object_offset;
+ debug("shard_object_write: object_offset = %ld\n", object_offset);
+ memcpy((void *)(shard->addr + object_offset), (const void *)&n_object_size,
+ sizeof(size_t));
+ memcpy((void *)(shard->addr + object_offset + sizeof(size_t)),
+ (const void *)object, object_size);
+ shard->object_offset += shard_size_align(sizeof(size_t) + object_size);
+ return 0;
+}
+
+static int io_read(void *data, char **key, cmph_uint32 *keylen) {
+ shard_t *shard = (shard_t *)data;
+ *key = shard->index[shard->index_offset].key;
+ *keylen = SHARD_KEY_LEN;
+ shard->index_offset++;
+ return shard->index_offset >= shard->header.objects_count ? -1 : 0;
+}
+
+static void io_dispose(void *data, char *key, cmph_uint32 keylen) {}
+
+static void io_rewind(void *data) {
+ shard_t *shard = (shard_t *)data;
+ shard->index_offset = 0;
+}
+
+static cmph_io_adapter_t *io_adapter(shard_t *shard) {
+ cmph_io_adapter_t *key_source =
+ (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t));
+ if (key_source == NULL)
+ return NULL;
+ key_source->data = (void *)shard;
+ key_source->nkeys = shard->header.objects_count;
+ key_source->read = io_read;
+ key_source->dispose = io_dispose;
+ key_source->rewind = io_rewind;
+ return key_source;
+}
+
+int shard_lookup(shard_t *shard, const char *key, char **object,
+ size_t *object_size) {
+ cmph_uint32 h = cmph_search(shard->hash, key, SHARD_KEY_LEN);
+ debug("shard_lookup: h = %d\n", h);
+ size_t index_offset = shard->header.index_position + h * sizeof(size_t);
+ debug("shard_lookup: index_offset = %ld\n", index_offset);
+ size_t object_offset = shard->header.objects_position +
+ ntohq(*(size_t *)(shard->addr + index_offset));
+ debug("shard_lookup: object_offset = %ld\n", object_offset);
+ *object_size = ntohq(*(size_t *)(shard->addr + object_offset));
+ debug("shard_lookup: object_size = %ld\n", *object_size);
+ *object = shard->addr + object_offset + sizeof(size_t);
+ return 0;
+}
+
+int shard_hash_create(shard_t *shard) {
+ shard->source = io_adapter(shard);
+ shard->config = cmph_config_new(shard->source);
+ cmph_config_set_algo(shard->config, CMPH_CHD_PH);
+ cmph_config_set_keys_per_bin(shard->config, 1);
+ cmph_config_set_b(shard->config, 4);
+ shard->hash = cmph_new(shard->config);
+ return 0;
+}
+
+int shard_index_save(shard_t *shard) {
+ shard->header.index_position = shard_size_align(
+ shard->header.objects_position + shard->header.objects_size);
+ cmph_uint32 count = cmph_size(shard->hash);
+ debug("shard_index_save: count = %d\n", count);
+ shard->header.index_size = count * sizeof(size_t);
+ size_t *index = (size_t *)malloc(shard->header.index_size);
+ for (size_t i = 0; i < shard->index_offset; i++) {
+ cmph_uint32 h =
+ cmph_search(shard->hash, shard->index[i].key, SHARD_KEY_LEN);
+ debug("shard_index_save: i = %ld, h = %d, offset = %ld\n", i, h,
+ shard->index[i].object_offset);
+ index[h] = htonq(shard->index[i].object_offset);
+ }
+ memcpy((void *)(shard->addr + shard->header.index_position), index,
+ shard->header.index_size);
+ free(index);
+ return 0;
+}
+
+int shard_hash_save(shard_t *shard) {
+ shard->header.hash_position = shard_size_align(
+ shard->header.index_position + shard->header.index_size);
+ FILE *fd = fopen(shard->path, "r+");
+ fseek(fd, shard->header.hash_position, SEEK_SET);
+ cmph_dump(shard->hash, fd);
+ fclose(fd);
+ return 0;
+}
+
+int shard_hash_load(shard_t *shard) {
+ FILE *fd = fopen(shard->path, "r");
+ fseek(fd, shard->header.hash_position, SEEK_SET);
+ shard->hash = cmph_load(fd);
+ assert(shard->hash != NULL);
+ fclose(fd);
+ return 0;
+}
+
+int shard_save(shard_t *shard) {
+ shard->header.objects_size = shard->object_offset;
+ if (shard_hash_create(shard) < 0)
+ return -1;
+ if (shard_index_save(shard) < 0)
+ return -1;
+ if (shard_hash_save(shard) < 0)
+ return -1;
+ if (shard_header_save(shard) < 0)
+ return -1;
+ return 0;
+}
+
+int shard_destroy(shard_t *shard) {
+ if (shard->source)
+ free(shard->source);
+ if (shard->config)
+ cmph_config_destroy(shard->config);
+ if (shard->hash)
+ cmph_destroy(shard->hash);
+ shard_uninit(shard);
+ if (shard->index)
+ free(shard->index);
+ free(shard->path);
+ free(shard);
+ return 0;
+}
+
+shard_t *shard_init(const char *path) {
+ shard_t *shard = (shard_t *)malloc(sizeof(shard_t));
+ if (shard == NULL)
+ return NULL;
+ memset((void *)shard, '\0', sizeof(shard_t));
+ shard->path = strdup(path);
+ return shard;
+}
+
+int shard_reset(shard_t *shard) { return shard_header_reset(&shard->header); }
+
+int shard_create(shard_t *shard, size_t objects_count) {
+ if (shard_map(shard, O_RDWR) < 0)
+ return -1;
+ if (shard_reset(shard) < 0)
+ return -1;
+ shard->header.objects_count = objects_count;
+ shard->index =
+ (shard_index_t *)malloc(sizeof(shard_index_t) * objects_count);
+ return 0;
+}
+
+int shard_load(shard_t *shard) {
+ if (shard_map(shard, O_RDONLY) < 0)
+ return -1;
+ return shard_hash_load(shard);
+}
diff --git a/swh/perfecthash/test_hash.cpp b/swh/perfecthash/test_hash.cpp
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/test_hash.cpp
@@ -0,0 +1,136 @@
+#include <experimental/filesystem>
+#include <fcntl.h>
+#include <gtest/gtest.h>
+#include <random>
+#include <sys/types.h>
+#include <unistd.h>
+
+extern "C" {
+#include "hash.h"
+}
+
+using namespace std::experimental;
+
+filesystem::path
+create_temporary_directory(unsigned long long max_tries = 1000) {
+ auto tmp_dir = filesystem::temp_directory_path();
+ unsigned long long i = 0;
+ std::random_device dev;
+ std::mt19937 prng(dev());
+ std::uniform_int_distribution<int> rand(0);
+ filesystem::path path;
+ while (true) {
+ std::stringstream ss;
+ ss << std::hex << rand(prng);
+ path = tmp_dir / ss.str();
+ // true if the directory was created.
+ if (filesystem::create_directory(path)) {
+ break;
+ }
+ if (i == max_tries) {
+ throw std::runtime_error("could not find non-existing directory");
+ }
+ i++;
+ }
+ return path;
+}
+
+std::string gen_random(const int len) {
+
+ std::string tmp_s;
+ static const char alphanum[] = "0123456789"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz";
+
+ tmp_s.reserve(len);
+ std::random_device dev;
+ std::mt19937 prng(dev());
+ std::uniform_int_distribution<int> rand(0, strlen(alphanum) - 1);
+
+ for (int i = 0; i < len; ++i)
+ tmp_s += alphanum[rand(prng)];
+
+ return tmp_s;
+}
+
+TEST(HashTest, One) {
+ auto tmpdir = create_temporary_directory();
+ filesystem::path tmpfile = tmpdir / std::string("shard");
+ ASSERT_GE(close(open(tmpfile.c_str(), O_CREAT, 0777)), 0);
+ ASSERT_GE(truncate(tmpfile.c_str(), 10 * 1024 * 1024), 0);
+
+ shard_t *shard = shard_init(tmpfile.c_str());
+ ASSERT_NE(shard, nullptr);
+ ASSERT_GE(shard_create(shard, 1), 0);
+ const char *keyA = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
+ const char *objectA = "AAAA";
+ size_t objectA_size = strlen(objectA);
+ ASSERT_GE(shard_object_write(shard, keyA, objectA, objectA_size), 0);
+ ASSERT_GE(shard_save(shard), 0);
+ char *found = NULL;
+ size_t found_size = 0;
+ ASSERT_GE(shard_lookup(shard, keyA, &found, &found_size), 0);
+ ASSERT_EQ(objectA_size, found_size);
+ ASSERT_EQ(memcmp((const void *)objectA, (const void *)found, found_size),
+ 0);
+ ASSERT_GE(shard_destroy(shard), 0);
+
+ shard = shard_init(tmpfile.c_str());
+ ASSERT_NE(shard, nullptr);
+ ASSERT_GE(shard_load(shard), 0);
+ found = NULL;
+ found_size = 0;
+ ASSERT_GE(shard_lookup(shard, keyA, &found, &found_size), 0);
+ ASSERT_EQ(objectA_size, found_size);
+ ASSERT_EQ(memcmp((const void *)objectA, (const void *)found, found_size),
+ 0);
+ ASSERT_GE(shard_destroy(shard), 0);
+
+ filesystem::remove_all(tmpdir);
+}
+
+TEST(HashTest, Many) {
+ auto tmpdir = create_temporary_directory();
+ filesystem::path tmpfile = tmpdir / std::string("shard");
+ ASSERT_GE(close(open(tmpfile.c_str(), O_CREAT, 0777)), 0);
+ ASSERT_GE(truncate(tmpfile.c_str(), 10 * 1024 * 1024), 0);
+
+ std::map<std::string, std::string> key2object;
+
+ shard_t *shard = shard_init(tmpfile.c_str());
+ ASSERT_NE(shard, nullptr);
+ int objects_count = 10;
+ ASSERT_GE(shard_create(shard, objects_count), 0);
+ for (int i = 0; i < objects_count; i++) {
+ std::string key = gen_random(32);
+ std::string object = gen_random(50);
+ key2object[key] = object;
+ std::cout << key << std::endl;
+ ASSERT_GE(shard_object_write(shard, key.c_str(), object.c_str(),
+ object.length()),
+ 0);
+ }
+ ASSERT_GE(shard_save(shard), 0);
+ ASSERT_GE(shard_destroy(shard), 0);
+
+ shard = shard_init(tmpfile.c_str());
+ ASSERT_NE(shard, nullptr);
+ ASSERT_GE(shard_load(shard), 0);
+ for (std::pair<std::string, std::string> p : key2object) {
+ char *found = NULL;
+ size_t found_size = 0;
+ ASSERT_GE(shard_lookup(shard, p.first.c_str(), &found, &found_size), 0);
+ ASSERT_EQ(p.second.length(), found_size);
+ ASSERT_EQ(memcmp((const void *)p.second.c_str(), (const void *)found,
+ found_size),
+ 0);
+ }
+ ASSERT_GE(shard_destroy(shard), 0);
+
+ filesystem::remove_all(tmpdir);
+}
+
+int main(int argc, char **argv) {
+ testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/swh/perfecthash/tests/test_hash.py b/swh/perfecthash/tests/test_hash.py
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/tests/test_hash.py
@@ -0,0 +1,24 @@
+import os
+from swh.perfecthash import Shard
+
+
+def test_all(tmpdir):
+ f = f'{tmpdir}/shard'
+ open(f, 'w').close()
+ os.truncate(f, 10*1024*1024)
+
+ s = Shard(f).create()
+ keyA = "A"*32
+ objectA = "AAAA"
+ s.write(keyA, objectA)
+ keyB = "B"*32
+ objectB = "BBBB"
+ s.write(keyB, objectB)
+ s.save()
+ del s
+
+ s = Shard(f).load()
+ assert s.lookup(keyA) == objectA
+ assert s.lookup(keyB) == objectB
+ assert s.lookup("UNKNOWN") is None
+ del s

File Metadata

Mime Type
text/plain
Expires
Sun, Aug 24, 5:56 PM (6 d, 3 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3226946

Event Timeline