Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9749545
D6424.id23469.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
19 KB
Subscribers
None
D6424.id23469.diff
View Options
diff --git a/.gitignore b/.gitignore
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,9 @@
version.txt
.mypy_cache/
.vscode/
+*.o
+swh/perfecthash/hash
+swh/perfecthash/hash.gcda
+swh/perfecthash/hash.gcno
+swh/perfecthash/hash.lcov
+swh/perfecthash/html
diff --git a/AUTHORS b/AUTHORS
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,3 +1,3 @@
-Copyright (C) 2019 The Software Heritage developers
+Copyright (C) 2021 The Software Heritage developers
See http://www.softwareheritage.org/ for more information.
diff --git a/MANIFEST.in b/MANIFEST.in
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,5 +1,6 @@
include Makefile
include requirements*.txt
include version.txt
-include README.md
+include README.rst
recursive-include swh py.typed
+include swh/perfecthash/hash.[ch]
diff --git a/docs/index.rst b/docs/index.rst
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,4 +1,4 @@
-.. _swh-py-template:
+.. _swh-perfecthash:
.. include:: README.rst
diff --git a/mypy.ini b/mypy.ini
--- a/mypy.ini
+++ b/mypy.ini
@@ -11,5 +11,8 @@
[mypy-pytest.*]
ignore_missing_imports = True
-# [mypy-add_your_lib_here.*]
-# ignore_missing_imports = True
+[mypy-_hash_cffi.*]
+ignore_missing_imports = True
+
+[mypy-cffi.*]
+ignore_missing_imports = True
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,4 @@
# should match https://pypi.python.org/pypi names. For the full spec or
# dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html
+cffi
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -48,9 +48,10 @@
packages=find_packages(), # packages's modules
install_requires=parse_requirements(None, "swh"),
tests_require=parse_requirements("test"),
- setup_requires=["setuptools-scm"],
+ setup_requires=["setuptools-scm", "cffi"],
use_scm_version=True,
extras_require={"testing": parse_requirements("test")},
+ cffi_modules=["swh/perfecthash/build.py:ffibuilder"],
classifiers=[
"Programming Language :: Python :: 3",
"Intended Audience :: Developers",
diff --git a/swh/perfecthash/.clang-format b/swh/perfecthash/.clang-format
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/.clang-format
@@ -0,0 +1,2 @@
+---
+IndentWidth: 4
diff --git a/swh/perfecthash/Makefile b/swh/perfecthash/Makefile
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/Makefile
@@ -0,0 +1,20 @@
+CFLAGS=-DHASH_DEBUG -Wall -I../.. -g -std=c++17 -fprofile-arcs -ftest-coverage
+LDFLAGS=-lcmph -lgtest -lpthread -lstdc++ -lstdc++fs -fprofile-arcs -ftest-coverage
+
+hash: hash.o test_hash.o
+ $(CXX) -o $@ $^ $(LDFLAGS)
+
+hash.c: hash.h
+test_hash.cpp: hash.h
+
+format:
+ clang-format -i hash.c hash.h test_hash.cpp
+
+check: hash
+ valgrind --leak-check=full --tool=memcheck ./hash
+ lcov -d . -c -o hash.lcov
+ rm -fr html ; genhtml -o html hash.lcov
+ firefox html/perfecthash/hash.c.gcov.html
+
+clean:
+ rm -f *.o hash
diff --git a/swh/perfecthash/__init__.py b/swh/perfecthash/__init__.py
--- a/swh/perfecthash/__init__.py
+++ b/swh/perfecthash/__init__.py
@@ -0,0 +1,30 @@
+from _hash_cffi import lib
+
+
+class Shard:
+
+ def __init__(self, path):
+ self.shard = lib.shard_init(path.encode('utf-8'))
+
+ def __del__(self):
+ lib.shard_destroy(self.shard)
+
+ def create(self):
+ lib.shard_create(self.shard)
+ return self
+
+ def load(self):
+ lib.shard_load(self.shard)
+ return self
+
+ def save(self):
+ lib.shard_save(self.shard)
+
+ def lookup(self, key):
+ object_pointer = ffi.new("char**")
+ object_size_pointer = ffi.new("size_t*")
+ lib.shard_lookup(self.shard, key, object, object_size)
+ return ffi.buffer(object_pointer[0], object_size_pointer[0])
+
+ def write(self, key, object):
+ lib.shard_object_write(self.shard, key, object, len(object))
diff --git a/swh/perfecthash/build.py b/swh/perfecthash/build.py
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/build.py
@@ -0,0 +1,35 @@
+from cffi import FFI
+
+ffibuilder = FFI()
+
+# cdef() expects a single string declaring the C types, functions and
+# globals needed to use the shared object. It must be in valid C syntax.
+ffibuilder.cdef(
+ """
+typedef struct shard_t shard_t;
+
+shard_t* shard_init(const char* path);
+int shard_destroy(shard_t* shard);
+
+int shard_create(shard_t* shard);
+int shard_object_write(shard_t* shard, const char* key, const char* object, size_t object_size);
+int shard_save(shard_t* shard);
+
+int shard_load(shard_t* shard);
+int shard_lookup(shard_t* shard, const char* key, char** object, size_t* object_size);
+
+"""
+)
+
+ffibuilder.set_source(
+ "_hash_cffi",
+ """
+ #include "swh/perfecthash/hash.h"
+ """,
+ sources=["swh/perfecthash/hash.c"],
+ include_dirs=["."],
+ libraries=["cmph"],
+) # library name, for the linker
+
+if __name__ == "__main__":
+ ffibuilder.compile(verbose=True)
diff --git a/swh/perfecthash/hash.h b/swh/perfecthash/hash.h
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/hash.h
@@ -0,0 +1,44 @@
+#include <cmph.h>
+#include <cmph_types.h>
+
+#define SHARD_OFFSET_HEADER 512
+#define SHARD_KEY_LEN 32
+
+typedef struct {
+ size_t objects_count;
+ size_t objects_position;
+ size_t objects_size;
+ size_t index_position;
+ size_t index_size;
+ size_t hash_position;
+} shard_header_t;
+
+typedef struct {
+ char key[SHARD_KEY_LEN];
+ size_t object_offset;
+} shard_index_t;
+
+typedef struct {
+ cmph_io_adapter_t *source;
+ cmph_config_t *config;
+ cmph_t *hash;
+ char *addr;
+ char *path;
+ shard_index_t *index;
+ size_t index_offset;
+ size_t file_size;
+ shard_header_t header;
+ size_t object_offset;
+} shard_t;
+
+shard_t *shard_init(const char *path);
+int shard_destroy(shard_t *shard);
+
+int shard_create(shard_t *shard, size_t objects_count);
+int shard_object_write(shard_t *shard, const char *key, const char *object,
+ size_t object_size);
+int shard_save(shard_t *shard);
+
+int shard_load(shard_t *shard);
+int shard_lookup(shard_t *shard, const char *key, char **object,
+ size_t *object_size);
diff --git a/swh/perfecthash/hash.c b/swh/perfecthash/hash.c
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/hash.c
@@ -0,0 +1,255 @@
+#include <assert.h>
+#include <fcntl.h>
+#include <memory.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "swh/perfecthash/hash.h"
+
+#ifdef HASH_DEBUG
+#define debug(...) printf(__VA_ARGS__)
+#else
+#define debug(...)
+#endif
+
+size_t ntohq(size_t v) { return __builtin_bswap64(v); }
+
+size_t htonq(size_t v) { return __builtin_bswap64(v); }
+
+int shard_size_align(size_t s) {
+ int r = sizeof(size_t);
+ if (s % r)
+ return s + r - (s % r);
+ else
+ return s;
+}
+
+int shard_header_load(shard_header_t *header, int fd) {
+ if (lseek(fd, 0, SEEK_SET) < 0) {
+ perror("lseek");
+ return -1;
+ }
+ if (read(fd, (void *)header, sizeof(shard_header_t)) < 0) {
+ perror("read");
+ return -1;
+ }
+ return 0;
+}
+
+int shard_header_reset(shard_header_t *header) {
+ memset((void *)header, '\0', sizeof(shard_header_t));
+ header->objects_position = SHARD_OFFSET_HEADER;
+ return 0;
+}
+
+int shard_header_save(shard_t *shard) {
+ memcpy((void *)shard->addr, (void *)&(shard->header),
+ sizeof(shard_header_t));
+ return 0;
+}
+
+int shard_map(shard_t *shard, int flags) {
+ int fd = open(shard->path, flags);
+ if (fd < 0) {
+ perror("open");
+ return -1;
+ }
+ struct stat sb;
+ if (fstat(fd, &sb) == -1) {
+ perror("fstat");
+ return -1;
+ }
+ shard->file_size = sb.st_size;
+ int prot = flags == O_RDONLY ? PROT_READ : PROT_WRITE;
+ shard->addr = mmap(NULL, shard->file_size, prot, MAP_SHARED, fd, 0);
+ if (shard->addr == NULL) {
+ perror("mmap");
+ return -1;
+ }
+ if (shard_header_load(&shard->header, fd) < 0)
+ return -1;
+ if (close(fd) < 0) {
+ perror("close");
+ return -1;
+ }
+ return 0;
+}
+
+int shard_uninit(shard_t *shard) {
+ if (shard->addr)
+ return munmap(shard->addr, shard->file_size);
+ else
+ return 0;
+}
+
+int shard_object_write(shard_t *shard, const char *key, const char *object,
+ size_t object_size) {
+ // save key & index to later build the hash
+ shard_index_t *index = &shard->index[shard->index_offset];
+ memcpy((void *)index->key, key, SHARD_KEY_LEN);
+ index->object_offset = shard->object_offset;
+ shard->index_offset++;
+ // write the object size and the object itself
+ size_t n_object_size = htonq(object_size);
+ debug("shard_object_write: object_size = %ld n_object_size = %ld\n",
+ object_size, n_object_size);
+ size_t object_offset =
+ shard->header.objects_position + shard->object_offset;
+ debug("shard_object_write: object_offset = %ld\n", object_offset);
+ memcpy((void *)(shard->addr + object_offset), (const void *)&n_object_size,
+ sizeof(size_t));
+ memcpy((void *)(shard->addr + object_offset + sizeof(size_t)),
+ (const void *)object, object_size);
+ shard->object_offset += shard_size_align(sizeof(size_t) + object_size);
+ return 0;
+}
+
+static int io_read(void *data, char **key, cmph_uint32 *keylen) {
+ shard_t *shard = (shard_t *)data;
+ *key = shard->index[shard->index_offset].key;
+ *keylen = SHARD_KEY_LEN;
+ shard->index_offset++;
+ return shard->index_offset >= shard->header.objects_count ? -1 : 0;
+}
+
+static void io_dispose(void *data, char *key, cmph_uint32 keylen) {}
+
+static void io_rewind(void *data) {
+ shard_t *shard = (shard_t *)data;
+ shard->index_offset = 0;
+}
+
+static cmph_io_adapter_t *io_adapter(shard_t *shard) {
+ cmph_io_adapter_t *key_source =
+ (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t));
+ if (key_source == NULL)
+ return NULL;
+ key_source->data = (void *)shard;
+ key_source->nkeys = shard->header.objects_count;
+ key_source->read = io_read;
+ key_source->dispose = io_dispose;
+ key_source->rewind = io_rewind;
+ return key_source;
+}
+
+int shard_lookup(shard_t *shard, const char *key, char **object,
+ size_t *object_size) {
+ cmph_uint32 h = cmph_search(shard->hash, key, SHARD_KEY_LEN);
+ debug("shard_lookup: h = %d\n", h);
+ size_t index_offset = shard->header.index_position + h * sizeof(size_t);
+ debug("shard_lookup: index_offset = %ld\n", index_offset);
+ size_t object_offset = shard->header.objects_position +
+ ntohq(*(size_t *)(shard->addr + index_offset));
+ debug("shard_lookup: object_offset = %ld\n", object_offset);
+ *object_size = ntohq(*(size_t *)(shard->addr + object_offset));
+ debug("shard_lookup: object_size = %ld\n", *object_size);
+ *object = shard->addr + object_offset + sizeof(size_t);
+ return 0;
+}
+
+int shard_hash_create(shard_t *shard) {
+ shard->source = io_adapter(shard);
+ shard->config = cmph_config_new(shard->source);
+ cmph_config_set_algo(shard->config, CMPH_CHD_PH);
+ cmph_config_set_keys_per_bin(shard->config, 1);
+ cmph_config_set_b(shard->config, 4);
+ shard->hash = cmph_new(shard->config);
+ return 0;
+}
+
+int shard_index_save(shard_t *shard) {
+ shard->header.index_position = shard_size_align(
+ shard->header.objects_position + shard->header.objects_size);
+ cmph_uint32 count = cmph_size(shard->hash);
+ debug("shard_index_save: count = %d\n", count);
+ shard->header.index_size = count * sizeof(size_t);
+ size_t *index = (size_t *)malloc(shard->header.index_size);
+ for (size_t i = 0; i < shard->index_offset; i++) {
+ cmph_uint32 h =
+ cmph_search(shard->hash, shard->index[i].key, SHARD_KEY_LEN);
+ debug("shard_index_save: i = %ld, h = %d, offset = %ld\n", i, h,
+ shard->index[i].object_offset);
+ index[h] = htonq(shard->index[i].object_offset);
+ }
+ memcpy((void *)(shard->addr + shard->header.index_position), index,
+ shard->header.index_size);
+ free(index);
+ return 0;
+}
+
+int shard_hash_save(shard_t *shard) {
+ shard->header.hash_position = shard_size_align(
+ shard->header.index_position + shard->header.index_size);
+ FILE *fd = fopen(shard->path, "r+");
+ fseek(fd, shard->header.hash_position, SEEK_SET);
+ cmph_dump(shard->hash, fd);
+ fclose(fd);
+ return 0;
+}
+
+int shard_hash_load(shard_t *shard) {
+ FILE *fd = fopen(shard->path, "r");
+ fseek(fd, shard->header.hash_position, SEEK_SET);
+ shard->hash = cmph_load(fd);
+ assert(shard->hash != NULL);
+ fclose(fd);
+ return 0;
+}
+
+int shard_save(shard_t *shard) {
+ shard->header.objects_size = shard->object_offset;
+ if (shard_hash_create(shard) < 0)
+ return -1;
+ if (shard_index_save(shard) < 0)
+ return -1;
+ if (shard_hash_save(shard) < 0)
+ return -1;
+ if (shard_header_save(shard) < 0)
+ return -1;
+ return 0;
+}
+
+int shard_destroy(shard_t *shard) {
+ if (shard->source)
+ free(shard->source);
+ if (shard->config)
+ cmph_config_destroy(shard->config);
+ if (shard->hash)
+ cmph_destroy(shard->hash);
+ shard_uninit(shard);
+ if (shard->index)
+ free(shard->index);
+ free(shard->path);
+ free(shard);
+ return 0;
+}
+
+shard_t *shard_init(const char *path) {
+ shard_t *shard = (shard_t *)malloc(sizeof(shard_t));
+ if (shard == NULL)
+ return NULL;
+ memset((void *)shard, '\0', sizeof(shard_t));
+ shard->path = strdup(path);
+ return shard;
+}
+
+int shard_reset(shard_t *shard) { return shard_header_reset(&shard->header); }
+
+int shard_create(shard_t *shard, size_t objects_count) {
+ if (shard_map(shard, O_RDWR) < 0)
+ return -1;
+ if (shard_reset(shard) < 0)
+ return -1;
+ shard->header.objects_count = objects_count;
+ shard->index =
+ (shard_index_t *)malloc(sizeof(shard_index_t) * objects_count);
+ return 0;
+}
+
+int shard_load(shard_t *shard) {
+ if (shard_map(shard, O_RDONLY) < 0)
+ return -1;
+ return shard_hash_load(shard);
+}
diff --git a/swh/perfecthash/test_hash.cpp b/swh/perfecthash/test_hash.cpp
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/test_hash.cpp
@@ -0,0 +1,136 @@
+#include <experimental/filesystem>
+#include <fcntl.h>
+#include <gtest/gtest.h>
+#include <random>
+#include <sys/types.h>
+#include <unistd.h>
+
+extern "C" {
+#include "hash.h"
+}
+
+using namespace std::experimental;
+
+filesystem::path
+create_temporary_directory(unsigned long long max_tries = 1000) {
+ auto tmp_dir = filesystem::temp_directory_path();
+ unsigned long long i = 0;
+ std::random_device dev;
+ std::mt19937 prng(dev());
+ std::uniform_int_distribution<int> rand(0);
+ filesystem::path path;
+ while (true) {
+ std::stringstream ss;
+ ss << std::hex << rand(prng);
+ path = tmp_dir / ss.str();
+ // true if the directory was created.
+ if (filesystem::create_directory(path)) {
+ break;
+ }
+ if (i == max_tries) {
+ throw std::runtime_error("could not find non-existing directory");
+ }
+ i++;
+ }
+ return path;
+}
+
+std::string gen_random(const int len) {
+
+ std::string tmp_s;
+ static const char alphanum[] = "0123456789"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz";
+
+ tmp_s.reserve(len);
+ std::random_device dev;
+ std::mt19937 prng(dev());
+ std::uniform_int_distribution<int> rand(0, strlen(alphanum) - 1);
+
+ for (int i = 0; i < len; ++i)
+ tmp_s += alphanum[rand(prng)];
+
+ return tmp_s;
+}
+
+TEST(HashTest, One) {
+ auto tmpdir = create_temporary_directory();
+ filesystem::path tmpfile = tmpdir / std::string("shard");
+ ASSERT_GE(close(open(tmpfile.c_str(), O_CREAT, 0777)), 0);
+ ASSERT_GE(truncate(tmpfile.c_str(), 10 * 1024 * 1024), 0);
+
+ shard_t *shard = shard_init(tmpfile.c_str());
+ ASSERT_NE(shard, nullptr);
+ ASSERT_GE(shard_create(shard, 1), 0);
+ const char *keyA = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
+ const char *objectA = "AAAA";
+ size_t objectA_size = strlen(objectA);
+ ASSERT_GE(shard_object_write(shard, keyA, objectA, objectA_size), 0);
+ ASSERT_GE(shard_save(shard), 0);
+ char *found = NULL;
+ size_t found_size = 0;
+ ASSERT_GE(shard_lookup(shard, keyA, &found, &found_size), 0);
+ ASSERT_EQ(objectA_size, found_size);
+ ASSERT_EQ(memcmp((const void *)objectA, (const void *)found, found_size),
+ 0);
+ ASSERT_GE(shard_destroy(shard), 0);
+
+ shard = shard_init(tmpfile.c_str());
+ ASSERT_NE(shard, nullptr);
+ ASSERT_GE(shard_load(shard), 0);
+ found = NULL;
+ found_size = 0;
+ ASSERT_GE(shard_lookup(shard, keyA, &found, &found_size), 0);
+ ASSERT_EQ(objectA_size, found_size);
+ ASSERT_EQ(memcmp((const void *)objectA, (const void *)found, found_size),
+ 0);
+ ASSERT_GE(shard_destroy(shard), 0);
+
+ filesystem::remove_all(tmpdir);
+}
+
+TEST(HashTest, Many) {
+ auto tmpdir = create_temporary_directory();
+ filesystem::path tmpfile = tmpdir / std::string("shard");
+ ASSERT_GE(close(open(tmpfile.c_str(), O_CREAT, 0777)), 0);
+ ASSERT_GE(truncate(tmpfile.c_str(), 10 * 1024 * 1024), 0);
+
+ std::map<std::string, std::string> key2object;
+
+ shard_t *shard = shard_init(tmpfile.c_str());
+ ASSERT_NE(shard, nullptr);
+ int objects_count = 10;
+ ASSERT_GE(shard_create(shard, objects_count), 0);
+ for (int i = 0; i < objects_count; i++) {
+ std::string key = gen_random(32);
+ std::string object = gen_random(50);
+ key2object[key] = object;
+ std::cout << key << std::endl;
+ ASSERT_GE(shard_object_write(shard, key.c_str(), object.c_str(),
+ object.length()),
+ 0);
+ }
+ ASSERT_GE(shard_save(shard), 0);
+ ASSERT_GE(shard_destroy(shard), 0);
+
+ shard = shard_init(tmpfile.c_str());
+ ASSERT_NE(shard, nullptr);
+ ASSERT_GE(shard_load(shard), 0);
+ for (std::pair<std::string, std::string> p : key2object) {
+ char *found = NULL;
+ size_t found_size = 0;
+ ASSERT_GE(shard_lookup(shard, p.first.c_str(), &found, &found_size), 0);
+ ASSERT_EQ(p.second.length(), found_size);
+ ASSERT_EQ(memcmp((const void *)p.second.c_str(), (const void *)found,
+ found_size),
+ 0);
+ }
+ ASSERT_GE(shard_destroy(shard), 0);
+
+ filesystem::remove_all(tmpdir);
+}
+
+int main(int argc, char **argv) {
+ testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/swh/perfecthash/tests/test_hash.py b/swh/perfecthash/tests/test_hash.py
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/tests/test_hash.py
@@ -0,0 +1,24 @@
+import os
+from swh.perfecthash import Shard
+
+
+def test_all(tmpdir):
+ f = f'{tmpdir}/shard'
+ open(f, 'w').close()
+ os.truncate(f, 10*1024*1024)
+
+ s = Shard(f).create()
+ keyA = "A"*32
+ objectA = "AAAA"
+ s.write(keyA, objectA)
+ keyB = "B"*32
+ objectB = "BBBB"
+ s.write(keyB, objectB)
+ s.save()
+ del s
+
+ s = Shard(f).load()
+ assert s.lookup(keyA) == objectA
+ assert s.lookup(keyB) == objectB
+ assert s.lookup("UNKNOWN") is None
+ del s
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Sun, Aug 24, 5:56 PM (6 d, 3 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3226946
Attached To
D6424: Perfect hashmap C implementation
Event Timeline
Log In to Comment