Page MenuHomeSoftware Heritage

D6424.id23806.diff
No OneTemporary

D6424.id23806.diff

diff --git a/.gitignore b/.gitignore
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,9 @@
version.txt
.mypy_cache/
.vscode/
+*.o
+swh/perfecthash/test_hash
+swh/perfecthash/hash.gcda
+swh/perfecthash/hash.gcno
+swh/perfecthash/test_hash.lcov
+swh/perfecthash/html
diff --git a/AUTHORS b/AUTHORS
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,3 +1,3 @@
-Copyright (C) 2019 The Software Heritage developers
+Copyright (C) 2021 The Software Heritage developers
See http://www.softwareheritage.org/ for more information.
diff --git a/MANIFEST.in b/MANIFEST.in
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,5 +1,6 @@
include Makefile
include requirements*.txt
include version.txt
-include README.md
+include README.rst
recursive-include swh py.typed
+include swh/perfecthash/hash.[ch]
diff --git a/conftest.py b/conftest.py
new file mode 100644
--- /dev/null
+++ b/conftest.py
@@ -0,0 +1,22 @@
+def pytest_addoption(parser):
+ parser.addoption(
+ "--shard-size", default=10, type=int, help="Size of the Read Shard file in MB",
+ )
+ parser.addoption(
+ "--shard-path", default="/tmp/payload", help="Path of the Read Shard file",
+ )
+ parser.addoption(
+ "--shard-count",
+ default=2,
+ type=int,
+ help="Number of Read Shard files for lookup tests",
+ )
+ parser.addoption(
+ "--object-max-size",
+ default=10 * 1024,
+ type=int,
+        help="Maximum size of an object in bytes",
+ )
+ parser.addoption(
+ "--lookups", default=10, type=int, help="Number of lookups to perform",
+ )
diff --git a/docs/README.rst b/docs/README.rst
deleted file mode 100644
--- a/docs/README.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-Software Heritage - Python module template
-==========================================
-
-Python module template, used as skeleton to create new modules.
diff --git a/docs/benchmarks.rst b/docs/benchmarks.rst
new file mode 100644
--- /dev/null
+++ b/docs/benchmarks.rst
@@ -0,0 +1,93 @@
+Benchmarks
+==========
+
+The tests (tox) run the benchmark code with very small files. The benchmark is about running the same
+code with Read Shards that have the expected size in production (100GB) and verify:
+
+* The time required to build a Read Shard is less than 5 times the
+ time to copy a file of the same size. It guards against regressions
+ that would have a significant performance impact.
+
+* The time required to build and write the perfect hash table is
+ always a fraction of the time required to copy the content of the
+ objects.
+
+* It is possible to perform at least 10,000 object lookups per second.
+
+
+Build performances
+------------------
+
+In both cases the distribution of the object sizes is uniform.
+
+With a small number of large (<100MB) objects
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The goal is to verify that manipulating up to the largest supported
+object size actually works and does not trigger problems in the
+extreme case that all objects have the maximum size.
+
+* time tox -e py3 -- --basetemp=/mnt/pytest -s --shard-path /mnt/payload --shard-size $((100 * 1024)) --object-max-size $((100 * 1024 * 1024)) -k test_build_speed
+ number of objects = 2057, total size = 107374116576
+ baseline 165.85, write_duration 327.19, build_duration 0.0062, total_duration 327.19
+
+With a large number of small (<4KB) objects
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+It approximates the maximum number of objects a Read Shard can
+contain. The goal is to verify that creating a perfect hash with this
+many items does not require more than a fraction of the time required
+to copy the objects.
+
+* time tox -e py3 -- --basetemp=/mnt/pytest -s --shard-path /mnt/payload --shard-size $((100 * 1024)) --object-max-size $((4 * 1024)) -k test_build_speed
+ number of objects = 45973694, total size = 105903024192
+ baseline 165.74, write_duration 495.07, build_duration 24.21, total_duration 519.28
+
+
+Object lookup performances
+--------------------------
+
+The machine has 200GB of RAM and can therefore approximately cache the
+content of two Read Shards which can have a significant impact on
+performances. To minimize that effect, four Read Shards are created
+totaling 400GB. All objects are looked up in all shards to verify
+the lookup speed is greater than 10,000 objects per second.
+
+* time tox -e py3 -- --basetemp=/mnt/pytest -s -k test_lookup_speed --lookups $((100 * 1024 * 1024)) --shard-size $((100 * 1024)) --object-max-size $((4 * 1024)) swh/perfecthash/tests/test_hash.py --shard-path /mnt/payload --shard-count 4
+ number of objects = 45972629, total size = 105903058272
+ key lookups speed = 148528.56/s
+
+
+Setup via Fed4Fire
+------------------
+
+* https://www.fed4fire.eu/
+* /opt/jFed/jFed-Experimenter
+* Create an experiment with one machine
+* Click on Topology Viewer and run the experiment (name test)
+* Once the provisioning is complete (Testing connectivity to resources on Grid5000) click Export As
+* Choose Export Configuration Management Settings
+* Save under test.zip and unzip
+* ssh -i id_rsa -F ssh-config node0
+* alias sudo=sudo-g5k
+
+Setup via Grid5000
+------------------
+
+* https://www.grid5000.fr/
+* oarsub -I -l "{cluster='dahu'}/host=1,walltime=1" -t deploy
+* kadeploy3 -f $OAR_NODE_FILE -e debian11-x64-base -k
+* ssh root@$(tail -1 $OAR_NODE_FILE)
+
+Common setup
+------------
+
+* mkfs.ext4 /dev/sdc
+* mount /dev/sdc /mnt
+* apt-get install -y python3-venv python3-dev libcmph-dev gcc git emacs-nox
+* git clone -b wip-benchmark https://git.easter-eggs.org/biceps/swh-perfecthash/
+* python3 -m venv bench
+* source bench/bin/activate
+* cd swh-perfecthash
+* pip install -r requirements.txt -r requirements-test.txt tox wheel
+* tox -e py3
diff --git a/docs/index.rst b/docs/index.rst
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,15 +1,17 @@
-.. _swh-py-template:
+.. _swh-perfecthash:
-.. include:: README.rst
+Software Heritage - Object storage perfect hash
+===============================================
+
+Low level management for read-only content-addressable object storage
+indexed with a perfect hash table.
-.. toctree::
- :maxdepth: 2
- :caption: Contents:
+Reference Documentation
+-----------------------
-Indices and tables
-------------------
+.. toctree::
+ :maxdepth: 2
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
+ /apidoc/swh.perfecthash
+ benchmarks
diff --git a/mypy.ini b/mypy.ini
--- a/mypy.ini
+++ b/mypy.ini
@@ -11,5 +11,8 @@
[mypy-pytest.*]
ignore_missing_imports = True
-# [mypy-add_your_lib_here.*]
-# ignore_missing_imports = True
+[mypy-_hash_cffi.*]
+ignore_missing_imports = True
+
+[mypy-cffi.*]
+ignore_missing_imports = True
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,4 @@
# should match https://pypi.python.org/pypi names. For the full spec or
# dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html
+cffi
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -48,9 +48,10 @@
packages=find_packages(), # packages's modules
install_requires=parse_requirements(None, "swh"),
tests_require=parse_requirements("test"),
- setup_requires=["setuptools-scm"],
+ setup_requires=["setuptools-scm", "cffi"],
use_scm_version=True,
extras_require={"testing": parse_requirements("test")},
+ cffi_modules=["swh/perfecthash/build.py:ffibuilder"],
classifiers=[
"Programming Language :: Python :: 3",
"Intended Audience :: Developers",
diff --git a/swh/perfecthash/.clang-format b/swh/perfecthash/.clang-format
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/.clang-format
@@ -0,0 +1,2 @@
+---
+IndentWidth: 4
diff --git a/swh/perfecthash/Makefile b/swh/perfecthash/Makefile
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/Makefile
@@ -0,0 +1,19 @@
+CFLAGS=-DHASH_DEBUG -Wall -I../.. -g -std=c++17 -fprofile-arcs -ftest-coverage
+LDFLAGS=-lcmph -lgtest -lpthread -lstdc++ -lstdc++fs -fprofile-arcs -ftest-coverage
+
+test_hash: hash.o test_hash.o
+ $(CXX) -o $@ $^ $(LDFLAGS)
+
+hash.c: hash.h
+test_hash.cpp: hash.h
+
+format:
+ clang-format -i hash.c hash.h test_hash.cpp
+
+check: test_hash
+ valgrind --leak-check=full --tool=memcheck ./test_hash
+ lcov -d . -c -o test_hash.lcov
+ rm -fr html ; genhtml -o html test_hash.lcov
+
+clean:
+ rm -f *.o test_hash
diff --git a/swh/perfecthash/__init__.py b/swh/perfecthash/__init__.py
--- a/swh/perfecthash/__init__.py
+++ b/swh/perfecthash/__init__.py
@@ -0,0 +1,105 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import NewType
+
+from _hash_cffi import lib
+from cffi import FFI
+
+Key = NewType("Key", bytes)
+HashObject = NewType("HashObject", bytes)
+
+
+class Shard:
+ """Low level management for files indexed with a perfect hash table.
+
+ This class allows creating a Read Shard by adding key/object pairs
+ and looking up the content of an object when given the key.
+ """
+
+ def __init__(self, path: str):
+ """Initialize with an existing Read Shard.
+
+ Args:
+ path: path to an existing Read Shard file or device
+
+ """
+ self.ffi = FFI()
+ self.shard = lib.shard_init(path.encode("utf-8"))
+
+ def __del__(self):
+ lib.shard_destroy(self.shard)
+
+ def create(self, objects_count: int) -> "Shard":
+ """Wipe out the content of the Read Shard. It must be followed by
+ **object_count** calls to the **write** method otherwise the content
+ of the Read Shard will be inconsistent. When all objects are inserted,
+ the Read Shard must be made persistent by calling the **save** method.
+
+ Args:
+ objects_count: number of objects in the Read Shard.
+
+ Returns:
+ self.
+
+ """
+ assert lib.shard_create(self.shard, objects_count) != -1
+ return self
+
+ def load(self):
+ """Open the Read Shard file in read-only mode.
+
+ Returns:
+ self.
+ """
+ assert lib.shard_load(self.shard) != -1
+ return self
+
+ def save(self):
+ """Create the perfect hash table the **lookup** method
+ relies on to find the content of the objects.
+
+        It must be called after **create** and **write** otherwise the
+ content of the Read Shard will be inconsistent.
+
+ Returns:
+ 0 on success, -1 on error.
+ """
+ return lib.shard_save(self.shard)
+
+ def lookup(self, key: Key) -> HashObject:
+ """Fetch the object matching the key in the Read Shard.
+
+ Fetching an object is O(1): one lookup in the index to obtain
+ the offset of the object in the Read Shard and one read to get
+ the payload.
+
+ Args:
+ key: the key associated with the object to retrieve.
+
+ Returns:
+ the object as bytes.
+ """
+ object_size_pointer = self.ffi.new("size_t*")
+ lib.shard_lookup_object_size(self.shard, key, object_size_pointer)
+ object_size = object_size_pointer[0]
+ object_pointer = self.ffi.new("char[]", object_size)
+ lib.shard_lookup_object(self.shard, object_pointer, object_size)
+ return self.ffi.buffer(object_pointer, object_size)
+
+ def write(self, key: Key, object: HashObject) -> int:
+ """Add the key/object pair to the Read Shard.
+
+ The **create** method must have been called prior to calling
+ the **write** method.
+
+ Args:
+ key: the unique key associated with the object.
+ object: the object
+
+ Returns:
+ 0 on success, -1 on error.
+ """
+ return lib.shard_object_write(self.shard, key, object, len(object))
diff --git a/swh/perfecthash/build.py b/swh/perfecthash/build.py
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/build.py
@@ -0,0 +1,42 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from cffi import FFI
+
+ffibuilder = FFI()
+
+# cdef() expects a single string declaring the C types, functions and
+# globals needed to use the shared object. It must be in valid C syntax.
+ffibuilder.cdef(
+ """
+typedef struct shard_t shard_t;
+
+shard_t* shard_init(const char* path);
+int shard_destroy(shard_t* shard);
+
+int shard_create(shard_t* shard, size_t objects_count);
+int shard_object_write(shard_t* shard, const char* key,
+ const char* object, size_t object_size);
+int shard_save(shard_t* shard);
+
+int shard_load(shard_t* shard);
+int shard_lookup_object_size(shard_t *shard, const char *key, size_t *object_size);
+int shard_lookup_object(shard_t *shard, char *object, size_t object_size);
+
+"""
+)
+
+ffibuilder.set_source(
+ "_hash_cffi",
+ """
+ #include "swh/perfecthash/hash.h"
+ """,
+ sources=["swh/perfecthash/hash.c"],
+ include_dirs=["."],
+ libraries=["cmph"],
+) # library name, for the linker
+
+if __name__ == "__main__":
+ ffibuilder.compile(verbose=True)
diff --git a/swh/perfecthash/hash.h b/swh/perfecthash/hash.h
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/hash.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2021 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+#include <cmph.h>
+#include <cmph_types.h>
+
+/*
+ * The Read Shard has the following structure:
+ *
+ * - bytes [0, objects_position[
+ * The header shard_header_t
+ * - bytes [objects_position, index_position[
+ * objects_count times the size of the object (size_t) followed by the
+ * object itself
+ * - bytes [index_position, hash_position[
+ * An array of size_t object positions in the range [objects_position,
+ * index_position[ The size of the array is provided by cmph_size after building
+ * the hash function
+ * - bytes [hash_position, ...[
+ * The hash function, as written by cmph_dump
+ */
+
+#define SHARD_OFFSET_HEADER 512
+#define SHARD_KEY_LEN 32
+
+typedef struct {
+ size_t objects_count;
+ size_t objects_position;
+ size_t objects_size;
+ size_t index_position;
+ size_t index_size;
+ size_t hash_position;
+} shard_header_t;
+
+typedef struct {
+ char key[SHARD_KEY_LEN];
+ size_t object_offset;
+} shard_index_t;
+
+typedef struct {
+ char *path;
+ FILE *f;
+ shard_header_t header;
+ cmph_t *hash;
+
+ // The following fields are only used when creating the Read Shard
+ cmph_io_adapter_t *source;
+ cmph_config_t *config;
+ shard_index_t *index;
+ size_t index_offset;
+} shard_t;
+
+shard_t *shard_init(const char *path);
+int shard_destroy(shard_t *shard);
+
+int shard_create(shard_t *shard, size_t objects_count);
+int shard_object_write(shard_t *shard, const char *key, const char *object,
+ size_t object_size);
+int shard_save(shard_t *shard);
+
+int shard_load(shard_t *shard);
+int shard_lookup_object_size(shard_t *shard, const char *key,
+ size_t *object_size);
+int shard_lookup_object(shard_t *shard, char *object, size_t object_size);
diff --git a/swh/perfecthash/hash.c b/swh/perfecthash/hash.c
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/hash.c
@@ -0,0 +1,380 @@
+/*
+ * Copyright (C) 2021 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <memory.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "swh/perfecthash/hash.h"
+
+#ifdef HASH_DEBUG
+#define debug(...) printf(__VA_ARGS__)
+#else
+#define debug(...)
+#endif
+
+size_t ntohq(size_t v) { return __builtin_bswap64(v); }
+
+size_t htonq(size_t v) { return __builtin_bswap64(v); }
+
+/***********************************************************
+ * wrappers around FILE functions that:
+ * - return -1 on error
+ * - print a meaningful message when an error occurs
+ *
+ */
+
+int shard_open(shard_t *shard, const char *mode) {
+ shard->f = fopen(shard->path, mode);
+ if (shard->f == NULL) {
+ printf("shard_open: open(%s, %s): %s\n", shard->path, mode,
+ strerror(errno));
+ return -1;
+ } else
+ return 0;
+}
+
+int shard_close(shard_t *shard) {
+ if (shard->f == NULL)
+ return 0;
+ int r = fclose(shard->f);
+ if (r < 0)
+ printf("shard_close: fclose(%p): %s\n", shard->f, strerror(errno));
+ return r;
+}
+
+int shard_seek(shard_t *shard, long offset, int whence) {
+ int r = fseek(shard->f, offset, whence);
+ if (r < 0)
+ printf("shard_seek: fseek(%p, %ld, %d): %s\n", shard->f, offset, whence,
+ strerror(errno));
+ return r;
+}
+
+size_t shard_tell(shard_t *shard) {
+ long r = ftell(shard->f);
+ if (r < 0)
+ printf("shard_tell: ftell(%p): %s\n", shard->f, strerror(errno));
+ return r;
+}
+
+int shard_read(shard_t *shard, void *ptr, size_t size) {
+ size_t read;
+ if ((read = fread(ptr, 1, size, shard->f)) != size) {
+ printf("shard_read: read %ld instead of %ld\n", read, size);
+ return -1;
+ }
+ return 0;
+}
+
+int shard_read_size_t(shard_t *shard, size_t *ptr) {
+ size_t n_size;
+ if (shard_read(shard, &n_size, sizeof(size_t)) < 0) {
+ printf("shard_read_size_t: shard_read\n");
+ return -1;
+ }
+ *ptr = ntohq(n_size);
+ return 0;
+}
+
+int shard_write(shard_t *shard, const void *ptr, size_t nmemb) {
+ size_t wrote;
+ if ((wrote = fwrite(ptr, 1, nmemb, shard->f)) != nmemb) {
+ printf("shard_write: wrote %ld instead of %ld\n", wrote, nmemb);
+ return -1;
+ }
+ return 0;
+}
+
+/***********************************************************
+ * load or save a shard_header_t
+ */
+
+int shard_header_print(shard_header_t *header) {
+#define PRINT(name) debug("shard_header_print: " #name " %ld\n", header->name)
+ PRINT(objects_count);
+ PRINT(objects_position);
+ PRINT(objects_size);
+ PRINT(index_position);
+ PRINT(index_size);
+ PRINT(hash_position);
+#undef PRINT
+ return 0;
+}
+
+int shard_header_load(shard_t *shard) {
+ if (shard_seek(shard, 0, SEEK_SET) < 0) {
+ printf("shard_header_load\n");
+ return -1;
+ }
+ shard_header_t header;
+ if (shard_read(shard, (void *)&header, sizeof(shard_header_t)) < 0) {
+ printf("shard_header_load\n");
+ return -1;
+ }
+#define LOAD(name) shard->header.name = ntohq(header.name)
+ LOAD(objects_count);
+ LOAD(objects_position);
+ LOAD(objects_size);
+ LOAD(index_position);
+ LOAD(index_size);
+ LOAD(hash_position);
+#undef LOAD
+ shard_header_print(&shard->header);
+ return 0;
+}
+
+int shard_header_save(shard_t *shard) {
+ if (shard_seek(shard, 0, SEEK_SET) < 0) {
+ printf("shard_header_save\n");
+ return -1;
+ }
+ shard_header_print(&shard->header);
+ shard_header_t header;
+#define SAVE(name) header.name = htonq(shard->header.name)
+ SAVE(objects_count);
+ SAVE(objects_position);
+ SAVE(objects_size);
+ SAVE(index_position);
+ SAVE(index_size);
+ SAVE(hash_position);
+#undef SAVE
+ if (shard_write(shard, (void *)&header, sizeof(shard_header_t)) < 0) {
+ printf("shard_header_save\n");
+ return -1;
+ }
+ return 0;
+}
+
+int shard_header_reset(shard_header_t *header) {
+ memset((void *)header, '\0', sizeof(shard_header_t));
+ header->objects_position = SHARD_OFFSET_HEADER;
+ return 0;
+}
+
+/***********************************************************
+ * Create the Read Shard
+ */
+
+int shard_object_write(shard_t *shard, const char *key, const char *object,
+ size_t object_size) {
+ // save key & index to later build the hash
+ shard_index_t *index = &shard->index[shard->index_offset];
+ memcpy((void *)index->key, key, SHARD_KEY_LEN);
+ index->object_offset = shard_tell(shard);
+ shard->index_offset++;
+ // write the object size and the object itself
+ size_t n_object_size = htonq(object_size);
+ if (shard_write(shard, (void *)&n_object_size, sizeof(size_t)) < 0) {
+ printf("shard_object_write: object_size\n");
+ return -1;
+ }
+ if (shard_write(shard, (void *)object, object_size) < 0) {
+ printf("shard_object_write: object\n");
+ return -1;
+ }
+ return 0;
+}
+
+static int io_read(void *data, char **key, cmph_uint32 *keylen) {
+ shard_t *shard = (shard_t *)data;
+ *key = shard->index[shard->index_offset].key;
+ *keylen = SHARD_KEY_LEN;
+ shard->index_offset++;
+ return shard->index_offset >= shard->header.objects_count ? -1 : 0;
+}
+
+static void io_dispose(void *data, char *key, cmph_uint32 keylen) {}
+
+static void io_rewind(void *data) {
+ shard_t *shard = (shard_t *)data;
+ shard->index_offset = 0;
+}
+
+static cmph_io_adapter_t *io_adapter(shard_t *shard) {
+ cmph_io_adapter_t *key_source =
+ (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t));
+ if (key_source == NULL)
+ return NULL;
+ key_source->data = (void *)shard;
+ key_source->nkeys = shard->header.objects_count;
+ key_source->read = io_read;
+ key_source->dispose = io_dispose;
+ key_source->rewind = io_rewind;
+ return key_source;
+}
+
+int shard_hash_create(shard_t *shard) {
+ shard->source = io_adapter(shard);
+ shard->config = cmph_config_new(shard->source);
+ cmph_config_set_algo(shard->config, CMPH_CHD_PH);
+ cmph_config_set_keys_per_bin(shard->config, 1);
+ cmph_config_set_b(shard->config, 4);
+ shard->hash = cmph_new(shard->config);
+ return 0;
+}
+
+int shard_index_save(shard_t *shard) {
+ shard->header.index_position =
+ shard->header.objects_position + shard->header.objects_size;
+ debug("shard_index_save: index_position %ld\n",
+ shard->header.index_position);
+ assert(shard->header.index_position == shard_tell(shard));
+ cmph_uint32 count = cmph_size(shard->hash);
+ debug("shard_index_save: count = %d\n", count);
+ shard->header.index_size = count * sizeof(size_t);
+ size_t *index = (size_t *)calloc(1, shard->header.index_size);
+ for (size_t i = 0; i < shard->index_offset; i++) {
+ cmph_uint32 h =
+ cmph_search(shard->hash, shard->index[i].key, SHARD_KEY_LEN);
+ debug("shard_index_save: i = %ld, h = %d, offset = %ld\n", i, h,
+ shard->index[i].object_offset);
+ assert(h < count);
+ index[h] = htonq(shard->index[i].object_offset);
+ }
+ size_t index_size = shard->header.index_size;
+ if (shard_write(shard, (void *)index, index_size) < 0) {
+ printf("shard_index_save\n");
+ return -1;
+ }
+ free(index);
+ return 0;
+}
+
+int shard_hash_save(shard_t *shard) {
+ shard->header.hash_position =
+ shard->header.index_position + shard->header.index_size;
+ debug("shard_hash_save: hash_position %ld\n", shard->header.hash_position);
+ cmph_dump(shard->hash, shard->f);
+ return 0;
+}
+
+int shard_save(shard_t *shard) {
+ shard->header.objects_size =
+ shard_tell(shard) - shard->header.objects_position;
+ return (shard_hash_create(shard) < 0 || shard_index_save(shard) < 0 ||
+ shard_hash_save(shard) < 0 || shard_header_save(shard) < 0)
+ ? -1
+ : 0;
+ return 0;
+}
+
+int shard_reset(shard_t *shard) {
+ if (shard_header_reset(&shard->header) < 0)
+ return -1;
+ return shard_seek(shard, SHARD_OFFSET_HEADER, SEEK_SET);
+}
+
+int shard_create(shard_t *shard, size_t objects_count) {
+ if (shard_open(shard, "w+") < 0)
+ return -1;
+ if (shard_reset(shard) < 0)
+ return -1;
+ shard->header.objects_count = objects_count;
+ shard->index =
+ (shard_index_t *)malloc(sizeof(shard_index_t) * objects_count);
+ return 0;
+}
+
+/**********************************************************
+ * Lookup objects from a Read Shard
+ */
+
+int shard_lookup_object_size(shard_t *shard, const char *key,
+ size_t *object_size) {
+ debug("shard_lookup_object_size\n");
+ cmph_uint32 h = cmph_search(shard->hash, key, SHARD_KEY_LEN);
+ debug("shard_lookup_object_size: h = %d\n", h);
+ size_t index_offset = shard->header.index_position + h * sizeof(size_t);
+ debug("shard_lookup_object_size: index_offset = %ld\n", index_offset);
+ if (shard_seek(shard, index_offset, SEEK_SET) < 0) {
+ printf("shard_lookup_object_size: index_offset\n");
+ return -1;
+ }
+ size_t object_offset;
+ if (shard_read_size_t(shard, &object_offset) < 0) {
+ printf("shard_lookup_object_size: object_offset\n");
+ return -1;
+ }
+ debug("shard_lookup_object_size: object_offset = %ld\n", object_offset);
+ if (shard_seek(shard, object_offset, SEEK_SET) < 0) {
+ printf("shard_lookup_object_size: object_offset\n");
+ return -1;
+ }
+ if (shard_read_size_t(shard, object_size) < 0) {
+ printf("shard_lookup_object_size: object_size\n");
+ return -1;
+ }
+ debug("shard_lookup_object_size: object_size = %ld\n", *object_size);
+ return 0;
+}
+
+int shard_lookup_object(shard_t *shard, char *object, size_t object_size) {
+ if (shard_read(shard, (void *)object, object_size) < 0) {
+ printf("shard_lookup_object: object\n");
+ return -1;
+ }
+ return 0;
+}
+
+int shard_hash_load(shard_t *shard) {
+ if (shard_seek(shard, shard->header.hash_position, SEEK_SET) < 0) {
+ printf("shard_hash_load\n");
+ return -1;
+ }
+ debug("shard_hash_load: hash_position %ld\n", shard->header.hash_position);
+ shard->hash = cmph_load(shard->f);
+ if (shard->hash == NULL) {
+ printf("shard_hash_load: cmph_load\n");
+ return -1;
+ }
+ return 0;
+}
+
+int shard_load(shard_t *shard) {
+ debug("shard_load\n");
+ if (shard_open(shard, "r") < 0)
+ return -1;
+ if (shard_header_load(shard) < 0)
+ return -1;
+ return shard_hash_load(shard);
+}
+
+/**********************************************************
+ * Initialize and destroy a Read Shard
+ */
+
+shard_t *shard_init(const char *path) {
+ debug("shard_init\n");
+ shard_t *shard = (shard_t *)malloc(sizeof(shard_t));
+ if (shard == NULL)
+ return NULL;
+ memset((void *)shard, '\0', sizeof(shard_t));
+ shard->path = strdup(path);
+ return shard;
+}
+
+int shard_destroy(shard_t *shard) {
+ if (shard->source)
+ free(shard->source);
+ if (shard->config)
+ cmph_config_destroy(shard->config);
+ if (shard->hash)
+ cmph_destroy(shard->hash);
+ if (shard->index)
+ free(shard->index);
+ free(shard->path);
+ int r = shard_close(shard);
+ free(shard);
+ return r;
+}
diff --git a/swh/perfecthash/test_hash.cpp b/swh/perfecthash/test_hash.cpp
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/test_hash.cpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (C) 2021 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+#include <experimental/filesystem>
+#include <fcntl.h>
+#include <gtest/gtest.h>
+#include <random>
+#include <sys/types.h>
+#include <unistd.h>
+
+extern "C" {
+#include "hash.h"
+}
+
+using namespace std::experimental;
+
+filesystem::path
+create_temporary_directory(unsigned long long max_tries = 1000) {
+ auto tmp_dir = filesystem::temp_directory_path();
+ unsigned long long i = 0;
+ std::random_device dev;
+ std::mt19937 prng(dev());
+ std::uniform_int_distribution<int> rand(0);
+ filesystem::path path;
+ while (true) {
+ std::stringstream ss;
+ ss << std::hex << rand(prng);
+ path = tmp_dir / ss.str();
+ // true if the directory was created.
+ if (filesystem::create_directory(path)) {
+ break;
+ }
+ if (i == max_tries) {
+ throw std::runtime_error("could not find non-existing directory");
+ }
+ i++;
+ }
+ return path;
+}
+
+std::string gen_random(const int len) {
+
+ std::string tmp_s;
+ static const char alphanum[] = "0123456789"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz";
+
+ tmp_s.reserve(len);
+ std::random_device dev;
+ std::mt19937 prng(dev());
+ std::uniform_int_distribution<int> rand(0, strlen(alphanum) - 1);
+
+ for (int i = 0; i < len; ++i)
+ tmp_s += alphanum[rand(prng)];
+
+ return tmp_s;
+}
+
+TEST(HashTest, One) {
+ auto tmpdir = create_temporary_directory();
+ filesystem::path tmpfile = tmpdir / std::string("shard");
+ ASSERT_GE(close(open(tmpfile.c_str(), O_CREAT, 0777)), 0);
+ ASSERT_GE(truncate(tmpfile.c_str(), 10 * 1024 * 1024), 0);
+
+ //
+ // Create a Read Shard and write a single object
+ //
+ shard_t *shard = shard_init(tmpfile.c_str());
+ ASSERT_NE(shard, nullptr);
+ ASSERT_GE(shard_create(shard, 1), 0);
+ const char *keyA = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
+ const char *objectA = "AAAAA";
+ size_t objectA_size = strlen(objectA);
+ ASSERT_GE(shard_object_write(shard, keyA, objectA, objectA_size), 0);
+ ASSERT_GE(shard_save(shard), 0);
+ size_t found_size = 0;
+ ASSERT_GE(shard_lookup_object_size(shard, keyA, &found_size), 0);
+ ASSERT_EQ(objectA_size, found_size);
+ char *found = (char *)malloc(found_size);
+ ASSERT_GE(shard_lookup_object(shard, found, found_size), 0);
+ ASSERT_EQ(memcmp((const void *)objectA, (const void *)found, found_size),
+ 0);
+ free(found);
+ ASSERT_GE(shard_destroy(shard), 0);
+
+ //
+ // Open the Read Shard created above and verify the object can be
+ // looked up.
+ //
+ shard = shard_init(tmpfile.c_str());
+ ASSERT_NE(shard, nullptr);
+ ASSERT_GE(shard_load(shard), 0);
+ found_size = 0;
+ ASSERT_GE(shard_lookup_object_size(shard, keyA, &found_size), 0);
+ ASSERT_EQ(objectA_size, found_size);
+ found = (char *)malloc(found_size);
+ ASSERT_GE(shard_lookup_object(shard, found, found_size), 0);
+ ASSERT_EQ(memcmp((const void *)objectA, (const void *)found, found_size),
+ 0);
+ free(found);
+ ASSERT_GE(shard_destroy(shard), 0);
+
+ filesystem::remove_all(tmpdir);
+}
+
+TEST(HashTest, Many) {
+ auto tmpdir = create_temporary_directory();
+ filesystem::path tmpfile = tmpdir / std::string("shard");
+ ASSERT_GE(close(open(tmpfile.c_str(), O_CREAT, 0777)), 0);
+ ASSERT_GE(truncate(tmpfile.c_str(), 10 * 1024 * 1024), 0);
+
+ //
+ // Populate a Read Shard with multiple objects (objects_count)
+ // The object content and their keys are from a random source
+ // A map is kept in memory in key2object for verification
+ //
+ std::map<std::string, std::string> key2object;
+
+ shard_t *shard = shard_init(tmpfile.c_str());
+ ASSERT_NE(shard, nullptr);
+ int objects_count = 10;
+ ASSERT_GE(shard_create(shard, objects_count), 0);
+ for (int i = 0; i < objects_count; i++) {
+ std::string key = gen_random(32);
+ std::string object = gen_random(50);
+ key2object[key] = object;
+ std::cout << key << std::endl;
+ ASSERT_GE(shard_object_write(shard, key.c_str(), object.c_str(),
+ object.length()),
+ 0);
+ }
+ ASSERT_GE(shard_save(shard), 0);
+ ASSERT_GE(shard_destroy(shard), 0);
+
+ //
+ // Open the Read Shard for lookups, lookup every key inserted
+ // and verify the matching object has the expected content.
+ //
+ shard = shard_init(tmpfile.c_str());
+ ASSERT_NE(shard, nullptr);
+ ASSERT_GE(shard_load(shard), 0);
+ for (std::pair<std::string, std::string> p : key2object) {
+ size_t found_size = 0;
+ ASSERT_GE(shard_lookup_object_size(shard, p.first.c_str(), &found_size),
+ 0);
+ ASSERT_EQ(p.second.length(), found_size);
+ char *found = (char *)malloc(found_size);
+ ASSERT_GE(shard_lookup_object(shard, found, found_size), 0);
+ ASSERT_EQ(memcmp((const void *)p.second.c_str(), (const void *)found,
+ found_size),
+ 0);
+ free(found);
+ }
+ ASSERT_GE(shard_destroy(shard), 0);
+
+ filesystem::remove_all(tmpdir);
+}
+
+int main(int argc, char **argv) {
+ testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/swh/perfecthash/tests/test_hash.py b/swh/perfecthash/tests/test_hash.py
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/tests/test_hash.py
@@ -0,0 +1,144 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import random
+import time
+
+import pytest
+
+from swh.perfecthash import Shard
+
+
+def test_all(tmpdir):
+ f = f"{tmpdir}/shard"
+ open(f, "w").close()
+ os.truncate(f, 10 * 1024 * 1024)
+
+ s = Shard(f).create(2)
+ keyA = b"A" * 32
+ objectA = b"AAAA"
+ s.write(keyA, objectA)
+ keyB = b"B" * 32
+ objectB = b"BBBB"
+ s.write(keyB, objectB)
+ s.save()
+ del s
+
+ s = Shard(f).load()
+ assert s.lookup(keyA) == objectA
+ assert s.lookup(keyB) == objectB
+ del s
+
+
+@pytest.fixture
+def payload(request):
+ size = request.config.getoption("--shard-size")
+ path = request.config.getoption("--shard-path")
+ if not os.path.exists(path) or os.path.getsize(path) != size * 1024 * 1024:
+ os.system(f"dd if=/dev/urandom of={path} count={size} bs=1024k")
+ return path
+
+
+#
+# PYTHONMALLOC=malloc valgrind --tool=memcheck .tox/py3/bin/pytest \
+# -k test_build_speed swh/perfecthash/tests/test_hash.py |& tee /tmp/v
+#
+def test_build_speed(request, tmpdir, payload):
+ start = time.time()
+ os.system(f"cp {payload} {tmpdir}/shard ; rm {tmpdir}/shard")
+ baseline = time.time() - start
+ write_duration, build_duration, _ = shard_build(request, tmpdir, payload)
+ duration = write_duration + build_duration
+ print(
+ f"baseline {baseline}, "
+ f"write_duration {write_duration}, "
+ f"build_duration {build_duration}, "
+ f"total_duration {duration}"
+ )
+ assert duration < baseline * 5
+
+
+def test_lookup_speed(request, tmpdir, payload):
+ _, _, objects = shard_build(request, tmpdir, payload)
+ for i in range(request.config.getoption("--shard-count")):
+ os.system(f"cp {tmpdir}/shard {tmpdir}/shard{i}")
+ start = time.time()
+ shard_lookups(request, tmpdir, objects)
+ duration = time.time() - start
+
+ lookups = request.config.getoption("--lookups")
+ key_per_sec = lookups / duration
+ print(f"key lookups speed = {key_per_sec:.2f}/s")
+
+
+def shard_lookups(request, tmpdir, objects):
+ shard_path = f"{tmpdir}/shard"
+ shards = []
+ for i in range(request.config.getoption("--shard-count")):
+ shards.append(Shard(f"{shard_path}{i}").load())
+ lookups = request.config.getoption("--lookups")
+ count = 0
+ while True:
+ for key, object_size in objects.items():
+ if count >= lookups:
+ return
+ for shard in shards:
+ object = shard.lookup(key)
+ assert len(object) == object_size
+ count += 1
+
+
+def shard_build(request, tmpdir, payload):
+ shard_size = request.config.getoption("--shard-size") * 1024 * 1024
+ shard_path = f"{tmpdir}/shard"
+ open(shard_path, "w").close()
+ os.truncate(shard_path, shard_size * 2)
+
+ object_max_size = request.config.getoption("--object-max-size")
+ objects = {}
+ count = 0
+ size = 0
+ with open(payload, "rb") as f:
+ while True:
+ key = f.read(32)
+ if len(key) < 32:
+ break
+ assert key not in objects
+ object = f.read(random.randrange(512, object_max_size))
+ if len(object) < 512:
+ break
+ objects[key] = len(object)
+ size += len(object)
+ count += 1
+
+ print(f"number of objects = {count}, total size = {size}")
+ assert size < shard_size
+ start = time.time()
+ shard = Shard(shard_path).create(len(objects))
+
+ count = 0
+ size = 0
+ with open(payload, "rb") as f:
+ while True:
+ key = f.read(32)
+ if len(key) < 32:
+ break
+ if key not in objects:
+ break
+ object = f.read(objects[key])
+ assert len(object) == objects[key]
+ count += 1
+ size += len(object)
+ assert shard.write(key, object) >= 0, (
+ f"object count {count}/{len(objects)}, "
+ f"size {size}, "
+ f"object size {len(object)}"
+ )
+ write_duration = time.time() - start
+ start = time.time()
+ shard.save()
+ build_duration = time.time() - start
+ return write_duration, build_duration, objects
diff --git a/swh/perfecthash/tests/test_nothing.py b/swh/perfecthash/tests/test_nothing.py
deleted file mode 100644
--- a/swh/perfecthash/tests/test_nothing.py
+++ /dev/null
@@ -1,3 +0,0 @@
-def test_nothing():
- # Placeholder; remove this when we add actual tests
- pass
diff --git a/tox.ini b/tox.ini
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
[tox]
-envlist=black,flake8,mypy,py3
+envlist=black,flake8,mypy,py3,c
[testenv]
extras =
@@ -19,6 +19,12 @@
commands =
{envpython} -m black --check swh
+[testenv:c]
+whitelist_externals = make
+usedevelop = true
+commands =
+ make -C swh/perfecthash check
+
[testenv:flake8]
skip_install = true
deps =

File Metadata

Mime Type
text/plain
Expires
Wed, Dec 18, 7:25 PM (1 h, 57 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3224799

Event Timeline