Page MenuHomeSoftware Heritage

D6424.id23806.diff
No OneTemporary

D6424.id23806.diff

diff --git a/.gitignore b/.gitignore
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,9 @@
version.txt
.mypy_cache/
.vscode/
+*.o
+swh/perfecthash/test_hash
+swh/perfecthash/hash.gcda
+swh/perfecthash/hash.gcno
+swh/perfecthash/test_hash.lcov
+swh/perfecthash/html
diff --git a/AUTHORS b/AUTHORS
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,3 +1,3 @@
-Copyright (C) 2019 The Software Heritage developers
+Copyright (C) 2021 The Software Heritage developers
See http://www.softwareheritage.org/ for more information.
diff --git a/MANIFEST.in b/MANIFEST.in
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,5 +1,6 @@
include Makefile
include requirements*.txt
include version.txt
-include README.md
+include README.rst
recursive-include swh py.typed
+include swh/perfecthash/hash.[ch]
diff --git a/conftest.py b/conftest.py
new file mode 100644
--- /dev/null
+++ b/conftest.py
@@ -0,0 +1,22 @@
+def pytest_addoption(parser):
+ parser.addoption(
+ "--shard-size", default=10, type=int, help="Size of the Read Shard file in MB",
+ )
+ parser.addoption(
+ "--shard-path", default="/tmp/payload", help="Path of the Read Shard file",
+ )
+ parser.addoption(
+ "--shard-count",
+ default=2,
+ type=int,
+ help="Number of Read Shard files for lookup tests",
+ )
+ parser.addoption(
+ "--object-max-size",
+ default=10 * 1024,
+ type=int,
+        help="Maximum size of an object in bytes",
+ )
+ parser.addoption(
+ "--lookups", default=10, type=int, help="Number of lookups to perform",
+ )
diff --git a/docs/README.rst b/docs/README.rst
deleted file mode 100644
--- a/docs/README.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-Software Heritage - Python module template
-==========================================
-
-Python module template, used as skeleton to create new modules.
diff --git a/docs/benchmarks.rst b/docs/benchmarks.rst
new file mode 100644
--- /dev/null
+++ b/docs/benchmarks.rst
@@ -0,0 +1,93 @@
+Benchmarks
+==========
+
+The tests (tox) run the benchmark code with very small files. The benchmark is about running the same
+code with Read Shards that have the expected size in production (100GB) and verify:
+
+* The time required to build a Read Shard is less than 5 times the
+ time to copy a file of the same size. It guards against regressions
+ that would have a significant performance impact.
+
+* The time required to build and write the perfect hash table is
+ always a fraction of the time required to copy the content of the
+ objects.
+
+* It is possible to perform at least 10,000 object lookups per second.
+
+
+Build performances
+------------------
+
+In both cases the distribution of the object sizes is uniform.
+
+With a small number of large (<100MB) objects
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The goal is to verify that manipulating up to the largest supported
+object size actually works and does not trigger problems in the
+extreme case that all objects have the maximum size.
+
+* time tox -e py3 -- --basetemp=/mnt/pytest -s --shard-path /mnt/payload --shard-size $((100 * 1024)) --object-max-size $((100 * 1024 * 1024)) -k test_build_speed
+ number of objects = 2057, total size = 107374116576
+ baseline 165.85, write_duration 327.19, build_duration 0.0062, total_duration 327.19
+
+With a large number of small (<4KB) objects
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+It approximates the maximum number of objects a Read Shard can
+contain. The goal is to verify that creating a perfect hash with this
+many items does not require more than a fraction of the time required
+to copy the objects.
+
+* time tox -e py3 -- --basetemp=/mnt/pytest -s --shard-path /mnt/payload --shard-size $((100 * 1024)) --object-max-size $((4 * 1024)) -k test_build_speed
+ number of objects = 45973694, total size = 105903024192
+ baseline 165.74, write_duration 495.07, build_duration 24.21, total_duration 519.28
+
+
+Object lookup performances
+--------------------------
+
+The machine has 200GB of RAM and can therefore approximately cache the
+content of two Read Shards which can have a significant impact on
+performances. To minimize that effect, four Read Shards are created
+totaling 400GB. All objects are looked up in all shards to verify
+the lookup speed is greater than 10,000 objects per second.
+
+* time tox -e py3 -- --basetemp=/mnt/pytest -s -k test_lookup_speed --lookups $((100 * 1024 * 1024)) --shard-size $((100 * 1024)) --object-max-size $((4 * 1024)) swh/perfecthash/tests/test_hash.py --shard-path /mnt/payload --shard-count 4
+ number of objects = 45972629, total size = 105903058272
+ key lookups speed = 148528.56/s
+
+
+Setup via Fed4Fire
+------------------
+
+* https://www.fed4fire.eu/
+* /opt/jFed/jFed-Experimenter
+* Create an experiment with one machine
+* Click on Topology Viewer and run the experiment (name test)
+* Once the provisioning is complete (Testing connectivity to resources on Grid5000) click Export As
+* Choose Export Configuration Management Settings
+* Save under test.zip and unzip
+* ssh -i id_rsa -F ssh-config node0
+* alias sudo=sudo-g5k
+
+Setup via Grid5000
+------------------
+
+* https://www.grid5000.fr/
+* oarsub -I -l "{cluster='dahu'}/host=1,walltime=1" -t deploy
+* kadeploy3 -f $OAR_NODE_FILE -e debian11-x64-base -k
+* ssh root@$(tail -1 $OAR_NODE_FILE)
+
+Common setup
+------------
+
+* mkfs.ext4 /dev/sdc
+* mount /dev/sdc /mnt
+* apt-get install -y python3-venv python3-dev libcmph-dev gcc git emacs-nox
+* git clone -b wip-benchmark https://git.easter-eggs.org/biceps/swh-perfecthash/
+* python3 -m venv bench
+* source bench/bin/activate
+* cd swh-perfecthash
+* pip install -r requirements.txt -r requirements-test.txt tox wheel
+* tox -e py3
diff --git a/docs/index.rst b/docs/index.rst
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,15 +1,17 @@
-.. _swh-py-template:
+.. _swh-perfecthash:
-.. include:: README.rst
+Software Heritage - Object storage perfect hash
+===============================================
+
+Low level management for read-only content-addressable object storage
+indexed with a perfect hash table.
-.. toctree::
- :maxdepth: 2
- :caption: Contents:
+Reference Documentation
+-----------------------
-Indices and tables
-------------------
+.. toctree::
+ :maxdepth: 2
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
+ /apidoc/swh.perfecthash
+ benchmarks
diff --git a/mypy.ini b/mypy.ini
--- a/mypy.ini
+++ b/mypy.ini
@@ -11,5 +11,8 @@
[mypy-pytest.*]
ignore_missing_imports = True
-# [mypy-add_your_lib_here.*]
-# ignore_missing_imports = True
+[mypy-_hash_cffi.*]
+ignore_missing_imports = True
+
+[mypy-cffi.*]
+ignore_missing_imports = True
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,4 @@
# should match https://pypi.python.org/pypi names. For the full spec or
# dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html
+cffi
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -48,9 +48,10 @@
packages=find_packages(), # packages's modules
install_requires=parse_requirements(None, "swh"),
tests_require=parse_requirements("test"),
- setup_requires=["setuptools-scm"],
+ setup_requires=["setuptools-scm", "cffi"],
use_scm_version=True,
extras_require={"testing": parse_requirements("test")},
+ cffi_modules=["swh/perfecthash/build.py:ffibuilder"],
classifiers=[
"Programming Language :: Python :: 3",
"Intended Audience :: Developers",
diff --git a/swh/perfecthash/.clang-format b/swh/perfecthash/.clang-format
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/.clang-format
@@ -0,0 +1,2 @@
+---
+IndentWidth: 4
diff --git a/swh/perfecthash/Makefile b/swh/perfecthash/Makefile
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/Makefile
@@ -0,0 +1,19 @@
+CFLAGS=-DHASH_DEBUG -Wall -I../.. -g -std=c++17 -fprofile-arcs -ftest-coverage
+LDFLAGS=-lcmph -lgtest -lpthread -lstdc++ -lstdc++fs -fprofile-arcs -ftest-coverage
+
+test_hash: hash.o test_hash.o
+ $(CXX) -o $@ $^ $(LDFLAGS)
+
+hash.c: hash.h
+test_hash.cpp: hash.h
+
+format:
+ clang-format -i hash.c hash.h test_hash.cpp
+
+check: test_hash
+ valgrind --leak-check=full --tool=memcheck ./test_hash
+ lcov -d . -c -o test_hash.lcov
+ rm -fr html ; genhtml -o html test_hash.lcov
+
+clean:
+ rm -f *.o test_hash
diff --git a/swh/perfecthash/__init__.py b/swh/perfecthash/__init__.py
--- a/swh/perfecthash/__init__.py
+++ b/swh/perfecthash/__init__.py
@@ -0,0 +1,105 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from typing import NewType
+
+from _hash_cffi import lib
+from cffi import FFI
+
+Key = NewType("Key", bytes)
+HashObject = NewType("HashObject", bytes)
+
+
+class Shard:
+ """Low level management for files indexed with a perfect hash table.
+
+ This class allows creating a Read Shard by adding key/object pairs
+ and looking up the content of an object when given the key.
+ """
+
+ def __init__(self, path: str):
+ """Initialize with an existing Read Shard.
+
+ Args:
+ path: path to an existing Read Shard file or device
+
+ """
+ self.ffi = FFI()
+ self.shard = lib.shard_init(path.encode("utf-8"))
+
+ def __del__(self):
+ lib.shard_destroy(self.shard)
+
+ def create(self, objects_count: int) -> "Shard":
+ """Wipe out the content of the Read Shard. It must be followed by
+ **object_count** calls to the **write** method otherwise the content
+ of the Read Shard will be inconsistent. When all objects are inserted,
+ the Read Shard must be made persistent by calling the **save** method.
+
+ Args:
+ objects_count: number of objects in the Read Shard.
+
+ Returns:
+ self.
+
+ """
+ assert lib.shard_create(self.shard, objects_count) != -1
+ return self
+
+ def load(self):
+ """Open the Read Shard file in read-only mode.
+
+ Returns:
+ self.
+ """
+ assert lib.shard_load(self.shard) != -1
+ return self
+
+ def save(self):
+ """Create the perfect hash table the **lookup** method
+ relies on to find the content of the objects.
+
+        It must be called after **create** and **write** otherwise the
+ content of the Read Shard will be inconsistent.
+
+ Returns:
+ 0 on success, -1 on error.
+ """
+ return lib.shard_save(self.shard)
+
+ def lookup(self, key: Key) -> HashObject:
+ """Fetch the object matching the key in the Read Shard.
+
+ Fetching an object is O(1): one lookup in the index to obtain
+ the offset of the object in the Read Shard and one read to get
+ the payload.
+
+ Args:
+ key: the key associated with the object to retrieve.
+
+ Returns:
+ the object as bytes.
+ """
+ object_size_pointer = self.ffi.new("size_t*")
+ lib.shard_lookup_object_size(self.shard, key, object_size_pointer)
+ object_size = object_size_pointer[0]
+ object_pointer = self.ffi.new("char[]", object_size)
+ lib.shard_lookup_object(self.shard, object_pointer, object_size)
+ return self.ffi.buffer(object_pointer, object_size)
+
+ def write(self, key: Key, object: HashObject) -> int:
+ """Add the key/object pair to the Read Shard.
+
+ The **create** method must have been called prior to calling
+ the **write** method.
+
+ Args:
+ key: the unique key associated with the object.
+ object: the object
+
+ Returns:
+ 0 on success, -1 on error.
+ """
+ return lib.shard_object_write(self.shard, key, object, len(object))
diff --git a/swh/perfecthash/build.py b/swh/perfecthash/build.py
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/build.py
@@ -0,0 +1,42 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from cffi import FFI
+
+ffibuilder = FFI()
+
+# cdef() expects a single string declaring the C types, functions and
+# globals needed to use the shared object. It must be in valid C syntax.
+ffibuilder.cdef(
+ """
+typedef struct shard_t shard_t;
+
+shard_t* shard_init(const char* path);
+int shard_destroy(shard_t* shard);
+
+int shard_create(shard_t* shard, size_t objects_count);
+int shard_object_write(shard_t* shard, const char* key,
+ const char* object, size_t object_size);
+int shard_save(shard_t* shard);
+
+int shard_load(shard_t* shard);
+int shard_lookup_object_size(shard_t *shard, const char *key, size_t *object_size);
+int shard_lookup_object(shard_t *shard, char *object, size_t object_size);
+
+"""
+)
+
+ffibuilder.set_source(
+ "_hash_cffi",
+ """
+ #include "swh/perfecthash/hash.h"
+ """,
+ sources=["swh/perfecthash/hash.c"],
+ include_dirs=["."],
+ libraries=["cmph"],
+) # library name, for the linker
+
+if __name__ == "__main__":
+ ffibuilder.compile(verbose=True)
diff --git a/swh/perfecthash/hash.h b/swh/perfecthash/hash.h
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/hash.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2021 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+#include <cmph.h>
+#include <cmph_types.h>
+
+/*
+ * The Read Shard has the following structure:
+ *
+ * - bytes [0, objects_position[
+ * The header shard_header_t
+ * - bytes [objects_position, index_position[
+ * objects_count times the size of the object (size_t) followed by the
+ * object itself
+ * - bytes [index_position, hash_position[
+ * An array of size_t object positions in the range [objects_position,
+ * index_position[ The size of the array is provided by cmph_size after building
+ * the hash function
+ * - bytes [hash_position, ...[
+ * The hash function, as written by cmph_dump
+ */
+
+#define SHARD_OFFSET_HEADER 512
+#define SHARD_KEY_LEN 32
+
+typedef struct {
+ size_t objects_count;
+ size_t objects_position;
+ size_t objects_size;
+ size_t index_position;
+ size_t index_size;
+ size_t hash_position;
+} shard_header_t;
+
+typedef struct {
+ char key[SHARD_KEY_LEN];
+ size_t object_offset;
+} shard_index_t;
+
+typedef struct {
+ char *path;
+ FILE *f;
+ shard_header_t header;
+ cmph_t *hash;
+
+ // The following fields are only used when creating the Read Shard
+ cmph_io_adapter_t *source;
+ cmph_config_t *config;
+ shard_index_t *index;
+ size_t index_offset;
+} shard_t;
+
+shard_t *shard_init(const char *path);
+int shard_destroy(shard_t *shard);
+
+int shard_create(shard_t *shard, size_t objects_count);
+int shard_object_write(shard_t *shard, const char *key, const char *object,
+ size_t object_size);
+int shard_save(shard_t *shard);
+
+int shard_load(shard_t *shard);
+int shard_lookup_object_size(shard_t *shard, const char *key,
+ size_t *object_size);
+int shard_lookup_object(shard_t *shard, char *object, size_t object_size);
diff --git a/swh/perfecthash/hash.c b/swh/perfecthash/hash.c
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/hash.c
@@ -0,0 +1,380 @@
+/*
+ * Copyright (C) 2021 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <memory.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "swh/perfecthash/hash.h"
+
+#ifdef HASH_DEBUG
+#define debug(...) printf(__VA_ARGS__)
+#else
+#define debug(...)
+#endif
+
+size_t ntohq(size_t v) { return __builtin_bswap64(v); }
+
+size_t htonq(size_t v) { return __builtin_bswap64(v); }
+
+/***********************************************************
+ * wrappers around FILE functions that:
+ * - return -1 on error
+ * - print a meaningful message when an error occurs
+ *
+ */
+
+int shard_open(shard_t *shard, const char *mode) {
+ shard->f = fopen(shard->path, mode);
+ if (shard->f == NULL) {
+ printf("shard_open: open(%s, %s): %s\n", shard->path, mode,
+ strerror(errno));
+ return -1;
+ } else
+ return 0;
+}
+
+int shard_close(shard_t *shard) {
+ if (shard->f == NULL)
+ return 0;
+ int r = fclose(shard->f);
+ if (r < 0)
+ printf("shard_close: fclose(%p): %s\n", shard->f, strerror(errno));
+ return r;
+}
+
+int shard_seek(shard_t *shard, long offset, int whence) {
+ int r = fseek(shard->f, offset, whence);
+ if (r < 0)
+ printf("shard_seek: fseek(%p, %ld, %d): %s\n", shard->f, offset, whence,
+ strerror(errno));
+ return r;
+}
+
+size_t shard_tell(shard_t *shard) {
+ long r = ftell(shard->f);
+ if (r < 0)
+ printf("shard_tell: ftell(%p): %s\n", shard->f, strerror(errno));
+ return r;
+}
+
+int shard_read(shard_t *shard, void *ptr, size_t size) {
+ size_t read;
+ if ((read = fread(ptr, 1, size, shard->f)) != size) {
+ printf("shard_read: read %ld instead of %ld\n", read, size);
+ return -1;
+ }
+ return 0;
+}
+
+int shard_read_size_t(shard_t *shard, size_t *ptr) {
+ size_t n_size;
+ if (shard_read(shard, &n_size, sizeof(size_t)) < 0) {
+ printf("shard_read_size_t: shard_read\n");
+ return -1;
+ }
+ *ptr = ntohq(n_size);
+ return 0;
+}
+
+int shard_write(shard_t *shard, const void *ptr, size_t nmemb) {
+ size_t wrote;
+ if ((wrote = fwrite(ptr, 1, nmemb, shard->f)) != nmemb) {
+ printf("shard_write: wrote %ld instead of %ld\n", wrote, nmemb);
+ return -1;
+ }
+ return 0;
+}
+
+/***********************************************************
+ * load or save a shard_header_t
+ */
+
+int shard_header_print(shard_header_t *header) {
+#define PRINT(name) debug("shard_header_print: " #name " %ld\n", header->name)
+ PRINT(objects_count);
+ PRINT(objects_position);
+ PRINT(objects_size);
+ PRINT(index_position);
+ PRINT(index_size);
+ PRINT(hash_position);
+#undef PRINT
+ return 0;
+}
+
+int shard_header_load(shard_t *shard) {
+ if (shard_seek(shard, 0, SEEK_SET) < 0) {
+ printf("shard_header_load\n");
+ return -1;
+ }
+ shard_header_t header;
+ if (shard_read(shard, (void *)&header, sizeof(shard_header_t)) < 0) {
+ printf("shard_header_load\n");
+ return -1;
+ }
+#define LOAD(name) shard->header.name = ntohq(header.name)
+ LOAD(objects_count);
+ LOAD(objects_position);
+ LOAD(objects_size);
+ LOAD(index_position);
+ LOAD(index_size);
+ LOAD(hash_position);
+#undef LOAD
+ shard_header_print(&shard->header);
+ return 0;
+}
+
+int shard_header_save(shard_t *shard) {
+ if (shard_seek(shard, 0, SEEK_SET) < 0) {
+ printf("shard_header_save\n");
+ return -1;
+ }
+ shard_header_print(&shard->header);
+ shard_header_t header;
+#define SAVE(name) header.name = htonq(shard->header.name)
+ SAVE(objects_count);
+ SAVE(objects_position);
+ SAVE(objects_size);
+ SAVE(index_position);
+ SAVE(index_size);
+ SAVE(hash_position);
+#undef SAVE
+ if (shard_write(shard, (void *)&header, sizeof(shard_header_t)) < 0) {
+ printf("shard_header_save\n");
+ return -1;
+ }
+ return 0;
+}
+
+int shard_header_reset(shard_header_t *header) {
+ memset((void *)header, '\0', sizeof(shard_header_t));
+ header->objects_position = SHARD_OFFSET_HEADER;
+ return 0;
+}
+
+/***********************************************************
+ * Create the Read Shard
+ */
+
+int shard_object_write(shard_t *shard, const char *key, const char *object,
+ size_t object_size) {
+ // save key & index to later build the hash
+ shard_index_t *index = &shard->index[shard->index_offset];
+ memcpy((void *)index->key, key, SHARD_KEY_LEN);
+ index->object_offset = shard_tell(shard);
+ shard->index_offset++;
+ // write the object size and the object itself
+ size_t n_object_size = htonq(object_size);
+ if (shard_write(shard, (void *)&n_object_size, sizeof(size_t)) < 0) {
+ printf("shard_object_write: object_size\n");
+ return -1;
+ }
+ if (shard_write(shard, (void *)object, object_size) < 0) {
+ printf("shard_object_write: object\n");
+ return -1;
+ }
+ return 0;
+}
+
+static int io_read(void *data, char **key, cmph_uint32 *keylen) {
+ shard_t *shard = (shard_t *)data;
+ *key = shard->index[shard->index_offset].key;
+ *keylen = SHARD_KEY_LEN;
+ shard->index_offset++;
+ return shard->index_offset >= shard->header.objects_count ? -1 : 0;
+}
+
+static void io_dispose(void *data, char *key, cmph_uint32 keylen) {}
+
+static void io_rewind(void *data) {
+ shard_t *shard = (shard_t *)data;
+ shard->index_offset = 0;
+}
+
+static cmph_io_adapter_t *io_adapter(shard_t *shard) {
+ cmph_io_adapter_t *key_source =
+ (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t));
+ if (key_source == NULL)
+ return NULL;
+ key_source->data = (void *)shard;
+ key_source->nkeys = shard->header.objects_count;
+ key_source->read = io_read;
+ key_source->dispose = io_dispose;
+ key_source->rewind = io_rewind;
+ return key_source;
+}
+
+int shard_hash_create(shard_t *shard) {
+ shard->source = io_adapter(shard);
+ shard->config = cmph_config_new(shard->source);
+ cmph_config_set_algo(shard->config, CMPH_CHD_PH);
+ cmph_config_set_keys_per_bin(shard->config, 1);
+ cmph_config_set_b(shard->config, 4);
+ shard->hash = cmph_new(shard->config);
+ return 0;
+}
+
+int shard_index_save(shard_t *shard) {
+ shard->header.index_position =
+ shard->header.objects_position + shard->header.objects_size;
+ debug("shard_index_save: index_position %ld\n",
+ shard->header.index_position);
+ assert(shard->header.index_position == shard_tell(shard));
+ cmph_uint32 count = cmph_size(shard->hash);
+ debug("shard_index_save: count = %d\n", count);
+ shard->header.index_size = count * sizeof(size_t);
+ size_t *index = (size_t *)calloc(1, shard->header.index_size);
+ for (size_t i = 0; i < shard->index_offset; i++) {
+ cmph_uint32 h =
+ cmph_search(shard->hash, shard->index[i].key, SHARD_KEY_LEN);
+ debug("shard_index_save: i = %ld, h = %d, offset = %ld\n", i, h,
+ shard->index[i].object_offset);
+ assert(h < count);
+ index[h] = htonq(shard->index[i].object_offset);
+ }
+ size_t index_size = shard->header.index_size;
+ if (shard_write(shard, (void *)index, index_size) < 0) {
+ printf("shard_index_save\n");
+ return -1;
+ }
+ free(index);
+ return 0;
+}
+
+int shard_hash_save(shard_t *shard) {
+ shard->header.hash_position =
+ shard->header.index_position + shard->header.index_size;
+ debug("shard_hash_save: hash_position %ld\n", shard->header.hash_position);
+ cmph_dump(shard->hash, shard->f);
+ return 0;
+}
+
+int shard_save(shard_t *shard) {
+ shard->header.objects_size =
+ shard_tell(shard) - shard->header.objects_position;
+ return (shard_hash_create(shard) < 0 || shard_index_save(shard) < 0 ||
+ shard_hash_save(shard) < 0 || shard_header_save(shard) < 0)
+ ? -1
+ : 0;
+ return 0;
+}
+
+int shard_reset(shard_t *shard) {
+ if (shard_header_reset(&shard->header) < 0)
+ return -1;
+ return shard_seek(shard, SHARD_OFFSET_HEADER, SEEK_SET);
+}
+
+int shard_create(shard_t *shard, size_t objects_count) {
+ if (shard_open(shard, "w+") < 0)
+ return -1;
+ if (shard_reset(shard) < 0)
+ return -1;
+ shard->header.objects_count = objects_count;
+ shard->index =
+ (shard_index_t *)malloc(sizeof(shard_index_t) * objects_count);
+ return 0;
+}
+
+/**********************************************************
+ * Lookup objects from a Read Shard
+ */
+
+int shard_lookup_object_size(shard_t *shard, const char *key,
+ size_t *object_size) {
+ debug("shard_lookup_object_size\n");
+ cmph_uint32 h = cmph_search(shard->hash, key, SHARD_KEY_LEN);
+ debug("shard_lookup_object_size: h = %d\n", h);
+ size_t index_offset = shard->header.index_position + h * sizeof(size_t);
+ debug("shard_lookup_object_size: index_offset = %ld\n", index_offset);
+ if (shard_seek(shard, index_offset, SEEK_SET) < 0) {
+ printf("shard_lookup_object_size: index_offset\n");
+ return -1;
+ }
+ size_t object_offset;
+ if (shard_read_size_t(shard, &object_offset) < 0) {
+ printf("shard_lookup_object_size: object_offset\n");
+ return -1;
+ }
+ debug("shard_lookup_object_size: object_offset = %ld\n", object_offset);
+ if (shard_seek(shard, object_offset, SEEK_SET) < 0) {
+ printf("shard_lookup_object_size: object_offset\n");
+ return -1;
+ }
+ if (shard_read_size_t(shard, object_size) < 0) {
+ printf("shard_lookup_object_size: object_size\n");
+ return -1;
+ }
+ debug("shard_lookup_object_size: object_size = %ld\n", *object_size);
+ return 0;
+}
+
+int shard_lookup_object(shard_t *shard, char *object, size_t object_size) {
+ if (shard_read(shard, (void *)object, object_size) < 0) {
+ printf("shard_lookup_object: object\n");
+ return -1;
+ }
+ return 0;
+}
+
+int shard_hash_load(shard_t *shard) {
+ if (shard_seek(shard, shard->header.hash_position, SEEK_SET) < 0) {
+ printf("shard_hash_load\n");
+ return -1;
+ }
+ debug("shard_hash_load: hash_position %ld\n", shard->header.hash_position);
+ shard->hash = cmph_load(shard->f);
+ if (shard->hash == NULL) {
+ printf("shard_hash_load: cmph_load\n");
+ return -1;
+ }
+ return 0;
+}
+
+int shard_load(shard_t *shard) {
+ debug("shard_load\n");
+ if (shard_open(shard, "r") < 0)
+ return -1;
+ if (shard_header_load(shard) < 0)
+ return -1;
+ return shard_hash_load(shard);
+}
+
+/**********************************************************
+ * Initialize and destroy a Read Shard
+ */
+
+shard_t *shard_init(const char *path) {
+ debug("shard_init\n");
+ shard_t *shard = (shard_t *)malloc(sizeof(shard_t));
+ if (shard == NULL)
+ return NULL;
+ memset((void *)shard, '\0', sizeof(shard_t));
+ shard->path = strdup(path);
+ return shard;
+}
+
+int shard_destroy(shard_t *shard) {
+ if (shard->source)
+ free(shard->source);
+ if (shard->config)
+ cmph_config_destroy(shard->config);
+ if (shard->hash)
+ cmph_destroy(shard->hash);
+ if (shard->index)
+ free(shard->index);
+ free(shard->path);
+ int r = shard_close(shard);
+ free(shard);
+ return r;
+}
diff --git a/swh/perfecthash/test_hash.cpp b/swh/perfecthash/test_hash.cpp
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/test_hash.cpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (C) 2021 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+#include <experimental/filesystem>
+#include <fcntl.h>
+#include <gtest/gtest.h>
+#include <random>
+#include <sys/types.h>
+#include <unistd.h>
+
+extern "C" {
+#include "hash.h"
+}
+
+using namespace std::experimental;
+
+filesystem::path
+create_temporary_directory(unsigned long long max_tries = 1000) {
+ auto tmp_dir = filesystem::temp_directory_path();
+ unsigned long long i = 0;
+ std::random_device dev;
+ std::mt19937 prng(dev());
+ std::uniform_int_distribution<int> rand(0);
+ filesystem::path path;
+ while (true) {
+ std::stringstream ss;
+ ss << std::hex << rand(prng);
+ path = tmp_dir / ss.str();
+ // true if the directory was created.
+ if (filesystem::create_directory(path)) {
+ break;
+ }
+ if (i == max_tries) {
+ throw std::runtime_error("could not find non-existing directory");
+ }
+ i++;
+ }
+ return path;
+}
+
+std::string gen_random(const int len) {
+
+ std::string tmp_s;
+ static const char alphanum[] = "0123456789"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz";
+
+ tmp_s.reserve(len);
+ std::random_device dev;
+ std::mt19937 prng(dev());
+ std::uniform_int_distribution<int> rand(0, strlen(alphanum) - 1);
+
+ for (int i = 0; i < len; ++i)
+ tmp_s += alphanum[rand(prng)];
+
+ return tmp_s;
+}
+
+TEST(HashTest, One) {
+ auto tmpdir = create_temporary_directory();
+ filesystem::path tmpfile = tmpdir / std::string("shard");
+ ASSERT_GE(close(open(tmpfile.c_str(), O_CREAT, 0777)), 0);
+ ASSERT_GE(truncate(tmpfile.c_str(), 10 * 1024 * 1024), 0);
+
+ //
+ // Create a Read Shard and write a single object
+ //
+ shard_t *shard = shard_init(tmpfile.c_str());
+ ASSERT_NE(shard, nullptr);
+ ASSERT_GE(shard_create(shard, 1), 0);
+ const char *keyA = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
+ const char *objectA = "AAAAA";
+ size_t objectA_size = strlen(objectA);
+ ASSERT_GE(shard_object_write(shard, keyA, objectA, objectA_size), 0);
+ ASSERT_GE(shard_save(shard), 0);
+ size_t found_size = 0;
+ ASSERT_GE(shard_lookup_object_size(shard, keyA, &found_size), 0);
+ ASSERT_EQ(objectA_size, found_size);
+ char *found = (char *)malloc(found_size);
+ ASSERT_GE(shard_lookup_object(shard, found, found_size), 0);
+ ASSERT_EQ(memcmp((const void *)objectA, (const void *)found, found_size),
+ 0);
+ free(found);
+ ASSERT_GE(shard_destroy(shard), 0);
+
+ //
+ // Open the Read Shard created above and verify the object can be
+ // looked up.
+ //
+ shard = shard_init(tmpfile.c_str());
+ ASSERT_NE(shard, nullptr);
+ ASSERT_GE(shard_load(shard), 0);
+ found_size = 0;
+ ASSERT_GE(shard_lookup_object_size(shard, keyA, &found_size), 0);
+ ASSERT_EQ(objectA_size, found_size);
+ found = (char *)malloc(found_size);
+ ASSERT_GE(shard_lookup_object(shard, found, found_size), 0);
+ ASSERT_EQ(memcmp((const void *)objectA, (const void *)found, found_size),
+ 0);
+ free(found);
+ ASSERT_GE(shard_destroy(shard), 0);
+
+ filesystem::remove_all(tmpdir);
+}
+
+TEST(HashTest, Many) {
+ auto tmpdir = create_temporary_directory();
+ filesystem::path tmpfile = tmpdir / std::string("shard");
+ ASSERT_GE(close(open(tmpfile.c_str(), O_CREAT, 0777)), 0);
+ ASSERT_GE(truncate(tmpfile.c_str(), 10 * 1024 * 1024), 0);
+
+ //
+ // Populate a Read Shard with multiple objects (objects_count)
+ // The object content and their keys are from a random source
+ // A map is kept in memory in key2object for verification
+ //
+ std::map<std::string, std::string> key2object;
+
+ shard_t *shard = shard_init(tmpfile.c_str());
+ ASSERT_NE(shard, nullptr);
+ int objects_count = 10;
+ ASSERT_GE(shard_create(shard, objects_count), 0);
+ for (int i = 0; i < objects_count; i++) {
+ std::string key = gen_random(32);
+ std::string object = gen_random(50);
+ key2object[key] = object;
+ std::cout << key << std::endl;
+ ASSERT_GE(shard_object_write(shard, key.c_str(), object.c_str(),
+ object.length()),
+ 0);
+ }
+ ASSERT_GE(shard_save(shard), 0);
+ ASSERT_GE(shard_destroy(shard), 0);
+
+ //
+ // Open the Read Shard for lookups, lookup every key inserted
+ // and verify the matching object has the expected content.
+ //
+ shard = shard_init(tmpfile.c_str());
+ ASSERT_NE(shard, nullptr);
+ ASSERT_GE(shard_load(shard), 0);
+ for (std::pair<std::string, std::string> p : key2object) {
+ size_t found_size = 0;
+ ASSERT_GE(shard_lookup_object_size(shard, p.first.c_str(), &found_size),
+ 0);
+ ASSERT_EQ(p.second.length(), found_size);
+ char *found = (char *)malloc(found_size);
+ ASSERT_GE(shard_lookup_object(shard, found, found_size), 0);
+ ASSERT_EQ(memcmp((const void *)p.second.c_str(), (const void *)found,
+ found_size),
+ 0);
+ free(found);
+ }
+ ASSERT_GE(shard_destroy(shard), 0);
+
+ filesystem::remove_all(tmpdir);
+}
+
+int main(int argc, char **argv) {
+ testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/swh/perfecthash/tests/test_hash.py b/swh/perfecthash/tests/test_hash.py
new file mode 100644
--- /dev/null
+++ b/swh/perfecthash/tests/test_hash.py
@@ -0,0 +1,144 @@
+# Copyright (C) 2021 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import random
+import time
+
+import pytest
+
+from swh.perfecthash import Shard
+
+
+def test_all(tmpdir):
+ f = f"{tmpdir}/shard"
+ open(f, "w").close()
+ os.truncate(f, 10 * 1024 * 1024)
+
+ s = Shard(f).create(2)
+ keyA = b"A" * 32
+ objectA = b"AAAA"
+ s.write(keyA, objectA)
+ keyB = b"B" * 32
+ objectB = b"BBBB"
+ s.write(keyB, objectB)
+ s.save()
+ del s
+
+ s = Shard(f).load()
+ assert s.lookup(keyA) == objectA
+ assert s.lookup(keyB) == objectB
+ del s
+
+
+@pytest.fixture
+def payload(request):
+ size = request.config.getoption("--shard-size")
+ path = request.config.getoption("--shard-path")
+ if not os.path.exists(path) or os.path.getsize(path) != size * 1024 * 1024:
+ os.system(f"dd if=/dev/urandom of={path} count={size} bs=1024k")
+ return path
+
+
+#
+# PYTHONMALLOC=malloc valgrind --tool=memcheck .tox/py3/bin/pytest \
+# -k test_build_speed swh/perfecthash/tests/test_hash.py |& tee /tmp/v
+#
+def test_build_speed(request, tmpdir, payload):
+ start = time.time()
+ os.system(f"cp {payload} {tmpdir}/shard ; rm {tmpdir}/shard")
+ baseline = time.time() - start
+ write_duration, build_duration, _ = shard_build(request, tmpdir, payload)
+ duration = write_duration + build_duration
+ print(
+ f"baseline {baseline}, "
+ f"write_duration {write_duration}, "
+ f"build_duration {build_duration}, "
+ f"total_duration {duration}"
+ )
+ assert duration < baseline * 5
+
+
+def test_lookup_speed(request, tmpdir, payload):
+ _, _, objects = shard_build(request, tmpdir, payload)
+ for i in range(request.config.getoption("--shard-count")):
+ os.system(f"cp {tmpdir}/shard {tmpdir}/shard{i}")
+ start = time.time()
+ shard_lookups(request, tmpdir, objects)
+ duration = time.time() - start
+
+ lookups = request.config.getoption("--lookups")
+ key_per_sec = lookups / duration
+ print(f"key lookups speed = {key_per_sec:.2f}/s")
+
+
+def shard_lookups(request, tmpdir, objects):
+ shard_path = f"{tmpdir}/shard"
+ shards = []
+ for i in range(request.config.getoption("--shard-count")):
+ shards.append(Shard(f"{shard_path}{i}").load())
+ lookups = request.config.getoption("--lookups")
+ count = 0
+ while True:
+ for key, object_size in objects.items():
+ if count >= lookups:
+ return
+ for shard in shards:
+ object = shard.lookup(key)
+ assert len(object) == object_size
+ count += 1
+
+
+def shard_build(request, tmpdir, payload):
+ shard_size = request.config.getoption("--shard-size") * 1024 * 1024
+ shard_path = f"{tmpdir}/shard"
+ open(shard_path, "w").close()
+ os.truncate(shard_path, shard_size * 2)
+
+ object_max_size = request.config.getoption("--object-max-size")
+ objects = {}
+ count = 0
+ size = 0
+ with open(payload, "rb") as f:
+ while True:
+ key = f.read(32)
+ if len(key) < 32:
+ break
+ assert key not in objects
+ object = f.read(random.randrange(512, object_max_size))
+ if len(object) < 512:
+ break
+ objects[key] = len(object)
+ size += len(object)
+ count += 1
+
+ print(f"number of objects = {count}, total size = {size}")
+ assert size < shard_size
+ start = time.time()
+ shard = Shard(shard_path).create(len(objects))
+
+ count = 0
+ size = 0
+ with open(payload, "rb") as f:
+ while True:
+ key = f.read(32)
+ if len(key) < 32:
+ break
+ if key not in objects:
+ break
+ object = f.read(objects[key])
+ assert len(object) == objects[key]
+ count += 1
+ size += len(object)
+ assert shard.write(key, object) >= 0, (
+ f"object count {count}/{len(objects)}, "
+ f"size {size}, "
+ f"object size {len(object)}"
+ )
+ write_duration = time.time() - start
+ start = time.time()
+ shard.save()
+ build_duration = time.time() - start
+ return write_duration, build_duration, objects
diff --git a/swh/perfecthash/tests/test_nothing.py b/swh/perfecthash/tests/test_nothing.py
deleted file mode 100644
--- a/swh/perfecthash/tests/test_nothing.py
+++ /dev/null
@@ -1,3 +0,0 @@
-def test_nothing():
- # Placeholder; remove this when we add actual tests
- pass
diff --git a/tox.ini b/tox.ini
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
[tox]
-envlist=black,flake8,mypy,py3
+envlist=black,flake8,mypy,py3,c
[testenv]
extras =
@@ -19,6 +19,12 @@
commands =
{envpython} -m black --check swh
+[testenv:c]
+whitelist_externals = make
+usedevelop = true
+commands =
+ make -C swh/perfecthash check
+
[testenv:flake8]
skip_install = true
deps =

File Metadata

Mime Type
text/plain
Expires
Wed, Dec 18, 7:25 PM (1 h, 57 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3224799

Event Timeline