diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f44dbd8..573f9ef 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,53 +1,58 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v2.4.0 hooks: - id: trailing-whitespace - id: check-json - id: check-yaml # we need the master of pyflakes to have support for @overload on methods, # so we use a local config for now # - repo: https://gitlab.com/pycqa/flake8 # hooks: # - id: flake8 - repo: local hooks: - id: flake8 name: flake8 entry: flake8 pass_filenames: true language: system types: [python] - repo: https://github.com/codespell-project/codespell rev: v1.16.0 hooks: - id: codespell - repo: local hooks: - id: mypy name: mypy entry: mypy args: [swh] pass_filenames: false language: system types: [python] +- repo: https://github.com/PyCQA/isort + rev: 5.5.2 + hooks: + - id: isort + - repo: https://github.com/python/black rev: 19.10b0 hooks: - id: black # unfortunately, we are far from being able to enable this... # - repo: https://github.com/PyCQA/pydocstyle.git # rev: 4.0.0 # hooks: # - id: pydocstyle # name: pydocstyle # description: pydocstyle is a static analysis tool for checking compliance with Python docstring conventions. # entry: pydocstyle --convention=google # language: python # types: [python] diff --git a/PKG-INFO b/PKG-INFO index 00b1c54..2a84d35 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,72 +1,72 @@ Metadata-Version: 2.1 Name: swh.journal -Version: 0.4.2 +Version: 0.4.3 Summary: Software Heritage Journal utilities Home-page: https://forge.softwareheritage.org/diffusion/DJNL/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-journal Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-journal/ Description: swh-journal =========== Persistent logger of changes to the archive, with publish-subscribe support. See the [documentation](https://docs.softwareheritage.org/devel/swh-journal/index.html#software-heritage-journal) for more details. # Local test As a pre-requisite, you need a kakfa installation path. 
The following target will take care of this: ``` make install ``` Then, provided you are in the right virtual environment as described in the [swh getting-started](https://docs.softwareheritage.org/devel/developer-setup.html#developer-setup): ``` pytest ``` or: ``` tox ``` # Running ## publisher Command: ``` $ swh-journal --config-file ~/.config/swh/journal/publisher.yml \ publisher ``` # Auto-completion To have the completion, add the following in your ~/.virtualenvs/swh/bin/postactivate: ``` eval "$(_SWH_JOURNAL_COMPLETE=$autocomplete_cmd swh-journal)" ``` Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/debian/changelog b/debian/changelog index 4245ba5..dbfd14d 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,424 +1,426 @@ -swh-journal (0.4.2-1~swh2~bpo10+1) buster-swh; urgency=medium +swh-journal (0.4.3-1~swh1) unstable-swh; urgency=medium - * Rebuild for buster-swh + * New upstream release 0.4.3 - (tagged by David Douard + on 2020-09-25 11:51:00 +0200) + * Upstream changes: - v0.4.3 - -- Software Heritage autobuilder (on jenkins-debian1) Fri, 07 Aug 2020 09:43:06 +0000 + -- Software Heritage autobuilder (on jenkins-debian1) Fri, 25 Sep 2020 09:53:37 +0000 swh-journal (0.4.2-1~swh2) unstable-swh; urgency=medium * Fix debian dependencies -- Antoine R. Dumont (@ardumont)) Fri, 07 Aug 2020 09:38:25 +0000 swh-journal (0.4.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.4.2 - (tagged by Antoine R. Dumont (@ardumont) on 2020-08-07 10:58:45 +0200) * Upstream changes: - v0.4.2 - pytest_plugin: Deal with the case when ctime is dropped - setup.py: Migrate from vcversioner to setuptools-scm -- Software Heritage autobuilder (on jenkins-debian1) Fri, 07 Aug 2020 09:01:25 +0000 swh-journal (0.4.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.4.1 - (tagged by Valentin Lorentz on 2020-07-31 11:25:19 +0200) * Upstream changes: - v0.4.1 - * Remove TEST_OBJECT_DICTS, use only TEST_OBJECTS. - * Add support for MetadataAuthority, MetadataFetcher, and RawExtrinsicMetadata. -- Software Heritage autobuilder (on jenkins-debian1) Fri, 31 Jul 2020 09:31:32 +0000 swh-journal (0.4.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.4.0 - (tagged by David Douard on 2020-07-06 13:49:40 +0200) * Upstream changes: - v0.4.0 -- Software Heritage autobuilder (on jenkins-debian1) Mon, 06 Jul 2020 11:58:06 +0000 swh-journal (0.3.5-1~swh1) unstable-swh; urgency=medium * New upstream release 0.3.5 - (tagged by Antoine R. Dumont (@ardumont) on 2020-07-01 15:55:31 +0200) * Upstream changes: - v0.3.5 - journal_data: Drop obsolete origin_visit fields - Use proper hash ids in tests' journal_data -- Software Heritage autobuilder (on jenkins-debian1) Wed, 01 Jul 2020 14:00:27 +0000 swh-journal (0.3.4-1~swh1) unstable-swh; urgency=medium * New upstream release 0.3.4 - (tagged by Antoine R. Dumont (@ardumont) on 2020-06-25 10:10:16 +0200) * Upstream changes: - v0.3.4 - Drop datetime conversion indirection -- Software Heritage autobuilder (on jenkins-debian1) Thu, 25 Jun 2020 08:13:02 +0000 swh-journal (0.3.3-1~swh1) unstable-swh; urgency=medium * New upstream release 0.3.3 - (tagged by Antoine R. 
Dumont (@ardumont) on 2020-06-25 09:34:32 +0200) * Upstream changes: - v0.3.3 - journal_data: Make origin-visit optional fields to None -- Software Heritage autobuilder (on jenkins-debian1) Thu, 25 Jun 2020 07:36:28 +0000 swh-journal (0.3.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.3.2 - (tagged by David Douard on 2020-06-17 09:29:44 +0200) * Upstream changes: - v0.3.2 -- Software Heritage autobuilder (on jenkins-debian1) Wed, 17 Jun 2020 07:37:10 +0000 swh-journal (0.3.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.3.1 - (tagged by Antoine R. Dumont (@ardumont) on 2020-06-10 10:54:44 +0200) * Upstream changes: - v0.3.1 - pytest_plugin: pprint key when assertion is not respected -- Software Heritage autobuilder (on jenkins-debian1) Wed, 10 Jun 2020 08:56:57 +0000 swh-journal (0.3.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.3.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-06-09 10:42:30 +0200) * Upstream changes: - v0.3.0 - Allow journal to deal with origin_visit_status - test: Use origin-visit date field as datetime to phase out iso8601 str -- Software Heritage autobuilder (on jenkins-debian1) Tue, 09 Jun 2020 08:44:44 +0000 swh-journal (0.2.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.2.0 - (tagged by David Douard on 2020-06-03 13:50:54 +0200) * Upstream changes: - v0.2.0 -- Software Heritage autobuilder (on jenkins-debian1) Wed, 03 Jun 2020 12:00:31 +0000 swh-journal (0.1.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.1.0 - (tagged by David Douard on 2020-05-07 10:33:18 +0200) * Upstream changes: - v0.1.0 -- Software Heritage autobuilder (on jenkins-debian1) Thu, 07 May 2020 08:35:49 +0000 swh-journal (0.0.32-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.32 - (tagged by Antoine R. Dumont (@ardumont) on 2020-05-04 18:02:45 +0200) * Upstream changes: - v0.0.32 - serializers: Make kafka_to_key implem compatible with stable version - setup.py: add documentation link - Remove the content replayer code - Remove the backfiller and the (storage) replayer -- Software Heritage autobuilder (on jenkins-debian1) Mon, 04 May 2020 16:04:38 +0000 swh-journal (0.0.31-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.31 - (tagged by David Douard on 2020-04-23 12:27:47 +0200) * Upstream changes: - v0.0.31 -- Software Heritage autobuilder (on jenkins-debian1) Thu, 23 Apr 2020 10:37:28 +0000 swh-journal (0.0.30-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.30 - (tagged by Nicolas Dandrimont on 2020-04-14 15:57:34 +0200) * Upstream changes: - Release swh.journal v0.0.30 - various test refactorings - enable black on the source code - accept swh.model objects in the journal writers - listen to delivery notifications after writing messages - default to message.max.bytes = 100 MB in the kafka writer -- Software Heritage autobuilder (on jenkins-debian1) Tue, 14 Apr 2020 14:10:54 +0000 swh-journal (0.0.29-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.29 - (tagged by Antoine R. Dumont (@ardumont) on 2020-03-27 10:40:46 +0100) * Upstream changes: - v0.0.29 - replayer: Allow legacy origin to be replayed - tests.utils: Test from modules using this need to call close method - tests: Adapt model according to latest change -- Software Heritage autobuilder (on jenkins-debian1) Fri, 27 Mar 2020 09:51:08 +0000 swh-journal (0.0.28-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.28 - (tagged by Antoine R. 
Dumont (@ardumont) on 2020-03-25 10:14:47 +0100) * Upstream changes: - v0.0.28 - journal.replay: Migrate to latest HashCollision change - replayer: factor out legacy objects fixers - journal.replay: Inline `_fix_origin_visit` for loop in insert_object - journal.replay: Align _fix_content with other fix methods - journal.replay: Align fix revision behavior to other fix methods - Remove extra 'perms' key of contents when replaying. -- Software Heritage autobuilder (on jenkins-debian1) Wed, 25 Mar 2020 09:23:55 +0000 swh-journal (0.0.27-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.27 - (tagged by Antoine R. Dumont (@ardumont) on 2020-03-16 16:09:33 +0100) * Upstream changes: - v0.0.27 - Migrate to latest origin_visit_upsert/add api changes - journal: Use swh-model objects instead of dicts in replay and writer - replay: Filter out colliding contents when replaying - Use better kafka producer semantics in the journal writers - Make the number of messages processed at a time by journal clients - configurable - Drop deprecated cli options - Replace deprecated options with a config file override in cli tests - Clean up the signature of test_cli's invoke method - Migrate test cli config to a dict instead of raw yaml - kafka: normalize KafkaJournalWriter.write_addition[s] API - Rename JournalClient.max_messages to JournalClient.stop_after_objects - Be more careful with content generation in test_write_replay - Add type annotations to swh.journal.client arguments - Unify tense for content replay statistics log entry - Add missing log4j.properties file from MANIFEST.in - Unify retry/error handling for content replay -- Software Heritage autobuilder (on jenkins-debian1) Mon, 16 Mar 2020 15:17:43 +0000 swh-journal (0.0.26-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.26 - (tagged by David Douard on 2020-03-06 15:36:23 +0100) * Upstream changes: - v0.0.26 -- Software Heritage autobuilder (on jenkins-debian1) Fri, 06 Mar 2020 14:47:18 +0000 swh-journal (0.0.25-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.25 - (tagged by Valentin Lorentz on 2020-01-30 15:35:41 +0100) * Upstream changes: - v0.0.25 - * Add support for swh-storage v0.0.168. - * Accept None dates when validating revisions. 
-- Software Heritage autobuilder (on jenkins-debian1) Thu, 30 Jan 2020 14:43:25 +0000 swh-journal (0.0.24-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.24 - (tagged by Antoine Lambert on 2020-01-06 16:14:32 +0100) * Upstream changes: - version 0.0.24 -- Software Heritage autobuilder (on jenkins-debian1) Mon, 06 Jan 2020 15:20:54 +0000 swh-journal (0.0.23-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.23 - (tagged by Nicolas Dandrimont on 2020-01-03 20:02:39 +0100) * Upstream changes: - Release swh.journal v0.0.23 - support short-hand syntax for initializing the journal direct writer -- Software Heritage autobuilder (on jenkins-debian1) Fri, 03 Jan 2020 19:06:31 +0000 swh-journal (0.0.21-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.21 - (tagged by David Douard on 2019-11-29 15:37:49 +0100) * Upstream changes: - v0.0.21 -- Software Heritage autobuilder (on jenkins-debian1) Mon, 02 Dec 2019 14:12:49 +0000 swh-journal (0.0.20-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.20 - (tagged by David Douard on 2019-11-29 12:06:13 +0100) * Upstream changes: - v0.0.20 -- Software Heritage autobuilder (on jenkins-debian1) Fri, 29 Nov 2019 11:10:44 +0000 swh-journal (0.0.19-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.19 - (tagged by Nicolas Dandrimont on 2019-11-07 14:33:58 +0100) * Upstream changes: - Release swh.journal v0.0.19 - Merge several reliability fixes -- Software Heritage autobuilder (on jenkins-debian1) Thu, 07 Nov 2019 13:37:23 +0000 swh-journal (0.0.18-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.18 - (tagged by Stefano Zacchiroli on 2019-10-01 10:19:26 +0200) * Upstream changes: - v0.0.18 - * tox: anticipate mypy run to just after flake8 - * init.py: switch to documented way of extending path - * typing: minimal changes to make a no-op mypy run pass - * writer: Normalize 'cls' value to 'memory' for in- memory instantiation - * Add a test directly for the journal client. 
-- Software Heritage autobuilder (on jenkins-debian1) Tue, 01 Oct 2019 11:12:03 +0000 swh-journal (0.0.17-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.17 - (tagged by Nicolas Dandrimont on 2019-09-18 18:01:50 +0200) * Upstream changes: - Release swh.journal v0.0.17 - Cleanup fallout from confluent_kafka migration - Better error handling and logging in direct_writer - Backfiller fixups - More extensive mock for KafkaConsumer -- Software Heritage autobuilder (on jenkins-debian1) Wed, 18 Sep 2019 16:08:20 +0000 swh-journal (0.0.16-1~swh2) unstable-swh; urgency=medium * Migrate to confluent-kafka -- Nicolas Dandrimont Fri, 13 Sep 2019 20:11:24 +0200 swh-journal (0.0.16-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.16 - (tagged by Nicolas Dandrimont on 2019-09-13 14:28:50 +0200) * Upstream changes: - Release swh.journal v0.0.16: - Migrate to confluent-kafka from python-kafka -- Software Heritage autobuilder (on jenkins-debian1) Fri, 13 Sep 2019 12:34:35 +0000 swh-journal (0.0.15-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.15 - (tagged by David Douard on 2019-09-10 16:50:42 +0200) * Upstream changes: - v0.0.15 -- Software Heritage autobuilder (on jenkins-debian1) Tue, 10 Sep 2019 14:53:55 +0000 swh-journal (0.0.14-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.14 - (tagged by David Douard on 2019-07-18 13:38:39 +0200) * Upstream changes: - 0.0.14 - Code of conduct - fix the backfiller - fix compatibility with click < 7 - make the replayer robust against old formats -- Software Heritage autobuilder (on jenkins-debian1) Thu, 18 Jul 2019 11:44:40 +0000 swh-journal (0.0.13-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.13 - (tagged by Antoine R. Dumont (@ardumont) on 2019-07-03 10:26:29 +0200) * Upstream changes: - v0.0.13 - cli: Document depreated options -- Software Heritage autobuilder (on jenkins-debian1) Wed, 03 Jul 2019 08:33:57 +0000 swh-journal (0.0.12-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.12 - (tagged by Valentin Lorentz on 2019-07-02 11:58:00 +0200) * Upstream changes: - v0.0.12 - More CLI option - Replay parallelism - Fix build on Debian 9 -- Software Heritage autobuilder (on jenkins-debian1) Tue, 02 Jul 2019 10:08:07 +0000 swh-journal (0.0.11-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.11 - (tagged by David Douard on 2019-06-12 13:58:14 +0200) * Upstream changes: - v0.0.11 -- Software Heritage autobuilder (on jenkins-debian1) Wed, 12 Jun 2019 12:10:52 +0000 swh-journal (0.0.10-1~swh2) unstable-swh; urgency=medium * Disable tests at build-time -- Nicolas Dandrimont Thu, 09 May 2019 14:42:24 +0200 swh-journal (0.0.10-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.10 - (tagged by Nicolas Dandrimont on 2019-05-09 14:29:52 +0200) * Upstream changes: - Release swh.journal v0.0.10 - Remove the publisher component, introduce the backfiller component. -- Software Heritage autobuilder (on jenkins-debian1) Thu, 09 May 2019 12:34:36 +0000 swh-journal (0.0.9-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.9 - (tagged by David Douard on 2019-04-10 13:42:32 +0200) * Upstream changes: - v0.0.9 -- Software Heritage autobuilder (on jenkins-debian1) Wed, 10 Apr 2019 11:48:39 +0000 swh-journal (0.0.8-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.8 - (tagged by Antoine R. 
Dumont (@ardumont) on 2019-03-15 13:56:57 +0100) * Upstream changes: - v0.0.8 - Add swh-journal cli -- Software Heritage autobuilder (on jenkins-debian1) Fri, 15 Mar 2019 13:00:11 +0000 swh-journal (0.0.7-1~swh2) unstable-swh; urgency=low * New release fixing build dependencies -- Antoine Romain Dumont (@ardumont) Tue, 19 Feb 2019 14:18:06 +0100 swh-journal (0.0.7-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.7 - (tagged by Antoine R. Dumont (@ardumont) on 2019-01-11 11:53:44 +0100) * Upstream changes: - v0.0.7 - Fix off-by-one error when checking max_messages. - tests: Adapt tests according to latest in-memory storage changes -- Software Heritage autobuilder (on jenkins-debian1) Fri, 11 Jan 2019 10:56:46 +0000 swh-journal (0.0.4-1~swh1) unstable-swh; urgency=medium * Release swh.journal version 0.0.4 * Update packaging runes -- Nicolas Dandrimont Thu, 12 Oct 2017 19:01:53 +0200 swh-journal (0.0.3-1~swh1) unstable-swh; urgency=medium * Release swh.journal v0.0.3 * Prepare building for stretch -- Nicolas Dandrimont Fri, 30 Jun 2017 17:29:15 +0200 swh-journal (0.0.2-1~swh1) unstable-swh; urgency=medium * v0.0.2 * Adapt swh.journal.publisher * Adapt swh.journal.client * Add swh.journal.checker basic implementation (reads and sends all * objects to publisher's subscribed queues). -- Antoine R. Dumont (@ardumont) Fri, 24 Mar 2017 12:54:16 +0100 swh-journal (0.0.1-1~swh1) unstable-swh; urgency=medium * Initial release * v0.0.1 * Add a journal publisher * Add a base class interface for journal clients -- Antoine R. Dumont (@ardumont) Tue, 21 Mar 2017 14:38:13 +0100 diff --git a/pyproject.toml b/pyproject.toml index b5413f6..69b8f4d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,2 +1,11 @@ [tool.black] target-version = ['py37'] + +[tool.isort] +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +use_parentheses = true +ensure_newline_before_comments = true +line_length = 88 +force_sort_within_sections = true diff --git a/setup.py b/setup.py index eb30b87..7408fd4 100755 --- a/setup.py +++ b/setup.py @@ -1,71 +1,71 @@ #!/usr/bin/env python3 # Copyright (C) 2015-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from setuptools import setup, find_packages - -from os import path from io import open +from os import path + +from setuptools import find_packages, setup here = path.abspath(path.dirname(__file__)) # Get the long description from the README file with open(path.join(here, "README.md"), encoding="utf-8") as f: long_description = f.read() def parse_requirements(name=None): if name: reqf = "requirements-%s.txt" % name else: reqf = "requirements.txt" requirements = [] if not path.exists(reqf): return requirements with open(reqf) as f: for line in f.readlines(): line = line.strip() if not line or line.startswith("#"): continue requirements.append(line) return requirements setup( name="swh.journal", description="Software Heritage Journal utilities", long_description=long_description, long_description_content_type="text/markdown", python_requires=">=3.7", author="Software Heritage developers", author_email="swh-devel@inria.fr", url="https://forge.softwareheritage.org/diffusion/DJNL/", packages=find_packages(), scripts=[], entry_points=""" [pytest11] pytest_swh_journal = swh.journal.pytest_plugin """, install_requires=parse_requirements() + parse_requirements("swh"), 
setup_requires=["setuptools-scm"], use_scm_version=True, extras_require={"testing": parse_requirements("test")}, include_package_data=True, classifiers=[ "Programming Language :: Python :: 3", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Operating System :: OS Independent", "Development Status :: 5 - Production/Stable", ], project_urls={ "Bug Reports": "https://forge.softwareheritage.org/maniphest", "Funding": "https://www.softwareheritage.org/donate", "Source": "https://forge.softwareheritage.org/source/swh-journal", "Documentation": "https://docs.softwareheritage.org/devel/swh-journal/", }, ) diff --git a/swh.journal.egg-info/PKG-INFO b/swh.journal.egg-info/PKG-INFO index 00b1c54..2a84d35 100644 --- a/swh.journal.egg-info/PKG-INFO +++ b/swh.journal.egg-info/PKG-INFO @@ -1,72 +1,72 @@ Metadata-Version: 2.1 Name: swh.journal -Version: 0.4.2 +Version: 0.4.3 Summary: Software Heritage Journal utilities Home-page: https://forge.softwareheritage.org/diffusion/DJNL/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-journal Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-journal/ Description: swh-journal =========== Persistent logger of changes to the archive, with publish-subscribe support. See the [documentation](https://docs.softwareheritage.org/devel/swh-journal/index.html#software-heritage-journal) for more details. # Local test As a pre-requisite, you need a kakfa installation path. The following target will take care of this: ``` make install ``` Then, provided you are in the right virtual environment as described in the [swh getting-started](https://docs.softwareheritage.org/devel/developer-setup.html#developer-setup): ``` pytest ``` or: ``` tox ``` # Running ## publisher Command: ``` $ swh-journal --config-file ~/.config/swh/journal/publisher.yml \ publisher ``` # Auto-completion To have the completion, add the following in your ~/.virtualenvs/swh/bin/postactivate: ``` eval "$(_SWH_JOURNAL_COMPLETE=$autocomplete_cmd swh-journal)" ``` Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing diff --git a/swh.journal.egg-info/SOURCES.txt b/swh.journal.egg-info/SOURCES.txt index 95e8232..0445a6b 100644 --- a/swh.journal.egg-info/SOURCES.txt +++ b/swh.journal.egg-info/SOURCES.txt @@ -1,50 +1,49 @@ .gitignore .pre-commit-config.yaml AUTHORS CODE_OF_CONDUCT.md LICENSE MANIFEST.in Makefile Makefile.local README.md mypy.ini pyproject.toml pytest.ini requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py tox.ini -version.txt docs/.gitignore docs/Makefile docs/conf.py docs/index.rst docs/_static/.placeholder docs/_templates/.placeholder swh/__init__.py swh.journal.egg-info/PKG-INFO swh.journal.egg-info/SOURCES.txt swh.journal.egg-info/dependency_links.txt swh.journal.egg-info/entry_points.txt swh.journal.egg-info/requires.txt swh.journal.egg-info/top_level.txt swh/journal/__init__.py swh/journal/cli.py swh/journal/client.py swh/journal/py.typed 
swh/journal/pytest_plugin.py swh/journal/serializers.py swh/journal/tests/__init__.py swh/journal/tests/conftest.py swh/journal/tests/journal_data.py swh/journal/tests/log4j.properties swh/journal/tests/test_client.py swh/journal/tests/test_journal_data.py swh/journal/tests/test_kafka_writer.py swh/journal/tests/test_pytest_plugin.py swh/journal/tests/test_serializers.py swh/journal/writer/__init__.py swh/journal/writer/inmemory.py swh/journal/writer/kafka.py \ No newline at end of file diff --git a/swh/journal/client.py b/swh/journal/client.py index 68b9a22..704a2fe 100644 --- a/swh/journal/client.py +++ b/swh/journal/client.py @@ -1,305 +1,306 @@ # Copyright (C) 2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict import logging import os import time from typing import Any, Dict, List, Optional, Set, Tuple, Union -from confluent_kafka import Consumer, KafkaException, KafkaError +from confluent_kafka import Consumer, KafkaError, KafkaException -from .serializers import kafka_to_value from swh.journal import DEFAULT_PREFIX +from .serializers import kafka_to_value + logger = logging.getLogger(__name__) rdkafka_logger = logging.getLogger(__name__ + ".rdkafka") # Only accepted offset reset policy accepted ACCEPTED_OFFSET_RESET = ["earliest", "latest"] # Errors that Kafka raises too often and are not useful; therefore they # we lower their log level to DEBUG instead of INFO. _SPAMMY_ERRORS = [ KafkaError._NO_OFFSET, ] def get_journal_client(cls: str, **kwargs: Any): """Factory function to instantiate a journal client object. Currently, only the "kafka" journal client is supported. """ if cls == "kafka": return JournalClient(**kwargs) raise ValueError("Unknown journal client class `%s`" % cls) def _error_cb(error): if error.fatal(): raise KafkaException(error) if error.code() in _SPAMMY_ERRORS: logger.debug("Received non-fatal kafka error: %s", error) else: logger.info("Received non-fatal kafka error: %s", error) def _on_commit(error, partitions): if error is not None: _error_cb(error) class JournalClient: """A base client for the Software Heritage journal. The current implementation of the journal uses Apache Kafka brokers to publish messages under a given topic prefix, with each object type using a specific topic under that prefix. If the `prefix` argument is None (default value), it will take the default value `'swh.journal.objects'`. Clients subscribe to events specific to each object type as listed in the `object_types` argument (if unset, defaults to all existing kafka topic under the prefix). Clients can be sharded by setting the `group_id` to a common value across instances. The journal will share the message throughput across the nodes sharing the same group_id. Messages are processed by the `worker_fn` callback passed to the `process` method, in batches of maximum `batch_size` messages (defaults to 200). If set, the processing stops after processing `stop_after_objects` messages in total. `stop_on_eof` stops the processing when the client has reached the end of each partition in turn. `auto_offset_reset` sets the behavior of the client when the consumer group initializes: `'earliest'` (the default) processes all objects since the inception of the topics; `''` Any other named argument is passed directly to KafkaConsumer(). 
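For reference, a minimal usage sketch of the client described by the docstring above (the broker address, group id, and object type below are illustrative, and the subscribed topics are assumed to already exist on the broker):

```python
from swh.journal.client import JournalClient

# Instantiate a client subscribed to a single object type; stop_after_objects
# bounds the total number of messages handled by process().
client = JournalClient(
    brokers=["localhost:9092"],    # illustrative broker address
    group_id="my-consumer-group",  # illustrative consumer group
    object_types=["revision"],
    stop_after_objects=100,
)

def worker_fn(objects):
    # objects maps each object type to the list of deserialized messages
    for object_type, values in objects.items():
        print(object_type, len(values))

client.process(worker_fn)
client.close()
```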
""" def __init__( self, brokers: Union[str, List[str]], group_id: str, prefix: Optional[str] = None, object_types: Optional[List[str]] = None, privileged: bool = False, stop_after_objects: Optional[int] = None, batch_size: int = 200, process_timeout: Optional[float] = None, auto_offset_reset: str = "earliest", stop_on_eof: bool = False, **kwargs, ): if prefix is None: prefix = DEFAULT_PREFIX if auto_offset_reset not in ACCEPTED_OFFSET_RESET: raise ValueError( "Option 'auto_offset_reset' only accept %s, not %s" % (ACCEPTED_OFFSET_RESET, auto_offset_reset) ) if batch_size <= 0: raise ValueError("Option 'batch_size' needs to be positive") self.value_deserializer = kafka_to_value if isinstance(brokers, str): brokers = [brokers] debug_logging = rdkafka_logger.isEnabledFor(logging.DEBUG) if debug_logging and "debug" not in kwargs: kwargs["debug"] = "consumer" # Static group instance id management group_instance_id = os.environ.get("KAFKA_GROUP_INSTANCE_ID") if group_instance_id: kwargs["group.instance.id"] = group_instance_id if "group.instance.id" in kwargs: # When doing static consumer group membership, set a higher default # session timeout. The session timeout is the duration after which # the broker considers that a consumer has left the consumer group # for good, and triggers a rebalance. Considering our current # processing pattern, 10 minutes gives the consumer ample time to # restart before that happens. if "session.timeout.ms" not in kwargs: kwargs["session.timeout.ms"] = 10 * 60 * 1000 # 10 minutes if "session.timeout.ms" in kwargs: # When the session timeout is set, rdkafka requires the max poll # interval to be set to a higher value; the max poll interval is # rdkafka's way of figuring out whether the client's message # processing thread has stalled: when the max poll interval lapses # between two calls to consumer.poll(), rdkafka leaves the consumer # group and terminates the connection to the brokers. 
# # We default to 1.5 times the session timeout if "max.poll.interval.ms" not in kwargs: kwargs["max.poll.interval.ms"] = kwargs["session.timeout.ms"] // 2 * 3 consumer_settings = { **kwargs, "bootstrap.servers": ",".join(brokers), "auto.offset.reset": auto_offset_reset, "group.id": group_id, "on_commit": _on_commit, "error_cb": _error_cb, "enable.auto.commit": False, "logger": rdkafka_logger, } self.stop_on_eof = stop_on_eof if self.stop_on_eof: consumer_settings["enable.partition.eof"] = True logger.debug("Consumer settings: %s", consumer_settings) self.consumer = Consumer(consumer_settings) if privileged: privileged_prefix = f"{prefix}_privileged" else: # do not attempt to subscribe to privileged topics privileged_prefix = f"{prefix}" existing_topics = [ topic for topic in self.consumer.list_topics(timeout=10).topics.keys() if ( topic.startswith(f"{prefix}.") or topic.startswith(f"{privileged_prefix}.") ) ] if not existing_topics: raise ValueError( f"The prefix {prefix} does not match any existing topic " "on the kafka broker" ) if not object_types: object_types = list({topic.split(".")[-1] for topic in existing_topics}) self.subscription = [] unknown_types = [] for object_type in object_types: topics = (f"{privileged_prefix}.{object_type}", f"{prefix}.{object_type}") for topic in topics: if topic in existing_topics: self.subscription.append(topic) break else: unknown_types.append(object_type) if unknown_types: raise ValueError( f"Topic(s) for object types {','.join(unknown_types)} " "are unknown on the kafka broker" ) logger.debug(f"Upstream topics: {existing_topics}") self.subscribe() self.stop_after_objects = stop_after_objects self.process_timeout = process_timeout self.eof_reached: Set[Tuple[str, str]] = set() self.batch_size = batch_size def subscribe(self): """Subscribe to topics listed in self.subscription This can be overridden if you need, for instance, to manually assign partitions. """ logger.debug(f"Subscribing to: {self.subscription}") self.consumer.subscribe(topics=self.subscription) def process(self, worker_fn): """Polls Kafka for a batch of messages, and calls the worker_fn with these messages. Args: worker_fn Callable[Dict[str, List[dict]]]: Function called with the messages as argument. """ start_time = time.monotonic() total_objects_processed = 0 while True: # timeout for message poll timeout = 1.0 elapsed = time.monotonic() - start_time if self.process_timeout: # +0.01 to prevent busy-waiting on / spamming consumer.poll. # consumer.consume() returns shortly before X expired # (a matter of milliseconds), so after it returns a first # time, it would then be called with a timeout in the order # of milliseconds, therefore returning immediately, then be # called again, etc. 
if elapsed + 0.01 >= self.process_timeout: break timeout = self.process_timeout - elapsed batch_size = self.batch_size if self.stop_after_objects: if total_objects_processed >= self.stop_after_objects: break # clamp batch size to avoid overrunning stop_after_objects batch_size = min( self.stop_after_objects - total_objects_processed, batch_size, ) messages = self.consumer.consume(timeout=timeout, num_messages=batch_size) if not messages: continue batch_processed, at_eof = self.handle_messages(messages, worker_fn) total_objects_processed += batch_processed if at_eof: break return total_objects_processed def handle_messages(self, messages, worker_fn): objects: Dict[str, List[Any]] = defaultdict(list) nb_processed = 0 for message in messages: error = message.error() if error is not None: if error.code() == KafkaError._PARTITION_EOF: self.eof_reached.add((message.topic(), message.partition())) else: _error_cb(error) continue if message.value() is None: # ignore message with no payload, these can be generated in tests continue nb_processed += 1 object_type = message.topic().split(".")[-1] objects[object_type].append(self.deserialize_message(message)) if objects: worker_fn(dict(objects)) self.consumer.commit() at_eof = self.stop_on_eof and all( (tp.topic, tp.partition) in self.eof_reached for tp in self.consumer.assignment() ) return nb_processed, at_eof def deserialize_message(self, message): return self.value_deserializer(message.value()) def close(self): self.consumer.close() diff --git a/swh/journal/pytest_plugin.py b/swh/journal/pytest_plugin.py index 778eeef..c1025e6 100644 --- a/swh/journal/pytest_plugin.py +++ b/swh/journal/pytest_plugin.py @@ -1,241 +1,239 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from collections import defaultdict import random import string - from typing import Collection, Dict, Iterator, Optional -from collections import defaultdict import attr -import pytest - from confluent_kafka import Consumer, KafkaException, Producer from confluent_kafka.admin import AdminClient +import pytest -from swh.journal.serializers import object_key, kafka_to_key, kafka_to_value, pprint_key +from swh.journal.serializers import kafka_to_key, kafka_to_value, object_key, pprint_key from swh.journal.tests.journal_data import TEST_OBJECTS def consume_messages(consumer, kafka_prefix, expected_messages): """Consume expected_messages from the consumer; Sort them all into a consumed_objects dict""" consumed_messages = defaultdict(list) fetched_messages = 0 retries_left = 1000 while fetched_messages < expected_messages: if retries_left == 0: raise ValueError( "Timed out fetching messages from kafka. " f"Only {fetched_messages}/{expected_messages} fetched" ) msg = consumer.poll(timeout=0.01) if not msg: retries_left -= 1 continue error = msg.error() if error is not None: if error.fatal(): raise KafkaException(error) retries_left -= 1 continue fetched_messages += 1 topic = msg.topic() assert topic.startswith(f"{kafka_prefix}.") or topic.startswith( f"{kafka_prefix}_privileged." 
), "Unexpected topic" object_type = topic[len(kafka_prefix + ".") :] consumed_messages[object_type].append( (kafka_to_key(msg.key()), kafka_to_value(msg.value())) ) return consumed_messages def assert_all_objects_consumed( consumed_messages: Dict, exclude: Optional[Collection] = None ): """Check whether all objects from TEST_OBJECTS have been consumed `exclude` can be a list of object types for which we do not want to compare the values (eg. for anonymized object). """ for object_type, known_objects in TEST_OBJECTS.items(): known_keys = [object_key(object_type, obj) for obj in known_objects] if not consumed_messages[object_type]: return (received_keys, received_values) = zip(*consumed_messages[object_type]) if object_type in ("content", "skipped_content"): for value in received_values: value.pop("ctime", None) if object_type == "content": known_objects = [attr.evolve(o, data=None) for o in known_objects] for key in known_keys: assert key in received_keys, ( f"expected {object_type} key {pprint_key(key)} " "absent from consumed messages" ) if exclude and object_type in exclude: continue for value in known_objects: expected_value = value.to_dict() if value.object_type in ("content", "skipped_content"): expected_value.pop("ctime", None) assert expected_value in received_values, ( f"expected {object_type} value {value!r} is " "absent from consumed messages" ) @pytest.fixture(scope="function") def kafka_prefix(): """Pick a random prefix for kafka topics on each call""" return "".join(random.choice(string.ascii_lowercase) for _ in range(10)) @pytest.fixture(scope="function") def kafka_consumer_group(kafka_prefix: str): """Pick a random consumer group for kafka consumers on each call""" return "test-consumer-%s" % kafka_prefix @pytest.fixture(scope="function") def object_types(): """Set of object types to precreate topics for.""" return set(TEST_OBJECTS.keys()) @pytest.fixture(scope="function") def privileged_object_types(): """Set of object types to precreate privileged topics for.""" return {"revision", "release"} @pytest.fixture(scope="function") def kafka_server( kafka_server_base: str, kafka_prefix: str, object_types: Iterator[str], privileged_object_types: Iterator[str], ) -> str: """A kafka server with existing topics Unprivileged topics are built as ``{kafka_prefix}.{object_type}`` with object_type from the ``object_types`` list. Privileged topics are built as ``{kafka_prefix}_privileged.{object_type}`` with object_type from the ``privileged_object_types`` list. """ topics = [f"{kafka_prefix}.{obj}" for obj in object_types] + [ f"{kafka_prefix}_privileged.{obj}" for obj in privileged_object_types ] # unfortunately, the Mock broker does not support the CreatTopic admin API, so we # have to create topics using a Producer. producer = Producer( { "bootstrap.servers": kafka_server_base, "client.id": "bootstrap producer", "acks": "all", } ) for topic in topics: producer.produce(topic=topic, value=None) for i in range(10): if producer.flush(0.1) == 0: break return kafka_server_base @pytest.fixture(scope="session") def kafka_server_base() -> Iterator[str]: """Create a mock kafka cluster suitable for tests. Yield a connection string. Note: this is a generator to keep the mock broker alive during the whole test session. 
see https://github.com/edenhill/librdkafka/blob/master/src/rdkafka_mock.h """ admin = AdminClient({"test.mock.num.brokers": "1"}) metadata = admin.list_topics() brokers = [str(broker) for broker in metadata.brokers.values()] assert len(brokers) == 1, "More than one broker found in the kafka cluster?!" broker_connstr, broker_id = brokers[0].split("/") yield broker_connstr TEST_CONFIG = { "consumer_id": "swh.journal.consumer", "stop_after_objects": 1, # will read 1 object and stop "storage": {"cls": "memory", "args": {}}, } @pytest.fixture def test_config( kafka_server_base: str, kafka_prefix: str, object_types: Iterator[str], privileged_object_types: Iterator[str], ): """Test configuration needed for producer/consumer """ return { **TEST_CONFIG, "object_types": object_types, "privileged_object_types": privileged_object_types, "brokers": [kafka_server_base], "prefix": kafka_prefix, } @pytest.fixture def consumer( kafka_server: str, test_config: Dict, kafka_consumer_group: str ) -> Consumer: """Get a connected Kafka consumer. """ consumer = Consumer( { "bootstrap.servers": kafka_server, "auto.offset.reset": "earliest", "enable.auto.commit": True, "group.id": kafka_consumer_group, } ) prefix = test_config["prefix"] kafka_topics = [ f"{prefix}.{object_type}" for object_type in test_config["object_types"] ] + [ f"{prefix}_privileged.{object_type}" for object_type in test_config["privileged_object_types"] ] consumer.subscribe(kafka_topics) yield consumer consumer.close() diff --git a/swh/journal/serializers.py b/swh/journal/serializers.py index 399a29c..3dca255 100644 --- a/swh/journal/serializers.py +++ b/swh/journal/serializers.py @@ -1,171 +1,172 @@ # Copyright (C) 2016-2017 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Any, Dict, Union, overload import msgpack from swh.core.api.serializers import msgpack_dumps, msgpack_loads from swh.model.hashutil import DEFAULT_ALGORITHMS from swh.model.model import ( Content, Directory, MetadataAuthority, MetadataFetcher, Origin, OriginVisit, OriginVisitStatus, RawExtrinsicMetadata, Release, Revision, SkippedContent, Snapshot, ) ModelObject = Union[ Content, Directory, MetadataAuthority, MetadataFetcher, Origin, OriginVisit, OriginVisitStatus, RawExtrinsicMetadata, Release, Revision, SkippedContent, Snapshot, ] KeyType = Union[Dict[str, str], Dict[str, bytes], bytes] # these @overload'ed versions of the object_key method aim at helping mypy figuring # the correct type-ing. @overload def object_key( object_type: str, object_: Union[Content, Directory, Revision, Release, Snapshot] ) -> bytes: ... @overload def object_key( object_type: str, object_: Union[Origin, SkippedContent] ) -> Dict[str, bytes]: ... @overload def object_key( object_type: str, object_: Union[ MetadataAuthority, MetadataFetcher, OriginVisit, OriginVisitStatus, RawExtrinsicMetadata, ], ) -> Dict[str, str]: ... 
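The @overload stubs above and the (de)serialization helpers defined just below can be exercised as in the following sketch, which assumes the test fixtures from swh.journal.tests.journal_data are importable; it also illustrates that the reworked kafka_to_value now restores nested tuples recursively:

```python
from swh.journal.serializers import (
    kafka_to_key,
    kafka_to_value,
    key_to_kafka,
    object_key,
    value_to_kafka,
)
from swh.journal.tests.journal_data import ORIGINS, REVISIONS

rev_key = object_key("revision", REVISIONS[0])  # bytes (the revision id)
org_key = object_key("origin", ORIGINS[0])      # dict ({"url": ...})

# keys round-trip through the msgpack-based serializers
assert kafka_to_key(key_to_kafka(org_key)) == org_key

# values round-trip as well; lists coming back from msgpack are turned into
# tuples recursively, so e.g. the "parents" field stays a tuple
decoded = kafka_to_value(value_to_kafka(REVISIONS[0].to_dict()))
assert isinstance(decoded["parents"], tuple)
```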
def object_key(object_type: str, object_) -> KeyType: if object_type in ("revision", "release", "directory", "snapshot"): return object_.id elif object_type == "content": return object_.sha1 # TODO: use a dict of hashes elif object_type == "skipped_content": return {hash: getattr(object_, hash) for hash in DEFAULT_ALGORITHMS} elif object_type == "origin": return {"url": object_.url} elif object_type == "origin_visit": return { "origin": object_.origin, "date": str(object_.date), } elif object_type == "origin_visit_status": return { "origin": object_.origin, "visit": str(object_.visit), "date": str(object_.date), } elif object_type == "metadata_authority": return { "type": object_.type.value, "url": object_.url, } elif object_type == "metadata_fetcher": return { "name": object_.name, "version": object_.version, } elif object_type == "raw_extrinsic_metadata": return { "type": object_.type.value, "id": str(object_.id), "authority_type": object_.authority.type.value, "authority_url": object_.authority.url, "discovery_date": str(object_.discovery_date), "fetcher_name": object_.fetcher.name, "fetcher_version": object_.fetcher.version, } else: raise ValueError("Unknown object type: %s." % object_type) def stringify_key_item(k: str, v: Union[str, bytes]) -> str: """Turn the item of a dict key into a string""" if isinstance(v, str): return v if k == "url": return v.decode("utf-8") return v.hex() def pprint_key(key: KeyType) -> str: """Pretty-print a kafka key""" if isinstance(key, dict): return "{%s}" % ", ".join( f"{k}: {stringify_key_item(k, v)}" for k, v in key.items() ) elif isinstance(key, bytes): return key.hex() else: return key def key_to_kafka(key: KeyType) -> bytes: """Serialize a key, possibly a dict, in a predictable way""" p = msgpack.Packer(use_bin_type=True) if isinstance(key, dict): return p.pack_map_pairs(sorted(key.items())) else: return p.pack(key) def kafka_to_key(kafka_key: bytes) -> KeyType: """Deserialize a key""" return msgpack.loads(kafka_key, raw=False) def value_to_kafka(value: Any) -> bytes: """Serialize some data for storage in kafka""" return msgpack_dumps(value) def kafka_to_value(kafka_value: bytes) -> Any: """Deserialize some data stored in kafka""" value = msgpack_loads(kafka_value) - if isinstance(value, list): - return tuple(value) - if isinstance(value, dict): - return ensure_tuples(value) - return value + return ensure_tuples(value) -def ensure_tuples(value: Dict) -> Dict: - return {k: tuple(v) if isinstance(v, list) else v for k, v in value.items()} +def ensure_tuples(value: Any) -> Any: + if isinstance(value, (tuple, list)): + return tuple(map(ensure_tuples, value)) + elif isinstance(value, dict): + return dict(ensure_tuples(list(value.items()))) + else: + return value diff --git a/swh/journal/tests/conftest.py b/swh/journal/tests/conftest.py index ec2a3cf..39670ed 100644 --- a/swh/journal/tests/conftest.py +++ b/swh/journal/tests/conftest.py @@ -1,28 +1,27 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging from hypothesis.strategies import one_of -from swh.model import hypothesis_strategies as strategies - # for bw compat from swh.journal.tests.journal_data import * # noqa +from swh.model import hypothesis_strategies as strategies logger = logging.getLogger(__name__) def objects_d(): return one_of( strategies.origins().map(lambda x: ("origin", 
x.to_dict())), strategies.origin_visits().map(lambda x: ("origin_visit", x.to_dict())), strategies.snapshots().map(lambda x: ("snapshot", x.to_dict())), strategies.releases().map(lambda x: ("release", x.to_dict())), strategies.revisions().map(lambda x: ("revision", x.to_dict())), strategies.directories().map(lambda x: ("directory", x.to_dict())), strategies.skipped_contents().map(lambda x: ("skipped_content", x.to_dict())), strategies.present_contents().map(lambda x: ("content", x.to_dict())), ) diff --git a/swh/journal/tests/journal_data.py b/swh/journal/tests/journal_data.py index 1b103bf..f891293 100644 --- a/swh/journal/tests/journal_data.py +++ b/swh/journal/tests/journal_data.py @@ -1,342 +1,341 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime - from typing import Dict, Sequence import attr -from swh.model.hashutil import MultiHash, hash_to_bytes, hash_to_hex from swh.journal.serializers import ModelObject - +from swh.model.hashutil import MultiHash, hash_to_bytes, hash_to_hex from swh.model.identifiers import SWHID from swh.model.model import ( Content, Directory, DirectoryEntry, MetadataAuthority, MetadataAuthorityType, MetadataFetcher, MetadataTargetType, ObjectType, Origin, OriginVisit, OriginVisitStatus, Person, RawExtrinsicMetadata, Release, Revision, RevisionType, SkippedContent, Snapshot, SnapshotBranch, TargetType, Timestamp, TimestampWithTimezone, ) UTC = datetime.timezone.utc CONTENTS = [ Content( length=4, data=f"foo{i}".encode(), status="visible", **MultiHash.from_data(f"foo{i}".encode()).digest(), ) for i in range(10) ] + [ Content( length=14, data=f"forbidden foo{i}".encode(), status="hidden", **MultiHash.from_data(f"forbidden foo{i}".encode()).digest(), ) for i in range(10) ] SKIPPED_CONTENTS = [ SkippedContent( length=4, status="absent", reason=f"because chr({i}) != '*'", **MultiHash.from_data(f"bar{i}".encode()).digest(), ) for i in range(2) ] duplicate_content1 = Content( length=4, sha1=hash_to_bytes("44973274ccef6ab4dfaaf86599792fa9c3fe4689"), sha1_git=b"another-foo", blake2s256=b"another-bar", sha256=b"another-baz", status="visible", ) # Craft a sha1 collision sha1_array = bytearray(duplicate_content1.sha1_git) sha1_array[0] += 1 duplicate_content2 = attr.evolve(duplicate_content1, sha1_git=bytes(sha1_array)) DUPLICATE_CONTENTS = [duplicate_content1, duplicate_content2] COMMITTERS = [ Person(fullname=b"foo", name=b"foo", email=b""), Person(fullname=b"bar", name=b"bar", email=b""), ] DATES = [ TimestampWithTimezone( timestamp=Timestamp(seconds=1234567891, microseconds=0,), offset=120, negative_utc=False, ), TimestampWithTimezone( timestamp=Timestamp(seconds=1234567892, microseconds=0,), offset=120, negative_utc=False, ), ] REVISIONS = [ Revision( id=hash_to_bytes("4ca486e65eb68e4986aeef8227d2db1d56ce51b3"), message=b"hello", date=DATES[0], committer=COMMITTERS[0], author=COMMITTERS[0], committer_date=DATES[0], type=RevisionType.GIT, directory=b"\x01" * 20, synthetic=False, metadata=None, parents=(), ), Revision( id=hash_to_bytes("677063f5c405d6fc1781fc56379c9a9adf43d3a0"), message=b"hello again", date=DATES[1], committer=COMMITTERS[1], author=COMMITTERS[1], committer_date=DATES[1], type=RevisionType.MERCURIAL, directory=b"\x02" * 20, synthetic=False, metadata=None, parents=(), + extra_headers=((b"foo", b"bar"),), ), ] RELEASES = [ Release( 
id=hash_to_bytes("8059dc4e17fcd0e51ca3bcd6b80f4577d281fd08"), name=b"v0.0.1", date=TimestampWithTimezone( timestamp=Timestamp(seconds=1234567890, microseconds=0,), offset=120, negative_utc=False, ), author=COMMITTERS[0], target_type=ObjectType.REVISION, target=b"\x04" * 20, message=b"foo", synthetic=False, ), ] ORIGINS = [ Origin(url="https://somewhere.org/den/fox",), Origin(url="https://overtherainbow.org/fox/den",), ] ORIGIN_VISITS = [ OriginVisit( origin=ORIGINS[0].url, date=datetime.datetime(2013, 5, 7, 4, 20, 39, 369271, tzinfo=UTC), visit=1, type="git", ), OriginVisit( origin=ORIGINS[1].url, date=datetime.datetime(2014, 11, 27, 17, 20, 39, tzinfo=UTC), visit=1, type="hg", ), OriginVisit( origin=ORIGINS[0].url, date=datetime.datetime(2018, 11, 27, 17, 20, 39, tzinfo=UTC), visit=2, type="git", ), OriginVisit( origin=ORIGINS[0].url, date=datetime.datetime(2018, 11, 27, 17, 20, 39, tzinfo=UTC), visit=3, type="git", ), OriginVisit( origin=ORIGINS[1].url, date=datetime.datetime(2015, 11, 27, 17, 20, 39, tzinfo=UTC), visit=2, type="hg", ), ] # The origin-visit-status dates needs to be shifted slightly in the future from their # visit dates counterpart. Otherwise, we are hitting storage-wise the "on conflict" # ignore policy (because origin-visit-add creates an origin-visit-status with the same # parameters from the origin-visit {origin, visit, date}... ORIGIN_VISIT_STATUSES = [ OriginVisitStatus( origin=ORIGINS[0].url, date=datetime.datetime(2013, 5, 7, 4, 20, 39, 432222, tzinfo=UTC), visit=1, status="ongoing", snapshot=None, metadata=None, ), OriginVisitStatus( origin=ORIGINS[1].url, date=datetime.datetime(2014, 11, 27, 17, 21, 12, tzinfo=UTC), visit=1, status="ongoing", snapshot=None, metadata=None, ), OriginVisitStatus( origin=ORIGINS[0].url, date=datetime.datetime(2018, 11, 27, 17, 20, 59, tzinfo=UTC), visit=2, status="ongoing", snapshot=None, metadata=None, ), OriginVisitStatus( origin=ORIGINS[0].url, date=datetime.datetime(2018, 11, 27, 17, 20, 49, tzinfo=UTC), visit=3, status="full", snapshot=hash_to_bytes("17d0066a4a80aba4a0e913532ee8ff2014f006a9"), metadata=None, ), OriginVisitStatus( origin=ORIGINS[1].url, date=datetime.datetime(2015, 11, 27, 17, 22, 18, tzinfo=UTC), visit=2, status="partial", snapshot=hash_to_bytes("8ce268b87faf03850693673c3eb5c9bb66e1ca38"), metadata=None, ), ] DIRECTORIES = [ Directory(id=hash_to_bytes("4b825dc642cb6eb9a060e54bf8d69288fbee4904"), entries=()), Directory( id=hash_to_bytes("21416d920e0ebf0df4a7888bed432873ed5cb3a7"), entries=( DirectoryEntry( name=b"file1.ext", perms=0o644, type="file", target=CONTENTS[0].sha1_git, ), DirectoryEntry( name=b"dir1", perms=0o755, type="dir", target=hash_to_bytes("4b825dc642cb6eb9a060e54bf8d69288fbee4904"), ), DirectoryEntry( name=b"subprepo1", perms=0o160000, type="rev", target=REVISIONS[1].id, ), ), ), ] SNAPSHOTS = [ Snapshot( id=hash_to_bytes("17d0066a4a80aba4a0e913532ee8ff2014f006a9"), branches={ b"master": SnapshotBranch( target_type=TargetType.REVISION, target=REVISIONS[0].id ) }, ), Snapshot( id=hash_to_bytes("8ce268b87faf03850693673c3eb5c9bb66e1ca38"), branches={ b"target/revision": SnapshotBranch( target_type=TargetType.REVISION, target=REVISIONS[0].id, ), b"target/alias": SnapshotBranch( target_type=TargetType.ALIAS, target=b"target/revision" ), b"target/directory": SnapshotBranch( target_type=TargetType.DIRECTORY, target=DIRECTORIES[0].id, ), b"target/release": SnapshotBranch( target_type=TargetType.RELEASE, target=RELEASES[0].id ), b"target/snapshot": SnapshotBranch( target_type=TargetType.SNAPSHOT, 
target=hash_to_bytes("17d0066a4a80aba4a0e913532ee8ff2014f006a9"), ), }, ), ] METADATA_AUTHORITIES = [ MetadataAuthority( type=MetadataAuthorityType.FORGE, url="http://example.org/", metadata={}, ), ] METADATA_FETCHERS = [ MetadataFetcher(name="test-fetcher", version="1.0.0", metadata={},) ] RAW_EXTRINSIC_METADATA = [ RawExtrinsicMetadata( type=MetadataTargetType.ORIGIN, id="http://example.org/foo.git", discovery_date=datetime.datetime(2020, 7, 30, 17, 8, 20, tzinfo=UTC), authority=attr.evolve(METADATA_AUTHORITIES[0], metadata=None), fetcher=attr.evolve(METADATA_FETCHERS[0], metadata=None), format="json", metadata=b'{"foo": "bar"}', ), RawExtrinsicMetadata( type=MetadataTargetType.CONTENT, id=SWHID(object_type="content", object_id=hash_to_hex(CONTENTS[0].sha1_git)), discovery_date=datetime.datetime(2020, 7, 30, 17, 8, 20, tzinfo=UTC), authority=attr.evolve(METADATA_AUTHORITIES[0], metadata=None), fetcher=attr.evolve(METADATA_FETCHERS[0], metadata=None), format="json", metadata=b'{"foo": "bar"}', ), ] TEST_OBJECTS: Dict[str, Sequence[ModelObject]] = { "content": CONTENTS, "directory": DIRECTORIES, "metadata_authority": METADATA_AUTHORITIES, "metadata_fetcher": METADATA_FETCHERS, "origin": ORIGINS, "origin_visit": ORIGIN_VISITS, "origin_visit_status": ORIGIN_VISIT_STATUSES, "raw_extrinsic_metadata": RAW_EXTRINSIC_METADATA, "release": RELEASES, "revision": REVISIONS, "snapshot": SNAPSHOTS, "skipped_content": SKIPPED_CONTENTS, } diff --git a/swh/journal/tests/test_client.py b/swh/journal/tests/test_client.py index 66278b8..3984c46 100644 --- a/swh/journal/tests/test_client.py +++ b/swh/journal/tests/test_client.py @@ -1,330 +1,329 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Dict, List from unittest.mock import MagicMock from confluent_kafka import Producer import pytest -from swh.model.model import Content - from swh.journal.client import JournalClient from swh.journal.serializers import key_to_kafka, value_to_kafka +from swh.model.model import Content REV = { "message": b"something cool", "author": {"fullname": b"Peter", "name": None, "email": b"peter@ouiche.lo"}, "committer": {"fullname": b"Stephen", "name": b"From Outer Space", "email": None}, "date": { "timestamp": {"seconds": 123456789, "microseconds": 123}, "offset": 120, "negative_utc": False, }, "committer_date": { "timestamp": {"seconds": 123123456, "microseconds": 0}, "offset": 0, "negative_utc": False, }, "type": "git", "directory": ( b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" b"\x01\x02\x03\x04\x05" ), "synthetic": False, "metadata": None, "parents": (), "id": b"\x8b\xeb\xd1\x9d\x07\xe2\x1e0\xe2 \x91X\x8d\xbd\x1c\xa8\x86\xdeB\x0c", } def test_client(kafka_prefix: str, kafka_consumer_group: str, kafka_server: str): producer = Producer( { "bootstrap.servers": kafka_server, "client.id": "test producer", "acks": "all", } ) # Fill Kafka producer.produce( topic=kafka_prefix + ".revision", key=REV["id"], value=value_to_kafka(REV), ) producer.flush() client = JournalClient( brokers=[kafka_server], group_id=kafka_consumer_group, prefix=kafka_prefix, stop_after_objects=1, ) worker_fn = MagicMock() client.process(worker_fn) worker_fn.assert_called_once_with({"revision": [REV]}) def test_client_eof(kafka_prefix: str, kafka_consumer_group: str, kafka_server: str): producer = Producer( { 
"bootstrap.servers": kafka_server, "client.id": "test producer", "acks": "all", } ) # Fill Kafka producer.produce( topic=kafka_prefix + ".revision", key=REV["id"], value=value_to_kafka(REV), ) producer.flush() client = JournalClient( brokers=[kafka_server], group_id=kafka_consumer_group, prefix=kafka_prefix, stop_after_objects=None, stop_on_eof=True, ) worker_fn = MagicMock() client.process(worker_fn) worker_fn.assert_called_once_with({"revision": [REV]}) @pytest.mark.parametrize("batch_size", [1, 5, 100]) def test_client_batch_size( kafka_prefix: str, kafka_consumer_group: str, kafka_server: str, batch_size: int, ): num_objects = 2 * batch_size + 1 assert num_objects < 256, "Too many objects, generation will fail" producer = Producer( { "bootstrap.servers": kafka_server, "client.id": "test producer", "acks": "all", } ) contents = [Content.from_data(bytes([i])) for i in range(num_objects)] # Fill Kafka for content in contents: producer.produce( topic=kafka_prefix + ".content", key=key_to_kafka(content.sha1), value=value_to_kafka(content.to_dict()), ) producer.flush() client = JournalClient( brokers=[kafka_server], group_id=kafka_consumer_group, prefix=kafka_prefix, stop_after_objects=num_objects, batch_size=batch_size, ) collected_output: List[Dict] = [] def worker_fn(objects): received = objects["content"] assert len(received) <= batch_size collected_output.extend(received) client.process(worker_fn) expected_output = [content.to_dict() for content in contents] assert len(collected_output) == len(expected_output) for output in collected_output: assert output in expected_output @pytest.fixture() def kafka_producer(kafka_prefix: str, kafka_server_base: str): producer = Producer( { "bootstrap.servers": kafka_server_base, "client.id": "test producer", "acks": "all", } ) # Fill Kafka producer.produce( topic=kafka_prefix + ".something", key=key_to_kafka(b"key1"), value=value_to_kafka("value1"), ) producer.produce( topic=kafka_prefix + ".else", key=key_to_kafka(b"key1"), value=value_to_kafka("value2"), ) producer.flush() return producer def test_client_subscribe_all( kafka_producer: Producer, kafka_prefix: str, kafka_server_base: str ): client = JournalClient( brokers=[kafka_server_base], group_id="whatever", prefix=kafka_prefix, stop_after_objects=2, ) assert set(client.subscription) == { f"{kafka_prefix}.something", f"{kafka_prefix}.else", } worker_fn = MagicMock() client.process(worker_fn) worker_fn.assert_called_once_with( {"something": ["value1"], "else": ["value2"],} ) def test_client_subscribe_one_topic( kafka_producer: Producer, kafka_prefix: str, kafka_server_base: str ): client = JournalClient( brokers=[kafka_server_base], group_id="whatever", prefix=kafka_prefix, stop_after_objects=1, object_types=["else"], ) assert client.subscription == [f"{kafka_prefix}.else"] worker_fn = MagicMock() client.process(worker_fn) worker_fn.assert_called_once_with({"else": ["value2"]}) def test_client_subscribe_absent_topic( kafka_producer: Producer, kafka_prefix: str, kafka_server_base: str ): with pytest.raises(ValueError): JournalClient( brokers=[kafka_server_base], group_id="whatever", prefix=kafka_prefix, stop_after_objects=1, object_types=["really"], ) def test_client_subscribe_absent_prefix( kafka_producer: Producer, kafka_prefix: str, kafka_server_base: str ): with pytest.raises(ValueError): JournalClient( brokers=[kafka_server_base], group_id="whatever", prefix="wrong.prefix", stop_after_objects=1, ) with pytest.raises(ValueError): JournalClient( brokers=[kafka_server_base], group_id="whatever", 
prefix="wrong.prefix", stop_after_objects=1, object_types=["else"], ) def test_client_subscriptions_with_anonymized_topics( kafka_prefix: str, kafka_consumer_group: str, kafka_server_base: str ): producer = Producer( { "bootstrap.servers": kafka_server_base, "client.id": "test producer", "acks": "all", } ) # Fill Kafka with revision object on both the regular prefix (normally for # anonymized objects in this case) and privileged one producer.produce( topic=kafka_prefix + ".revision", key=REV["id"], value=value_to_kafka(REV), ) producer.produce( topic=kafka_prefix + "_privileged.revision", key=REV["id"], value=value_to_kafka(REV), ) producer.flush() # without privileged "channels" activated on the client side client = JournalClient( brokers=[kafka_server_base], group_id=kafka_consumer_group, prefix=kafka_prefix, stop_after_objects=1, privileged=False, ) # we only subscribed to "standard" topics assert client.subscription == [kafka_prefix + ".revision"] # with privileged "channels" activated on the client side client = JournalClient( brokers=[kafka_server_base], group_id=kafka_consumer_group, prefix=kafka_prefix, stop_after_objects=1, privileged=True, ) # we only subscribed to "privileged" topics assert client.subscription == [kafka_prefix + "_privileged.revision"] def test_client_subscriptions_without_anonymized_topics( kafka_prefix: str, kafka_consumer_group: str, kafka_server_base: str ): producer = Producer( { "bootstrap.servers": kafka_server_base, "client.id": "test producer", "acks": "all", } ) # Fill Kafka with revision objects only on the standard prefix producer.produce( topic=kafka_prefix + ".revision", key=REV["id"], value=value_to_kafka(REV), ) producer.flush() # without privileged channel activated on the client side client = JournalClient( brokers=[kafka_server_base], group_id=kafka_consumer_group, prefix=kafka_prefix, stop_after_objects=1, privileged=False, ) # we only subscribed to the standard prefix assert client.subscription == [kafka_prefix + ".revision"] # with privileged channel activated on the client side client = JournalClient( brokers=[kafka_server_base], group_id=kafka_consumer_group, prefix=kafka_prefix, stop_after_objects=1, privileged=True, ) # we also only subscribed to the standard prefix, since there is no priviled prefix # on the kafka broker assert client.subscription == [kafka_prefix + ".revision"] diff --git a/swh/journal/tests/test_kafka_writer.py b/swh/journal/tests/test_kafka_writer.py index 1fe02cc..d09d43a 100644 --- a/swh/journal/tests/test_kafka_writer.py +++ b/swh/journal/tests/test_kafka_writer.py @@ -1,166 +1,165 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Iterable -import pytest from confluent_kafka import Consumer, Producer +import pytest -from swh.model.model import Directory, Revision, Release - +from swh.journal.pytest_plugin import assert_all_objects_consumed, consume_messages from swh.journal.tests.journal_data import TEST_OBJECTS -from swh.journal.pytest_plugin import consume_messages, assert_all_objects_consumed -from swh.journal.writer.kafka import KafkaJournalWriter, KafkaDeliveryError +from swh.journal.writer.kafka import KafkaDeliveryError, KafkaJournalWriter +from swh.model.model import Directory, Release, Revision def test_kafka_writer( kafka_prefix: str, kafka_server: str, consumer: Consumer, 
    privileged_object_types: Iterable[str],
):
    writer = KafkaJournalWriter(
        brokers=[kafka_server],
        client_id="kafka_writer",
        prefix=kafka_prefix,
        anonymize=False,
    )

    expected_messages = 0

    for object_type, objects in TEST_OBJECTS.items():
        writer.write_additions(object_type, objects)
        expected_messages += len(objects)

    consumed_messages = consume_messages(consumer, kafka_prefix, expected_messages)
    assert_all_objects_consumed(consumed_messages)

    for key, obj_dict in consumed_messages["revision"]:
        obj = Revision.from_dict(obj_dict)
        for person in (obj.author, obj.committer):
            assert not (
                len(person.fullname) == 32
                and person.name is None
                and person.email is None
            )

    for key, obj_dict in consumed_messages["release"]:
        obj = Release.from_dict(obj_dict)
        for person in (obj.author,):
            assert not (
                len(person.fullname) == 32
                and person.name is None
                and person.email is None
            )


def test_kafka_writer_anonymized(
    kafka_prefix: str,
    kafka_server: str,
    consumer: Consumer,
    privileged_object_types: Iterable[str],
):
    writer = KafkaJournalWriter(
        brokers=[kafka_server],
        client_id="kafka_writer",
        prefix=kafka_prefix,
        anonymize=True,
    )

    expected_messages = 0

    for object_type, objects in TEST_OBJECTS.items():
        writer.write_additions(object_type, objects)
        expected_messages += len(objects)
        if object_type in privileged_object_types:
            expected_messages += len(objects)

    consumed_messages = consume_messages(consumer, kafka_prefix, expected_messages)
    assert_all_objects_consumed(consumed_messages, exclude=["revision", "release"])

    for key, obj_dict in consumed_messages["revision"]:
        obj = Revision.from_dict(obj_dict)
        for person in (obj.author, obj.committer):
            assert (
                len(person.fullname) == 32
                and person.name is None
                and person.email is None
            )

    for key, obj_dict in consumed_messages["release"]:
        obj = Release.from_dict(obj_dict)
        for person in (obj.author,):
            assert (
                len(person.fullname) == 32
                and person.name is None
                and person.email is None
            )


def test_write_delivery_failure(
    kafka_prefix: str, kafka_server: str, consumer: Consumer
):
    class MockKafkaError:
        """A mocked kafka error"""

        def str(self):
            return "Mocked Kafka Error"

        def name(self):
            return "SWH_MOCK_ERROR"

    class KafkaJournalWriterFailDelivery(KafkaJournalWriter):
        """A journal writer which always fails delivering messages"""

        def _on_delivery(self, error, message):
            """Replace the inbound error with a fake delivery error"""
            super()._on_delivery(MockKafkaError(), message)

    kafka_prefix += ".swh.journal.objects"
    writer = KafkaJournalWriterFailDelivery(
        brokers=[kafka_server], client_id="kafka_writer", prefix=kafka_prefix,
    )

    empty_dir = Directory(entries=())
    with pytest.raises(KafkaDeliveryError) as exc:
        writer.write_addition("directory", empty_dir)

    assert "Failed deliveries" in exc.value.message
    assert len(exc.value.delivery_failures) == 1
    delivery_failure = exc.value.delivery_failures[0]
    assert delivery_failure.key == empty_dir.id
    assert delivery_failure.code == "SWH_MOCK_ERROR"


def test_write_delivery_timeout(
    kafka_prefix: str, kafka_server: str, consumer: Consumer
):
    produced = []

    class MockProducer(Producer):
        """A kafka producer which pretends to produce messages, but never sends any
        delivery acknowledgements"""

        def produce(self, **kwargs):
            produced.append(kwargs)

    writer = KafkaJournalWriter(
        brokers=[kafka_server],
        client_id="kafka_writer",
        prefix=kafka_prefix,
        flush_timeout=1,
        producer_class=MockProducer,
    )

    empty_dir = Directory(entries=())
    with pytest.raises(KafkaDeliveryError) as exc:
        writer.write_addition("directory", empty_dir)

    assert len(produced) == 1

    assert "timeout" in exc.value.message
    assert len(exc.value.delivery_failures) == 1
    delivery_failure = exc.value.delivery_failures[0]
    assert delivery_failure.key == empty_dir.id
    assert delivery_failure.code == "SWH_FLUSH_TIMEOUT"
diff --git a/swh/journal/tests/test_pytest_plugin.py b/swh/journal/tests/test_pytest_plugin.py
index 14e49d7..02c0ef7 100644
--- a/swh/journal/tests/test_pytest_plugin.py
+++ b/swh/journal/tests/test_pytest_plugin.py
@@ -1,71 +1,72 @@
# Copyright (C) 2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from typing import Iterator
+
from confluent_kafka.admin import AdminClient


def test_kafka_server(kafka_server_base: str):
    ip, port_str = kafka_server_base.split(":")
    assert ip == "127.0.0.1"
    assert int(port_str)

    admin = AdminClient({"bootstrap.servers": kafka_server_base})

    topics = admin.list_topics()

    assert len(topics.brokers) == 1


def test_kafka_server_with_topics(
    kafka_server: str,
    kafka_prefix: str,
    object_types: Iterator[str],
    privileged_object_types: Iterator[str],
):
    admin = AdminClient({"bootstrap.servers": kafka_server})

    # check unprivileged topics are present
    topics = {
        topic
        for topic in admin.list_topics().topics
        if topic.startswith(f"{kafka_prefix}.")
    }
    assert topics == {f"{kafka_prefix}.{obj}" for obj in object_types}

    # check privileged topics are present
    topics = {
        topic
        for topic in admin.list_topics().topics
        if topic.startswith(f"{kafka_prefix}_privileged.")
    }
    assert topics == {
        f"{kafka_prefix}_privileged.{obj}" for obj in privileged_object_types
    }


def test_test_config(test_config: dict, kafka_prefix: str, kafka_server_base: str):
    assert test_config == {
        "consumer_id": "swh.journal.consumer",
        "stop_after_objects": 1,
        "storage": {"cls": "memory", "args": {}},
        "object_types": {
            "content",
            "directory",
            "metadata_authority",
            "metadata_fetcher",
            "origin",
            "origin_visit",
            "origin_visit_status",
            "raw_extrinsic_metadata",
            "release",
            "revision",
            "snapshot",
            "skipped_content",
        },
        "privileged_object_types": {"release", "revision",},
        "brokers": [kafka_server_base],
        "prefix": kafka_prefix,
    }
diff --git a/swh/journal/tests/test_serializers.py b/swh/journal/tests/test_serializers.py
index 1a1fdd9..4cbe236 100644
--- a/swh/journal/tests/test_serializers.py
+++ b/swh/journal/tests/test_serializers.py
@@ -1,77 +1,76 @@
# Copyright (C) 2017-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

-import itertools
-
from collections import OrderedDict
+import itertools
from typing import Iterable

from swh.journal import serializers

from .conftest import TEST_OBJECTS


def test_key_to_kafka_repeatable():
    """Check the kafka key encoding is repeatable"""
    base_dict = {
        "a": "foo",
        "b": "bar",
        "c": "baz",
    }
    key = serializers.key_to_kafka(base_dict)

    for dict_keys in itertools.permutations(base_dict):
        d = OrderedDict()
        for k in dict_keys:
            d[k] = base_dict[k]

        assert key == serializers.key_to_kafka(d)


def test_get_key():
    """Test whether object_key works on all our objects"""
    for object_type, objects in TEST_OBJECTS.items():
        for obj in objects:
            assert serializers.object_key(object_type, obj) is not None


def test_pprint_key():
    """Test whether pprint_key works on all our objects"""
    for object_type, objects in TEST_OBJECTS.items():
        for obj in objects:
            key = serializers.object_key(object_type, obj)
            pprinted_key = serializers.pprint_key(key)
            assert isinstance(pprinted_key, str)
            if isinstance(key, dict):
                assert (pprinted_key[0], pprinted_key[-1]) == ("{", "}")
                for dict_key in key.keys():
                    assert f"{dict_key}:" in pprinted_key
            if isinstance(key, bytes):
                assert pprinted_key == key.hex()


def test_kafka_to_key():
    """Standard back and forth serialization with keys"""
    # All KeyType(s)
    keys: Iterable[serializers.KeyType] = [
        {"a": "foo", "b": "bar", "c": "baz",},
        {"a": b"foobarbaz",},
        b"foo",
    ]
    for object_type, objects in TEST_OBJECTS.items():
        for obj in objects:
            key = serializers.object_key(object_type, obj)
            keys.append(key)
    for key in keys:
        ktk = serializers.key_to_kafka(key)
        v = serializers.kafka_to_key(ktk)
        assert v == key
diff --git a/swh/journal/writer/inmemory.py b/swh/journal/writer/inmemory.py
index 61a9269..9c81b6a 100644
--- a/swh/journal/writer/inmemory.py
+++ b/swh/journal/writer/inmemory.py
@@ -1,40 +1,39 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import logging
-
from multiprocessing import Manager
from typing import List

from swh.model.model import BaseModel

from .kafka import ModelObject

logger = logging.getLogger(__name__)


class InMemoryJournalWriter:
    def __init__(self):
        # Share the list of objects across processes, for RemoteAPI tests.
        self.manager = Manager()
        self.objects = self.manager.list()
        self.privileged_objects = self.manager.list()

    def write_addition(
        self, object_type: str, object_: ModelObject, privileged: bool = False
    ) -> None:
        assert isinstance(object_, BaseModel)
        if privileged:
            self.privileged_objects.append((object_type, object_))
        else:
            self.objects.append((object_type, object_))

    write_update = write_addition

    def write_additions(
        self, object_type: str, objects: List[ModelObject], privileged: bool = False
    ) -> None:
        for object_ in objects:
            self.write_addition(object_type, object_, privileged)
diff --git a/swh/journal/writer/kafka.py b/swh/journal/writer/kafka.py
index 1dccdcb..e5dc547 100644
--- a/swh/journal/writer/kafka.py
+++ b/swh/journal/writer/kafka.py
@@ -1,239 +1,239 @@
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import logging
import time
from typing import Dict, Iterable, List, NamedTuple, Optional, Type

-from confluent_kafka import Producer, KafkaException
+from confluent_kafka import KafkaException, Producer

from swh.journal.serializers import (
    KeyType,
    ModelObject,
+    key_to_kafka,
    object_key,
    pprint_key,
-    key_to_kafka,
    value_to_kafka,
)

logger = logging.getLogger(__name__)


class DeliveryTag(NamedTuple):
    """Unique tag allowing us to check for a message delivery"""

    topic: str
    kafka_key: bytes


class DeliveryFailureInfo(NamedTuple):
    """Verbose information for failed deliveries"""

    object_type: str
    key: KeyType
    message: str
    code: str


def get_object_type(topic: str) -> str:
    """Get the object type from a topic string"""
    return topic.rsplit(".", 1)[-1]


class KafkaDeliveryError(Exception):
    """Delivery failed on some kafka messages."""

    def __init__(self, message: str, delivery_failures: Iterable[DeliveryFailureInfo]):
        self.message = message
        self.delivery_failures = list(delivery_failures)

    def pretty_failures(self) -> str:
        return ", ".join(
            f"{f.object_type} {pprint_key(f.key)} ({f.message})"
            for f in self.delivery_failures
        )

    def __str__(self):
        return f"KafkaDeliveryError({self.message}, [{self.pretty_failures()}])"


class KafkaJournalWriter:
    """This class is used to write serialized versions of swh.model.model objects to a
    series of Kafka topics.

    Topics used to send object representations are built from a ``prefix`` plus the
    type of the object:

      ``{prefix}.{object_type}``

    Objects can be sent as is, or can be anonymized. The anonymization feature, when
    activated, will write anonymized versions of model objects in the main topic, and
    stock (non-anonymized) objects will be sent to a dedicated (privileged) set of
    topics:

      ``{prefix}_privileged.{object_type}``

    The anonymization of a swh.model object is the result of calling its
    ``BaseModel.anonymize()`` method. An object is considered anonymizable if this
    method returns a (non-None) value.

    Args:
      brokers: list of broker addresses and ports.
      prefix: the prefix used to build the topic names for objects.
      client_id: the id of the writer sent to kafka.
      producer_config: extra configuration keys passed to the `Producer`.
      flush_timeout: timeout, in seconds, after which the `flush` operation
        will fail if some message deliveries are still pending.
      producer_class: override for the kafka producer class.
      anonymize: if True, activate the anonymization feature.

    """

    def __init__(
        self,
        brokers: Iterable[str],
        prefix: str,
        client_id: str,
        producer_config: Optional[Dict] = None,
        flush_timeout: float = 120,
        producer_class: Type[Producer] = Producer,
        anonymize: bool = False,
    ):
        self._prefix = prefix
        self._prefix_privileged = f"{self._prefix}_privileged"
        self.anonymize = anonymize

        if not producer_config:
            producer_config = {}

        if "message.max.bytes" not in producer_config:
            producer_config = {
                "message.max.bytes": 100 * 1024 * 1024,
                **producer_config,
            }

        self.producer = producer_class(
            {
                "bootstrap.servers": ",".join(brokers),
                "client.id": client_id,
                "on_delivery": self._on_delivery,
                "error_cb": self._error_cb,
                "logger": logger,
                "acks": "all",
                **producer_config,
            }
        )

        # Delivery management
        self.flush_timeout = flush_timeout

        # delivery tag -> original object "key" mapping
        self.deliveries_pending: Dict[DeliveryTag, KeyType] = {}

        # List of (object_type, key, error_msg, error_name) for failed deliveries
        self.delivery_failures: List[DeliveryFailureInfo] = []

    def _error_cb(self, error):
        if error.fatal():
            raise KafkaException(error)
        logger.info("Received non-fatal kafka error: %s", error)

    def _on_delivery(self, error, message):
        (topic, key) = delivery_tag = DeliveryTag(message.topic(), message.key())
        sent_key = self.deliveries_pending.pop(delivery_tag, None)

        if error is not None:
            self.delivery_failures.append(
                DeliveryFailureInfo(
                    get_object_type(topic), sent_key, error.str(), error.name()
                )
            )

    def send(self, topic: str, key: KeyType, value):
        kafka_key = key_to_kafka(key)
        self.producer.produce(
            topic=topic, key=kafka_key, value=value_to_kafka(value),
        )

        self.deliveries_pending[DeliveryTag(topic, kafka_key)] = key

    def delivery_error(self, message) -> KafkaDeliveryError:
        """Get all failed deliveries, and clear them"""
        ret = self.delivery_failures
        self.delivery_failures = []

        while self.deliveries_pending:
            delivery_tag, orig_key = self.deliveries_pending.popitem()
            (topic, kafka_key) = delivery_tag
            ret.append(
                DeliveryFailureInfo(
                    get_object_type(topic),
                    orig_key,
                    "No delivery before flush() timeout",
                    "SWH_FLUSH_TIMEOUT",
                )
            )

        return KafkaDeliveryError(message, ret)

    def flush(self):
        start = time.monotonic()
        self.producer.flush(self.flush_timeout)

        while self.deliveries_pending:
            if time.monotonic() - start > self.flush_timeout:
                break
            self.producer.poll(0.1)

        if self.deliveries_pending:
            # Delivery timeout
            raise self.delivery_error(
                "flush() exceeded timeout (%ss)" % self.flush_timeout,
            )
        elif self.delivery_failures:
            raise self.delivery_error("Failed deliveries after flush()")

    def _sanitize_object(
        self, object_type: str, object_: ModelObject
    ) -> Dict[str, str]:
        dict_ = object_.to_dict()
        if object_type == "content":
            dict_.pop("data", None)
        return dict_

    def _write_addition(self, object_type: str, object_: ModelObject) -> None:
        """Write a single object to the journal"""
        key = object_key(object_type, object_)

        if self.anonymize:
            anon_object_ = object_.anonymize()
            if anon_object_:  # can be either None, or an anonymized object
                # if the object is anonymizable, send the non-anonymized version in the
                # privileged channel
                topic = f"{self._prefix_privileged}.{object_type}"
                dict_ = self._sanitize_object(object_type, object_)
                logger.debug("topic: %s, key: %s, value: %s", topic, key, dict_)
                self.send(topic, key=key, value=dict_)
                object_ = anon_object_

        topic = f"{self._prefix}.{object_type}"
        dict_ = self._sanitize_object(object_type, object_)
        logger.debug("topic: %s, key: %s, value: %s", topic, key, dict_)
        self.send(topic, key=key, value=dict_)

    def write_addition(self, object_type: str, object_: ModelObject) -> None:
        """Write a single object to the journal"""
        self._write_addition(object_type, object_)
        self.flush()

    write_update = write_addition

    def write_additions(
        self, object_type: str, objects: Iterable[ModelObject]
    ) -> None:
        """Write a set of objects to the journal"""
        for object_ in objects:
            self._write_addition(object_type, object_)

        self.flush()
diff --git a/version.txt b/version.txt
deleted file mode 100644
index d94ba98..0000000
--- a/version.txt
+++ /dev/null
@@ -1 +0,0 @@
-v0.4.1-0-g94f282f
\ No newline at end of file
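
For reference, a minimal sketch (not part of the patch) of how the `KafkaJournalWriter` described in the docstring above might be driven. The broker address, prefix, and client id below are placeholder values; the constructor arguments, `write_additions()`, `KafkaDeliveryError`, and `TEST_OBJECTS` all come from the code in this diff.

```python
# Hypothetical usage sketch: assumes a Kafka broker reachable at 127.0.0.1:9092.
from swh.journal.tests.journal_data import TEST_OBJECTS
from swh.journal.writer.kafka import KafkaDeliveryError, KafkaJournalWriter

writer = KafkaJournalWriter(
    brokers=["127.0.0.1:9092"],      # placeholder broker address
    prefix="swh.journal.objects",    # topics become {prefix}.{object_type}
    client_id="example-writer",
    anonymize=True,                  # anonymizable objects also go to {prefix}_privileged.*
)

for object_type, objects in TEST_OBJECTS.items():
    try:
        # Serializes each object, produces it, and flushes; raises on failed
        # or timed-out deliveries.
        writer.write_additions(object_type, objects)
    except KafkaDeliveryError as exc:
        print("delivery failed:", exc.pretty_failures())
```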