diff --git a/PKG-INFO b/PKG-INFO index 79667e4..2b8fcf9 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,37 +1,37 @@ Metadata-Version: 2.1 Name: swh.scheduler -Version: 0.17.0 +Version: 0.17.1 Summary: Software Heritage Scheduler Home-page: https://forge.softwareheritage.org/diffusion/DSCH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-scheduler Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-scheduler/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing Provides-Extra: journal Provides-Extra: simulator License-File: LICENSE License-File: LICENSE.Celery License-File: AUTHORS swh-scheduler ============= Job scheduler for the Software Heritage project. Task manager for asynchronous/delayed tasks, used for both recurrent (e.g., listing a forge, loading new stuff from a Git repository) and one-off activities (e.g., loading a specific version of a source package). diff --git a/debian/changelog b/debian/changelog index d7cad42..7f3f4b0 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,1010 +1,1014 @@ -swh-scheduler (0.17.0-1~swh1~bpo10+1) buster-swh; urgency=medium +swh-scheduler (0.17.1-1~swh1) unstable-swh; urgency=medium - * Rebuild for buster-swh + * New upstream release 0.17.1 - (tagged by Antoine R. Dumont + (@ardumont) on 2021-08-26 10:30:12 + +0200) + * Upstream changes: - v0.17.1 - journal_client: Ensure queue + position does not overflow - -- Software Heritage autobuilder (on jenkins-debian1) Fri, 06 Aug 2021 09:14:02 +0000 + -- Software Heritage autobuilder (on jenkins-debian1) Thu, 26 Aug 2021 08:41:41 +0000 swh-scheduler (0.17.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.17.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-08-05 15:29:18 +0200) * Upstream changes: - v0.17.0 - Introduce new scheduling policy to grab origins without last update - journal_client: Disable origins when too many visited attempts failed - journal_client: Record last_visited and last_successful in origin_visit_stats - Add a specific cooldown for notfound origins - Add a (longer) specific cooldown for failed origin visits - Make the origin visit scheduling cooldown configurable - Various refactoring to simplify the grab next visits logic and updates -- Software Heritage autobuilder (on jenkins-debian1) Fri, 06 Aug 2021 09:11:54 +0000 swh-scheduler (0.16.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.16.0 - (tagged by Antoine Lambert on 2021-06-22 17:35:55 +0200) * Upstream changes: - version 0.16.0 -- Software Heritage autobuilder (on jenkins-debian1) Tue, 22 Jun 2021 15:39:45 +0000 swh-scheduler (0.15.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.15.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-06-10 16:09:06 +0200) * Upstream changes: - v0.15.0 - separate-runner runner: Separate scheduling tasks with and without priority concern - Refactor and extract a get_available_slots utility - Add typing stubs dependencies for mypy>0.900 - pytest_plugin: Explicitly set hostname in broker_url for celery TestApp -- Software Heritage autobuilder (on jenkins-debian1) Thu, 10 Jun 2021 14:48:52 +0000 swh-scheduler (0.14.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.14.2 - (tagged by Valentin Lorentz on 2021-05-06 17:09:00 +0200) * Upstream changes: - v0.14.2 - * Fix flaky tests -- Software Heritage autobuilder (on jenkins-debian1) Thu, 06 May 2021 15:13:11 +0000 swh-scheduler (0.14.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.14.1 - (tagged by Antoine R. Dumont (@ardumont) on 2021-05-06 16:00:07 +0200) * Upstream changes: - v0.14.1 - Use swh.core 0.14 -- Software Heritage autobuilder (on jenkins-debian1) Thu, 06 May 2021 14:17:39 +0000 swh-scheduler (0.13.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.13.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-04-20 11:46:51 +0200) * Upstream changes: - v0.13.0 - scheduler: Clean up priority/ratio task dead code - Parse task_ids before calling set_status_tasks. - tests: Complete checks on message with priority consumption -- Software Heritage autobuilder (on jenkins-debian1) Tue, 20 Apr 2021 09:51:00 +0000 swh-scheduler (0.12.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.12.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-04-15 13:31:30 +0200) * Upstream changes: - v0.12.0 - Route priority tasks to dedicated save code now queues - Fix various Sphinx warnings -- Software Heritage autobuilder (on jenkins-debian1) Thu, 15 Apr 2021 11:36:13 +0000 swh-scheduler (0.11.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.11.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-04-14 18:15:53 +0200) * Upstream changes: - v0.11.0 - separate-queues backend: Open endpoints to peek/grab tasks with any priority - Make origin_visit_stats_get return results from all pages - journal client: Filter out status messages without type - Simplify max_date() - journal_client: Fix date computations for (un)eventful visits - journal_client: Deal with failed status message -- Software Heritage autobuilder (on jenkins-debian1) Wed, 14 Apr 2021 16:19:31 +0000 swh-scheduler (0.10.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.10.0 - (tagged by Nicolas Dandrimont on 2021-02-03 22:53:20 +0100) * Upstream changes: - Release swh.scheduler 0.10.0 - Eagerly acknowledge celery tasks - Loads of simulator improvements - grab_next_visits: - clean up query building - account for schedule time to avoid rescheduling visits too fast - allow overriding the scheduling timestamp for the simulator -- Software Heritage autobuilder (on jenkins-debian1) Wed, 03 Feb 2021 22:10:13 +0000 swh-scheduler (0.9.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.9.2 - (tagged by Antoine Lambert on 2021-01-25 16:27:41 +0100) * Upstream changes: - version 0.9.2 -- Software Heritage autobuilder (on jenkins-debian1) Mon, 25 Jan 2021 15:31:21 +0000 swh-scheduler (0.9.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.9.1 - (tagged by Vincent SELLIER on 2021-01-21 19:20:33 +0100) * Upstream changes: - v0.9.1 - * Solve uneventful/eventful with unordered messages with snapshots - * Do not consider duplicated messages as uneventful event - * Reorganize grab_next_visits tests to better check sorting behavior -- Software Heritage autobuilder (on jenkins-debian1) Thu, 21 Jan 2021 18:28:00 +0000 swh-scheduler (0.9.0-1~swh2) unstable-swh; urgency=medium * Bump new release to unstuck packaging -- Antoine R. Dumont (@ardumont) Thu, 21 Jan 2021 13:20:14 +0000 swh-scheduler (0.9.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.9.0 - (tagged by Antoine R. Dumont (@ardumont) on 2021-01-21 11:54:47 +0100) * Upstream changes: - v0.9.0 - Populate origin_visit_stats table out of the origin_visit_status topic - Introduce a scheduler policy simulator (old task-based scheduler, ...) - Implement basic aggregated metrics on listed origins - scheduler.cli.journal: Add `swh scheduler journal-client` cli - Filter origins by visit type when scheduling the next visits - Introduce a `swh scheduler origin schedule-next` cli - Introduce a `swh scheduler origin grab-next` cli - Add an new origin visit stats model object and related backend api - Implement a basic endpoint for getting the next origins to visit - doc: Add a cli section to the doc -- Software Heritage autobuilder (on jenkins-debian1) Thu, 21 Jan 2021 11:00:29 +0000 swh-scheduler (0.8.2-1~swh2) unstable-swh; urgency=medium * Bump dependency -- Antoine R. Dumont (@ardumont) Tue, 08 Dec 2020 09:29:26 +0000 swh-scheduler (0.8.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.8.2 - (tagged by Antoine R. Dumont (@ardumont) on 2020-12-07 09:52:28 +0100) * Upstream changes: - v0.8.2 - requirement: Adapt celery requirements - Replace usage of arrow datetime objects in favor of pure datetime ones - Stop using the deprecated configuration scheme - cli.task_type: All task_type clis without a scheduler should raise -- Software Heritage autobuilder (on jenkins-debian1) Mon, 07 Dec 2020 08:55:39 +0000 swh-scheduler (0.8.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.8.1 - (tagged by Antoine R. Dumont (@ardumont) on 2020-11-24 14:13:36 +0100) * Upstream changes: - v0.8.1 - conftest: Reference swh.core.db.pytest_plugin -- Software Heritage autobuilder (on jenkins-debian1) Tue, 24 Nov 2020 13:16:08 +0000 swh-scheduler (0.8.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.8.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-11-23 13:42:05 +0100) * Upstream changes: - v0.8.0 - requirements-test.txt: Drop no longer needed pytest-postgresql requirement - scheduler.pytest_plugin: Make scheduler tests faster -- Software Heritage autobuilder (on jenkins-debian1) Mon, 23 Nov 2020 12:44:40 +0000 swh-scheduler (0.7.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.7.0 - (tagged by Antoine R. Dumont (@ardumont) on 2020-10-19 09:30:36 +0200) * Upstream changes: - v0.7.0 - scheduler: Type and unify get_scheduler factory with other factories - pytest_plugin: Explicitly name the scheduler test db differently - test_server: Simplify exception manipulations - tox.ini: pin black to the pre-commit version (19.10b0) to avoid flip-flops -- Software Heritage autobuilder (on jenkins-debian1) Mon, 19 Oct 2020 07:33:54 +0000 swh-scheduler (0.6.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.6.0 - (tagged by David Douard on 2020-09-25 12:03:33 +0200) * Upstream changes: - v0.6.0 -- Software Heritage autobuilder (on jenkins-debian1) Fri, 25 Sep 2020 10:06:32 +0000 swh-scheduler (0.5.3-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.3 - (tagged by Nicolas Dandrimont on 2020-09-24 17:49:27 +0200) * Upstream changes: - Release swh.scheduler v0.5.3 - Improve swh cli startup time - Add isort and update flake8 - Improve pytest execution time - Support recent kombu versions -- Software Heritage autobuilder (on jenkins-debian1) Thu, 24 Sep 2020 15:53:25 +0000 swh-scheduler (0.5.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.2 - (tagged by Antoine R. Dumont (@ardumont) on 2020-07-10 13:01:48 +0200) * Upstream changes: - v0.5.2 - Do no expose pytest-plugin through setuptools, let modules require it when needed -- Software Heritage autobuilder (on jenkins-debian1) Fri, 10 Jul 2020 11:08:30 +0000 swh-scheduler (0.5.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.1 - (tagged by Nicolas Dandrimont on 2020-07-09 10:18:03 +0200) * Upstream changes: - Release swh.scheduler 0.5.1 - Drop dependency on future (not needed anymore) -- Software Heritage autobuilder (on jenkins-debian1) Thu, 09 Jul 2020 09:51:38 +0000 swh-scheduler (0.5.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.5.0 - (tagged by Nicolas Dandrimont on 2020-07-09 10:16:57 +0200) * Upstream changes: - Release swh.scheduler v0.5.0 - Move celery fixtures to the pytest plugin -- Software Heritage autobuilder (on jenkins-debian1) Thu, 09 Jul 2020 08:20:42 +0000 swh-scheduler (0.4.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.4.0 - (tagged by Nicolas Dandrimont on 2020-07-06 16:47:28 +0200) * Upstream changes: - Release swh.scheduler 0.4.0 - Extract pytest fixtures to a pytest plugin -- Software Heritage autobuilder (on jenkins-debian1) Mon, 06 Jul 2020 14:52:42 +0000 swh-scheduler (0.3.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.3.0 - (tagged by Nicolas Dandrimont on 2020-07-06 12:18:28 +0200) * Upstream changes: - Release swh.scheduler 0.3.0 - Add get_listed_origins endpoint -- Software Heritage autobuilder (on jenkins-debian1) Mon, 06 Jul 2020 10:23:31 +0000 swh-scheduler (0.2.2-1~swh1) unstable-swh; urgency=medium * New upstream release 0.2.2 - (tagged by Nicolas Dandrimont on 2020-06-22 14:03:34 +0200) * Upstream changes: - Release swh.scheduler 0.2.2 - Re- introduce root endpoint for the RPC server -- Software Heritage autobuilder (on jenkins-debian1) Mon, 22 Jun 2020 12:07:05 +0000 swh-scheduler (0.2.1-1~swh1) unstable-swh; urgency=medium [ Nicolas Dandrimont ] * Force celery >= 4.3 [ Software Heritage autobuilder (on jenkins-debian1) ] * New upstream release 0.2.1 - (tagged by Nicolas Dandrimont on 2020-06-22 12:09:32 +0200) * Upstream changes: - Release swh.scheduler 0.2.1 - Bump celery requirement to 4.3+ -- Software Heritage autobuilder (on jenkins-debian1) Mon, 22 Jun 2020 10:12:50 +0000 swh-scheduler (0.2.0-1~swh1) unstable-swh; urgency=medium [ Nicolas Dandrimont ] * Switch from vcversioner to setuptools-scm * wrap-and-sort [ Software Heritage autobuilder (on jenkins-debian1) ] * New upstream release 0.2.0 - (tagged by Nicolas Dandrimont on 2020-06-22 10:33:11 +0200) * Upstream changes: - Release swh.scheduler 0.2.0 - Implement storage of lister and listed origin information - Add swh scheduler celery-monitor command - Overhaul RPC to use automatic generation -- Software Heritage autobuilder (on jenkins-debian1) Mon, 22 Jun 2020 08:36:49 +0000 swh-scheduler (0.1.1-1~swh1) unstable-swh; urgency=medium * New upstream release 0.1.1 - (tagged by Nicolas Dandrimont on 2020-06-03 11:34:19 +0200) * Upstream changes: - Release swh.scheduler v0.1.1 - Add missing dependency on future for celery 4.4.4 -- Software Heritage autobuilder (on jenkins-debian1) Wed, 03 Jun 2020 09:39:25 +0000 swh-scheduler (0.1.0-1~swh1) unstable-swh; urgency=medium * New upstream release 0.1.0 - (tagged by Nicolas Dandrimont on 2020-05-19 11:48:34 +0200) * Upstream changes: - Release swh.scheduler v0.1.0 - Blacken source code - Disable azure http logspam - Only schedule tasks when the buffer is somewhat empty -- Software Heritage autobuilder (on jenkins-debian1) Tue, 19 May 2020 09:52:31 +0000 swh-scheduler (0.0.72-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.72 - (tagged by Nicolas Dandrimont on 2020-03-23 13:07:38 +0100) * Upstream changes: - Release swh.scheduler v0.0.72 - Update instantiation of storage in tests - ensure that create_task_type is idempotent - introduce new listener based on pika -- Software Heritage autobuilder (on jenkins-debian1) Mon, 23 Mar 2020 12:12:00 +0000 swh-scheduler (0.0.71-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.71 - (tagged by Antoine R. Dumont (@ardumont) on 2020-01-23 14:24:56 +0100) * Upstream changes: - v0.0.71 - sentry: Fix initialization init_sentry call -- Software Heritage autobuilder (on jenkins-debian1) Thu, 23 Jan 2020 13:29:33 +0000 swh-scheduler (0.0.70-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.70 - (tagged by Antoine R. Dumont (@ardumont) on 2020-01-23 13:43:35 +0100) * Upstream changes: - v0.0.70 - Use swh.core.sentry instead of calling sentry_sdk.init directly - backend_es: Fix configuration mapping -- Software Heritage autobuilder (on jenkins-debian1) Thu, 23 Jan 2020 12:47:43 +0000 swh-scheduler (0.0.69-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.69 - (tagged by Antoine R. Dumont (@ardumont) on 2019-12-17 16:00:24 +0100) * Upstream changes: - v0.0.69 - Fix scheduler's archive task cli - Make the filter task endpoint a paginated endpoint - Add coverage on the archive task cli -- Software Heritage autobuilder (on jenkins-debian1) Tue, 17 Dec 2019 15:04:48 +0000 swh-scheduler (0.0.68-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.68 - (tagged by Antoine R. Dumont (@ardumont) on 2019-12-17 15:28:13 +0100) * Upstream changes: - v0.0.68 - Fix scheduler's archive task cli - Make the filter task endpoint a paginated endpoint - Add coverage on the archive task cli -- Software Heritage autobuilder (on jenkins-debian1) Tue, 17 Dec 2019 14:33:33 +0000 swh-scheduler (0.0.67-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.67 - (tagged by Antoine R. Dumont (@ardumont) on 2019-12-17 14:33:36 +0100) * Upstream changes: - v0.0.67 - Fix scheduler's archive task cli - Make the filter task endpoint a paginated endpoint - Add coverage on the archive task cli -- Software Heritage autobuilder (on jenkins-debian1) Tue, 17 Dec 2019 13:38:03 +0000 swh-scheduler (0.0.66-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.66 - (tagged by Nicolas Dandrimont on 2019-12-17 12:04:20 +0100) * Upstream changes: - Release swh.scheduler v0.0.66 - initialize sentry on celery worker startup - improve task archival endpoints in backend api -- Software Heritage autobuilder (on jenkins-debian1) Tue, 17 Dec 2019 11:08:25 +0000 swh-scheduler (0.0.65-1~swh2) unstable-swh; urgency=medium * Add pytest-mock build-dependency. -- Nicolas Dandrimont Fri, 13 Dec 2019 11:57:41 +0100 swh-scheduler (0.0.65-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.65 - (tagged by Nicolas Dandrimont on 2019-12-13 11:45:55 +0100) * Upstream changes: - Release swh.scheduler v0.0.65 - Drop the scheduler updater - Add a statsd probe for task execution timestamps - Add listener and runner statsd probes - CLI updates - Python packaging housekeeping -- Software Heritage autobuilder (on jenkins-debian1) Fri, 13 Dec 2019 10:54:31 +0000 swh-scheduler (0.0.64-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.64 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-20 14:26:00 +0100) * Upstream changes: - v0.0.64 - req-swh*: Remove old package loader backend names -- Software Heritage autobuilder (on jenkins-debian1) Wed, 20 Nov 2019 13:29:37 +0000 swh-scheduler (0.0.63-1~swh2) unstable-swh; urgency=medium * Update build dependency -- Antoine R. Dumont (@ardumont) Tue, 19 Nov 2019 17:07:40 +0100 swh-scheduler (0.0.63-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.63 - (tagged by Antoine R. Dumont (@ardumont) on 2019-11-19 14:09:12 +0100) * Upstream changes: - v0.0.63 - swh.scheduler.cli: Add `swh scheduler task-type register` cli - Use the shared_task decorator instead of binding to a specific celery app - celery/tests: mostly revert e770eb30 to fix celery app initialization in tests -- Software Heritage autobuilder (on jenkins-debian1) Tue, 19 Nov 2019 13:14:59 +0000 swh-scheduler (0.0.62-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.62 - (tagged by Antoine R. Dumont (@ardumont) on 2019-10-18 13:39:27 +0200) * Upstream changes: - v0.0.62 - celery_backend.config: Make JournalHandler import optional - tests: rewrite tests using pytest fixtures -- Software Heritage autobuilder (on jenkins-debian1) Fri, 18 Oct 2019 11:46:26 +0000 swh-scheduler (0.0.61-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.61 - (tagged by Nicolas Dandrimont on 2019-10-07 16:33:17 +0200) * Upstream changes: - Release swh.scheduler v0.0.61 - Remove bogus dict.get(default=) statement -- Software Heritage autobuilder (on jenkins-debian1) Mon, 07 Oct 2019 14:37:37 +0000 swh-scheduler (0.0.60-1~swh2) unstable-swh; urgency=medium * Force postgresql executable to a pg_ctl that exists when running tests. -- Nicolas Dandrimont Tue, 01 Oct 2019 18:14:39 +0200 swh-scheduler (0.0.60-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.60 - (tagged by Stefano Zacchiroli on 2019-10-01 13:13:13 +0200) * Upstream changes: - v0.0.60 - * tox: anticipate mypy run to just after flake8 - * init.py: switch to documented way of extending path - * tox.ini: add mypy section - * typing: minimal changes to make a no-op mypy run pass - * fix typo in docstring and sample file name - * admin CLI: drop obsolete backward compatibility aliases - * click "required" param wants bool, not int -- Software Heritage autobuilder (on jenkins-debian1) Tue, 01 Oct 2019 11:22:43 +0000 swh-scheduler (0.0.59-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.59 - (tagged by David Douard on 2019-09-04 16:08:27 +0200) * Upstream changes: - v0.0.59 -- Software Heritage autobuilder (on jenkins-debian1) Wed, 04 Sep 2019 14:11:48 +0000 swh-scheduler (0.0.58-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.58 - (tagged by Antoine R. Dumont (@ardumont) on 2019-09-03 10:19:34 +0200) * Upstream changes: - v0.0.58 - celery: auto add tasks declared in the swh.workers entry point in task_modules - api/client: use RPCClient instead of deprecated SWHRemoteAPI - Make schedule_origins use origin urls instead of ids in task arguments. - docs: add code of conduct document - docs: very beginning of a practical documentation on the scheduler - config: Add a pre-commit config file - data: Insert new cgit instance lister task - data: Insert load-tar task-type -- Software Heritage autobuilder (on jenkins-debian1) Tue, 03 Sep 2019 08:28:19 +0000 swh-scheduler (0.0.57-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.57 - (tagged by David Douard on 2019-06-26 14:56:32 +0200) * Upstream changes: - v0.0.57 -- Software Heritage autobuilder (on jenkins-debian1) Wed, 26 Jun 2019 13:05:20 +0000 swh-scheduler (0.0.56-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.56 - (tagged by Nicolas Dandrimont on 2019-05-07 18:16:20 +0200) * Upstream changes: - listener: Release the db object after using it - This is the contract that get_db/put_db is supposed to conform to. -- Software Heritage autobuilder (on jenkins-debian1) Tue, 14 May 2019 12:40:09 +0000 swh-scheduler (0.0.55-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.55 - (tagged by Antoine Lambert on 2019-05-06 11:47:43 +0200) * Upstream changes: - version 0.0.55 -- Software Heritage autobuilder (on jenkins-debian1) Mon, 06 May 2019 09:54:51 +0000 swh-scheduler (0.0.54-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.54 - (tagged by Antoine R. Dumont (@ardumont) on 2019-04-11 11:33:40 +0200) * Upstream changes: - v0.0.54 - cli_utils: Use yaml.safe_load instead of yaml.load - Fix support of latest versions of swh- core and psycopg2 - sql/data: Add npm related task types -- Software Heritage autobuilder (on jenkins-debian1) Thu, 11 Apr 2019 09:40:14 +0000 swh-scheduler (0.0.53-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.53 - (tagged by Antoine Lambert on 2019-04-04 16:45:56 +0200) * Upstream changes: - version 0.0.53 -- Software Heritage autobuilder (on jenkins-debian1) Thu, 04 Apr 2019 14:55:20 +0000 swh-scheduler (0.0.52-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.52 - (tagged by Nicolas Dandrimont on 2019-04-03 10:54:06 +0200) * Upstream changes: - Release swh.scheduler v0.0.52 - Move to result_serializer = json to work around celery 4.3 bug - Fix db initialization -- Software Heritage autobuilder (on jenkins-debian1) Wed, 03 Apr 2019 08:59:00 +0000 swh-scheduler (0.0.51-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.51 - (tagged by Antoine R. Dumont (@ardumont) on 2019-03-22 12:09:22 +0100) * Upstream changes: - v0.0.51 - requirements.txt: Remove kombu dependency -- Software Heritage autobuilder (on jenkins-debian1) Fri, 22 Mar 2019 11:16:06 +0000 swh-scheduler (0.0.50-1~swh2) unstable-swh; urgency=medium * Update build- and runtime dependencies -- Nicolas Dandrimont Fri, 15 Mar 2019 18:24:11 +0100 swh-scheduler (0.0.50-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.50 - (tagged by Nicolas Dandrimont on 2019-03-15 18:07:24 +0100) * Upstream changes: - Release swh.scheduler v0.0.50 - Add an explicit log target for stdout and/or journald - Avoid useless log lines - Improve test coverage - Add support for non- string options in the CLI -- Software Heritage autobuilder (on jenkins-debian1) Fri, 15 Mar 2019 17:16:03 +0000 swh-scheduler (0.0.49-1~swh2) unstable-swh; urgency=medium * Export LC_ALL=C.UTF-8 -- Nicolas Dandrimont Thu, 14 Mar 2019 13:42:24 +0100 swh-scheduler (0.0.49-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.49 - (tagged by Nicolas Dandrimont on 2019-03-03 08:48:04 +0100) * Upstream changes: - Release swh.scheduler v0.0.49 - various fixes around celery behavior - move wsgi endpoint to a separate module - add tests for the CLI -- Software Heritage autobuilder (on jenkins-debian1) Sun, 03 Mar 2019 07:55:41 +0000 swh-scheduler (0.0.48-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.48 - (tagged by Antoine R. Dumont (@ardumont) on 2019-02-22 16:11:51 +0100) * Upstream changes: - v0.0.48 - Fix comment on main scheduler schema -- Software Heritage autobuilder (on jenkins-debian1) Fri, 22 Feb 2019 15:17:20 +0000 swh-scheduler (0.0.47-1~swh2) unstable-swh; urgency=low * Upstream release to fix build dependencies issue -- Antoine Romain Dumont (@ardumont) Thu, 21 Feb 2019 15:41:24 +0100 swh-scheduler (0.0.47-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.47 - (tagged by Valentin Lorentz on 2019-02-20 16:53:20 +0100) * Upstream changes: - Fix crash of SchedulerBackend.search_tasks when no argument is given. -- Software Heritage autobuilder (on jenkins-debian1) Thu, 21 Feb 2019 09:13:07 +0000 swh-scheduler (0.0.46-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.46 - (tagged by Antoine R. Dumont (@ardumont) on 2019-02-15 15:05:47 +0100) * Upstream changes: - v0.0.46 - scheduler.task: Remove no longer used Task class -- Software Heritage autobuilder (on jenkins-debian1) Fri, 15 Feb 2019 14:15:26 +0000 swh-scheduler (0.0.45-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.45 - (tagged by Antoine R. Dumont (@ardumont) on 2019-02-15 10:43:07 +0100) * Upstream changes: - v0.0.45 - celery_backend/config: Fix loglevel for amqp module -- Software Heritage autobuilder (on jenkins-debian1) Fri, 15 Feb 2019 09:48:25 +0000 swh-scheduler (0.0.44-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.44 - (tagged by Antoine R. Dumont (@ardumont) on 2019-02-13 16:29:05 +0100) * Upstream changes: - v0.0.44 - swh-scheduler-api: Fix configuration read too many times -- Software Heritage autobuilder (on jenkins-debian1) Wed, 13 Feb 2019 15:34:34 +0000 swh-scheduler (0.0.43-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.43 - (tagged by David Douard on 2019-02-13 15:27:27 +0100) * Upstream changes: - v0.0.43 -- Software Heritage autobuilder (on jenkins-debian1) Wed, 13 Feb 2019 14:46:59 +0000 swh-scheduler (0.0.42-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.42 - (tagged by Antoine R. Dumont (@ardumont) on 2019-02-11 14:28:10 +0100) * Upstream changes: - v0.0.42 - Fix dependency requirements for hypothesis -- Software Heritage autobuilder (on jenkins-debian1) Mon, 11 Feb 2019 13:33:48 +0000 swh-scheduler (0.0.41-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.41 - (tagged by David Douard on 2019-02-06 15:25:56 +0100) * Upstream changes: - v0.0.41 -- Software Heritage autobuilder (on jenkins-debian1) Wed, 06 Feb 2019 15:33:04 +0000 swh-scheduler (0.0.40-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.40 - (tagged by Antoine R. Dumont (@ardumont) on 2019-01-28 16:24:04 +0100) * Upstream changes: - v0.0.40 - swh.scheduler.tests: Mark db tests as such - Force tox environment to C.UTF-8 locale - Add debug logging in the SWHTask class -- Software Heritage autobuilder (on jenkins-debian1) Mon, 28 Jan 2019 15:30:41 +0000 swh-scheduler (0.0.39-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.39 - (tagged by David Douard on 2019-01-16 13:37:58 +0100) * Upstream changes: - v0.0.39 -- Software Heritage autobuilder (on jenkins-debian1) Wed, 16 Jan 2019 12:42:37 +0000 swh-scheduler (0.0.38-1~swh1) unstable-swh; urgency=medium * New upstream release 0.0.38 - (tagged by David Douard on 2018-12-20 14:39:59 +0100) * Upstream changes: - v0.0.38 -- Software Heritage autobuilder (on jenkins-debian1) Wed, 09 Jan 2019 18:32:14 +0000 swh-scheduler (0.0.35-1~swh1) unstable-swh; urgency=medium * v0.0.35 * tests: Add SchedulerTestFixture * swh.scheduler.utils: Allow to add more task information * sql/40-swh-data: Update new indexer task types for local db -- Antoine R. Dumont (@ardumont) Mon, 29 Oct 2018 10:07:08 +0100 swh-scheduler (0.0.34-1~swh1) unstable-swh; urgency=medium * v0.0.34 * Finalize pytest migration -- Antoine R. Dumont (@ardumont) Thu, 25 Oct 2018 17:52:03 +0200 swh-scheduler (0.0.33-1~swh1) unstable-swh; urgency=medium * v0.0.33 -- David Douard Thu, 25 Oct 2018 16:03:16 +0200 swh-scheduler (0.0.32-1~swh1) unstable-swh; urgency=medium * v0.0.32 * tests: Add celery fixture to ease tests * tests: make tests use sql/ files from the package * tests: Starting migration towards pytest * listener: Make the listener code compatible with new celery (debian buster) * Make swh_scheduler_create_tasks_from_temp use indexes * setup: prepare for pypi upload * docs: add a simple README file -- Antoine R. Dumont (@ardumont) Mon, 22 Oct 2018 15:37:51 +0200 swh-scheduler (0.0.31-1~swh1) unstable-swh; urgency=medium * v0.0.31 * sql/swh-scheduler: Make the create_tasks call idempotent * swh.scheduler.utils: Open create_task_dict function * sql/scheduler-data: Add lister gitlab task types * sql/scheduler-data: Reference the existing production lister data * swh.scheduler.backend_es: Open sniffing options -- Antoine R. Dumont (@ardumont) Tue, 31 Jul 2018 06:55:39 +0200 swh-scheduler (0.0.30-1~swh1) unstable-swh; urgency=medium * v0.0.30 * swh-scheduler-schema.sql: Archive disabled oneshot tasks as well * swh.scheduler.cli: Add policy to pretty printing task routine * swh.scheduler.cli: Fix broken cli list-pending since api change -- Antoine R. Dumont (@ardumont) Fri, 22 Jun 2018 18:07:02 +0200 swh-scheduler (0.0.29-1~swh1) unstable-swh; urgency=medium * v0.0.29 * swh.scheduler.cli: Change archival period to rolling month - 1 week * swh.scheduler.updater.writer: Force filter resolution to list * swh.scheduler.cli: Change default archival period to current month * swh.scheduler.cli: Improve logging message * swh.scheduler.updater.backend: Adapt configuration path accordingly -- Antoine R. Dumont (@ardumont) Thu, 31 May 2018 11:42:51 +0200 swh-scheduler (0.0.28-1~swh1) unstable-swh; urgency=medium * v0.0.28 * Fix wrong runtime dependencies -- Antoine R. Dumont (@ardumont) Tue, 29 May 2018 14:12:15 +0200 swh-scheduler (0.0.27-1~swh1) unstable-swh; urgency=medium * v0.0.27 * scheduler: Deal with priority in tasks * scheduler-update: new package python3-swh.scheduler.updater * Contains tools in charge of consuming events from arbitrary sources * and update the scheduler db -- Antoine R. Dumont (@ardumont) Tue, 29 May 2018 12:27:34 +0200 swh-scheduler (0.0.26-1~swh1) unstable-swh; urgency=medium * v0.0.26 * swh.scheduler: Fix package build * swh.scheduler.tests: Test remote scheduler api as well * swh.scheduler: Add tests around removing archivable tasks * swh.scheduler: Add tests around filtering archivable tasks * swh-scheduler-schema: Fix unneeded drop instructions * swh.scheduler.cli: Improve docstring * swh.scheduler.cli: Permit to specify the backend to use in cli * swh.scheduler.api: Bootstrap scheduler's remote api * swh.scheduler: Use `get_scheduler` api to instantiate a scheduler * swh.scheduler.backend: Fix docstring -- Antoine R. Dumont (@ardumont) Thu, 26 Apr 2018 17:34:07 +0200 swh-scheduler (0.0.25-1~swh1) unstable-swh; urgency=medium * v0.0.25 * swh.scheduler.cli.archive: Index arguments.kwargs as text -- Antoine R. Dumont (@ardumont) Wed, 18 Apr 2018 12:34:43 +0200 swh-scheduler (0.0.24-1~swh1) unstable-swh; urgency=medium * v0.0.24 * data/template: Do not index the arguments field (it's in _source) * data/README: Add a small readme to explain es install step * swh.scheduler.cli: Add a bulk index flag to separate read from index -- Antoine R. Dumont (@ardumont) Fri, 13 Apr 2018 14:55:32 +0200 swh-scheduler (0.0.23-1~swh1) unstable-swh; urgency=medium * swh.scheduler.cli.archive: Delete only completely indexed tasks * Prior to this commit, it could happen that we removed tasks even * though we did not yet index associated task_run. * Related T986 -- Antoine R. Dumont (@ardumont) Tue, 10 Apr 2018 17:43:07 +0200 swh-scheduler (0.0.22-1~swh1) unstable-swh; urgency=medium * v0.0.22 * Update to a more recent python3-elasticsearch client -- Antoine R. Dumont (@ardumont) Mon, 09 Apr 2018 16:09:16 +0200 swh-scheduler (0.0.21-1~swh1) unstable-swh; urgency=medium * v0.0.21 * Adapt default configuration * Fix typo in configuration variable name -- Antoine R. Dumont (@ardumont) Fri, 30 Mar 2018 15:02:55 +0200 swh-scheduler (0.0.20-1~swh1) unstable-swh; urgency=medium * v0.0.20 * swh.scheduler.cli.archive: Open completed oneshot or disabled * recurring tasks archival endpoint * swh.core.serializer: Move to msgpack serialization format * swh.scheduler.cli: Unify pretty print output * sql/data: Add new task type for loading mercurial dump * swh.scheduler.cli: Add sample use case for the scheduling cli * swh.scheduler.cli: Open policy column to the scheduling cli * swh.scheduler.cli: Open the delimiter option as cli argument * Fix issue when updating task-type without any retry delay defined * swh-scheduler/data: Add new oneshot scheduling load-mercurial task * backend: fix default scheduling_db value for consistency * backend: doc: fix return value of create_tasks -- Antoine R. Dumont (@ardumont) Fri, 30 Mar 2018 11:44:18 +0200 swh-scheduler (0.0.19-1~swh1) unstable-swh; urgency=medium * v0.0.19 * swh.scheduler.utils: Open utility function to create oneshot task -- Antoine R. Dumont (@ardumont) Wed, 29 Nov 2017 12:51:15 +0100 swh-scheduler (0.0.18-1~swh1) unstable-swh; urgency=medium * Release swh.scheduler v0.0.18 * Celery 4 compatibility -- Nicolas Dandrimont Wed, 08 Nov 2017 17:06:22 +0100 swh-scheduler (0.0.17-1~swh1) unstable-swh; urgency=medium * Release swh.scheduler version 0.0.17 * Update packaging runes -- Nicolas Dandrimont Thu, 12 Oct 2017 18:49:02 +0200 swh-scheduler (0.0.16-1~swh1) unstable-swh; urgency=medium * Release swh-scheduler v0.0.16 * add some tests * implement one-shot tasks * implement retry on temporary failure -- Nicolas Dandrimont Mon, 07 Aug 2017 18:44:03 +0200 swh-scheduler (0.0.15-1~swh1) unstable-swh; urgency=medium * Release swh-scheduler v0.0.15 * Add some methods to get the length of task queues * worker: Show logs on stdout if loglevel = debug -- Nicolas Dandrimont Mon, 19 Jun 2017 19:44:56 +0200 swh-scheduler (0.0.14-1~swh1) unstable-swh; urgency=medium * Release swh.scheduler 0.0.14 * Make the return value of tasks available in the listener -- Nicolas Dandrimont Mon, 12 Jun 2017 17:50:32 +0200 swh-scheduler (0.0.13-1~swh1) unstable-swh; urgency=medium * Release swh.scheduler v0.0.13 * Use systemd for logging rather than PostgreSQL -- Nicolas Dandrimont Fri, 07 Apr 2017 11:57:50 +0200 swh-scheduler (0.0.12-1~swh1) unstable-swh; urgency=medium * Release swh.scheduler v0.0.12 * Only log to database if the configuration is present -- Nicolas Dandrimont Thu, 09 Mar 2017 11:12:45 +0100 swh-scheduler (0.0.11-1~swh1) unstable-swh; urgency=medium * Release swh.scheduler v0.0.11 * add utils.get_task -- Nicolas Dandrimont Tue, 14 Feb 2017 19:49:34 +0100 swh-scheduler (0.0.10-1~swh1) unstable-swh; urgency=medium * Release swh.scheduler v0.0.10 * Allow disabling tasks -- Nicolas Dandrimont Thu, 20 Oct 2016 17:20:17 +0200 swh-scheduler (0.0.9-1~swh1) unstable-swh; urgency=medium * Release swh.scheduler v0.0.9 * Revert management of one shot tasks * Add possibility of launching several worker instances -- Nicolas Dandrimont Fri, 02 Sep 2016 17:09:18 +0200 swh-scheduler (0.0.7-1~swh1) unstable-swh; urgency=medium * v0.0.7 * Add oneshot task -- Antoine R. Dumont (@ardumont) Fri, 01 Jul 2016 16:42:45 +0200 swh-scheduler (0.0.6-1~swh1) unstable-swh; urgency=medium * Release swh-scheduler v0.0.6 * More reliability and efficiency when scheduling a lot of tasks -- Nicolas Dandrimont Wed, 24 Feb 2016 18:46:57 +0100 swh-scheduler (0.0.5-1~swh1) unstable-swh; urgency=medium * Release swh.scheduler v0.0.5 * Use copy for task mass-scheduling -- Nicolas Dandrimont Wed, 24 Feb 2016 12:13:38 +0100 swh-scheduler (0.0.4-1~swh1) unstable-swh; urgency=medium * Release swh-scheduler v0.0.4 * general cleanup of the backend * use arrow instead of dateutil * add new cli program -- Nicolas Dandrimont Tue, 23 Feb 2016 17:46:04 +0100 swh-scheduler (0.0.3-1~swh1) unstable-swh; urgency=medium * Release swh.scheduler version 0.0.3 * Implement the timestamp arguments to the task_run functions * Make the celery event listener use a reliable queue -- Nicolas Dandrimont Mon, 22 Feb 2016 15:14:28 +0100 swh-scheduler (0.0.2-1~swh1) unstable-swh; urgency=medium * Release swh.scheduler v0.0.2 * Multiple schema changes * Initial releases for the celery job runner and the event listener -- Nicolas Dandrimont Fri, 19 Feb 2016 18:50:47 +0100 swh-scheduler (0.0.1-1~swh1) unstable-swh; urgency=medium * Initial release * Release swh.scheduler v0.0.1 * Move swh.core.scheduling and swh.core.worker to swh.scheduler -- Nicolas Dandrimont Mon, 15 Feb 2016 11:07:30 +0100 diff --git a/sql/updates/30-bis.sql b/sql/updates/30-bis.sql new file mode 100644 index 0000000..789a0ac --- /dev/null +++ b/sql/updates/30-bis.sql @@ -0,0 +1,8 @@ +-- SWH DB schema upgrade +-- from_version: 30 +-- to_version: 30 +-- description: Bound existing next position offset to a max of 10 + +update origin_visit_stats + set next_position_offset = 10 + where next_position_offset > 10; diff --git a/swh.scheduler.egg-info/PKG-INFO b/swh.scheduler.egg-info/PKG-INFO index 79667e4..2b8fcf9 100644 --- a/swh.scheduler.egg-info/PKG-INFO +++ b/swh.scheduler.egg-info/PKG-INFO @@ -1,37 +1,37 @@ Metadata-Version: 2.1 Name: swh.scheduler -Version: 0.17.0 +Version: 0.17.1 Summary: Software Heritage Scheduler Home-page: https://forge.softwareheritage.org/diffusion/DSCH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-scheduler Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-scheduler/ Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing Provides-Extra: journal Provides-Extra: simulator License-File: LICENSE License-File: LICENSE.Celery License-File: AUTHORS swh-scheduler ============= Job scheduler for the Software Heritage project. Task manager for asynchronous/delayed tasks, used for both recurrent (e.g., listing a forge, loading new stuff from a Git repository) and one-off activities (e.g., loading a specific version of a source package). diff --git a/swh.scheduler.egg-info/SOURCES.txt b/swh.scheduler.egg-info/SOURCES.txt index 53a80c2..d76e928 100644 --- a/swh.scheduler.egg-info/SOURCES.txt +++ b/swh.scheduler.egg-info/SOURCES.txt @@ -1,135 +1,136 @@ .gitignore .pre-commit-config.yaml AUTHORS CODE_OF_CONDUCT.md CONTRIBUTORS LICENSE LICENSE.Celery MANIFEST.in Makefile README.md conftest.py mypy.ini pyproject.toml pytest.ini requirements-journal.txt requirements-simulator.txt requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py tox.ini data/README.md data/elastic-template.json data/update-index-settings.json docs/.gitignore docs/Makefile docs/cli.rst docs/conf.py docs/index.rst docs/simulator.rst docs/_static/.placeholder docs/_templates/.placeholder sql/.gitignore sql/Makefile sql/updates/02.sql sql/updates/03.sql sql/updates/04.sql sql/updates/05.sql sql/updates/06.sql sql/updates/07.sql sql/updates/08.sql sql/updates/09.sql sql/updates/10.sql sql/updates/11.sql sql/updates/12.sql sql/updates/13.sql sql/updates/14.sql sql/updates/15.sql sql/updates/16.sql sql/updates/17.sql sql/updates/18.sql sql/updates/19.sql sql/updates/20.sql sql/updates/23.sql sql/updates/24.sql sql/updates/25.sql sql/updates/26.sql sql/updates/27.sql sql/updates/28.sql sql/updates/29.sql +sql/updates/30-bis.sql sql/updates/30.sql swh/__init__.py swh.scheduler.egg-info/PKG-INFO swh.scheduler.egg-info/SOURCES.txt swh.scheduler.egg-info/dependency_links.txt swh.scheduler.egg-info/entry_points.txt swh.scheduler.egg-info/requires.txt swh.scheduler.egg-info/top_level.txt swh/scheduler/__init__.py swh/scheduler/backend.py swh/scheduler/backend_es.py swh/scheduler/cli_utils.py swh/scheduler/elasticsearch_memory.py swh/scheduler/exc.py swh/scheduler/interface.py swh/scheduler/journal_client.py swh/scheduler/model.py swh/scheduler/py.typed swh/scheduler/pytest_plugin.py swh/scheduler/task.py swh/scheduler/utils.py swh/scheduler/api/__init__.py swh/scheduler/api/client.py swh/scheduler/api/serializers.py swh/scheduler/api/server.py swh/scheduler/celery_backend/__init__.py swh/scheduler/celery_backend/config.py swh/scheduler/celery_backend/listener.py swh/scheduler/celery_backend/pika_listener.py swh/scheduler/celery_backend/runner.py swh/scheduler/cli/__init__.py swh/scheduler/cli/admin.py swh/scheduler/cli/celery_monitor.py swh/scheduler/cli/journal.py swh/scheduler/cli/origin.py swh/scheduler/cli/simulator.py swh/scheduler/cli/task.py swh/scheduler/cli/task_type.py swh/scheduler/cli/utils.py swh/scheduler/simulator/__init__.py swh/scheduler/simulator/common.py swh/scheduler/simulator/origin_scheduler.py swh/scheduler/simulator/origins.py swh/scheduler/simulator/task_scheduler.py swh/scheduler/sql/10-superuser-init.sql swh/scheduler/sql/30-schema.sql swh/scheduler/sql/40-func.sql swh/scheduler/sql/50-data.sql swh/scheduler/sql/60-indexes.sql swh/scheduler/tests/__init__.py swh/scheduler/tests/common.py swh/scheduler/tests/conftest.py swh/scheduler/tests/tasks.py swh/scheduler/tests/test_api_client.py swh/scheduler/tests/test_celery_tasks.py swh/scheduler/tests/test_cli.py swh/scheduler/tests/test_cli_celery_monitor.py swh/scheduler/tests/test_cli_journal.py swh/scheduler/tests/test_cli_origin.py swh/scheduler/tests/test_cli_task_type.py swh/scheduler/tests/test_common.py swh/scheduler/tests/test_config.py swh/scheduler/tests/test_init.py swh/scheduler/tests/test_journal_client.py swh/scheduler/tests/test_model.py swh/scheduler/tests/test_scheduler.py swh/scheduler/tests/test_server.py swh/scheduler/tests/test_simulator.py swh/scheduler/tests/test_utils.py swh/scheduler/tests/es/__init__.py swh/scheduler/tests/es/conftest.py swh/scheduler/tests/es/test_backend_es.py swh/scheduler/tests/es/test_cli_task.py swh/scheduler/tests/es/test_elasticsearch_memory.py \ No newline at end of file diff --git a/swh/scheduler/journal_client.py b/swh/scheduler/journal_client.py index 6a8ce61..3835635 100644 --- a/swh/scheduler/journal_client.py +++ b/swh/scheduler/journal_client.py @@ -1,291 +1,299 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import datetime, timedelta import random from typing import Dict, List, Optional, Tuple import attr from swh.scheduler.interface import SchedulerInterface from swh.scheduler.model import LastVisitStatus, OriginVisitStats from swh.scheduler.utils import utcnow msg_type = "origin_visit_status" DISABLE_ORIGIN_THRESHOLD = 3 """Threshold to disable failing origins""" +MAX_NEXT_POSITION_OFFSET = 10 +"""Max next position offset to avoid date computation overflow""" + def max_date(*dates: Optional[datetime]) -> datetime: """Return the max date of given (possibly None) dates At least one date must be not None. """ filtered_dates = [d for d in dates if d is not None] if not filtered_dates: raise ValueError("At least one date should be a valid datetime") return max(filtered_dates) def from_position_offset_to_days(position_offset: int) -> int: - """Compute position offset to interval in days. + """Compute position offset to interval in days. Note that this does not bound the + position_offset input so client code should limit the date computation to avoid + overflow errors. - - index 0 and 1: interval 1 day - - index 2, 3 and 4: interval 2 days - - index 5 and up: interval `4^(n-4)` days for n in (4, 16, 64, 256, 1024, ...) + - index in [0:1]: interval 1 day + - index in [2:4]: interval 2 days + - index in [5:+inf]: interval `4^(index-4)` days Args: position_offset: The actual position offset for a given visit stats Returns: - The offset as an interval in number of days + The offset as an interval in number of days. """ assert position_offset >= 0 if position_offset < 2: result = 1 elif position_offset < 5: result = 2 else: result = 4 ** (position_offset - 4) return result def next_visit_queue_position( queue_position_per_visit_type: Dict, visit_stats: Dict ) -> datetime: """Compute the next visit queue position for the given visit_stats. This takes the visit_stats next_position_offset value and compute a corresponding interval in days (with a random fudge factor of -/+ 10% range to avoid scheduling burst for hosters). Then computes out of this visit interval and the current visit stats's position in the queue a new position. As an implementation detail, if the visit stats does not have a queue position yet, this fallbacks to use the current global position (for the same visit type as the visit stats) to compute the new position in the queue. If there is no global state yet for the visit type, this starts up using the ``utcnow`` function as default value. Args: queue_position_per_visit_type: The global state of the queue per visit type visit_stats: The actual visit information to compute the next position for Returns: The actual next visit queue position for that visit stats """ days = from_position_offset_to_days(visit_stats["next_position_offset"]) random_fudge_factor = random.uniform(-0.1, 0.1) visit_interval = timedelta(days=days * (1 + random_fudge_factor)) # Use the current queue position per visit type as starting position if none is # already set default_queue_position = queue_position_per_visit_type.get( visit_stats["visit_type"], utcnow() ) current_position = ( visit_stats["next_visit_queue_position"] if visit_stats.get("next_visit_queue_position") else default_queue_position ) return current_position + visit_interval def get_last_status( incoming_visit_status: Dict, known_visit_stats: Dict ) -> Tuple[LastVisitStatus, Optional[bool]]: """Determine the `last_visit_status` and eventfulness of an origin according to the received visit_status object, and the state of the origin_visit_stats in db. Note that at the time this function is called, out of order messages were already discarded. Thus why the implementation is rather simple. Args: incoming_visit_status: Incoming visit status read ouf of the journal known_visit_stats: Visit stats already registered in the backend Returns: A tuple of (LastVisitStatus, Optional[bool]). LastVisitStatus represents the successfulness of the visit. Optional[bool] represents whether the snapshot is fresher than before (True/False) or None if there is no snapshot at all. """ status = incoming_visit_status["status"] if status in ("not_found", "failed"): return LastVisitStatus(status), None assert status in ("full", "partial") if incoming_visit_status["snapshot"] is None: return LastVisitStatus.failed, None if incoming_visit_status["snapshot"] != known_visit_stats.get("last_snapshot"): return LastVisitStatus.successful, True return LastVisitStatus.successful, False def process_journal_objects( messages: Dict[str, List[Dict]], *, scheduler: SchedulerInterface ) -> None: f"""Read messages from origin_visit_status journal topic to update "origin_visit_stats" information on (origin, visit_type). The goal is to compute visit stats information per origin and visit_type: `last_successful`, `last_visit`, `last_visit_status`, ... Details: - This journal consumes origin visit status information for final visit status (`"full"`, `"partial"`, `"failed"`, `"not_found"`). It drops the information of non final visit statuses (`"ongoing"`, `"created"`). - This journal client only considers messages that arrive in chronological order. Messages that arrive out of order (i.e. with a date field smaller than the latest recorded visit of the origin) are ignored. This is a tradeoff between correctness and simplicity of implementation [1]_. - The snapshot is used to determine the eventful or uneventful nature of the origin visit. - When no snapshot is provided, the visit is considered as failed. - Finally, the `next_visit_queue_position` (time at which some new objects are expected to be added for the origin), and `next_position_offset` (duration that we expect to wait between visits of this origin) are updated. - When visits fails at least {DISABLE_ORIGIN_THRESHOLD} times in a row, the origins are disabled in the scheduler table. It's up to the lister to activate those back when they are listed again. This is a worker function to be used with `JournalClient.process(worker_fn)`, after currification of `scheduler` and `task_names`. .. [1] Ignoring out of order messages makes the initialization of the origin_visit_status table (from a full journal) less deterministic: only the `last_visit`, `last_visit_state` and `last_successful` fields are guaranteed to be exact, the `next_position_offset` field is a best effort estimate (which should converge once the client has run for a while on in-order messages). """ assert set(messages) <= { msg_type }, f"Got unexpected {', '.join(set(messages) - set([msg_type]))} message types" assert msg_type in messages, f"Expected {msg_type} messages" interesting_messages = [ msg for msg in messages[msg_type] if "type" in msg and msg["status"] not in ("created", "ongoing") ] if not interesting_messages: return origin_visit_stats: Dict[Tuple[str, str], Dict] = { (visit_stats.url, visit_stats.visit_type): attr.asdict(visit_stats) for visit_stats in scheduler.origin_visit_stats_get( list(set((vs["origin"], vs["type"]) for vs in interesting_messages)) ) } # Use the default values from the model object empty_object = { field.name: field.default if field.default != attr.NOTHING else None for field in attr.fields(OriginVisitStats) } disabled_urls: List[str] = [] # Retrieve the global queue state queue_position_per_visit_type = scheduler.visit_scheduler_queue_position_get() for msg_dict in interesting_messages: origin = msg_dict["origin"] visit_type = msg_dict["type"] pk = origin, visit_type if pk not in origin_visit_stats: origin_visit_stats[pk] = { **empty_object, "url": origin, "visit_type": visit_type, } visit_stats_d = origin_visit_stats[pk] if ( visit_stats_d.get("last_visit") and msg_dict["date"] <= visit_stats_d["last_visit"] ): # message received out of order, ignore continue # Compare incoming message to known status of the origin, to determine # eventfulness last_visit_status, eventful = get_last_status(msg_dict, visit_stats_d) # Update the position offset according to the visit status, # if we had already visited this origin before. if visit_stats_d.get("last_visit"): # Update the next position offset according to the existing value and the # eventfulness of the visit. increment = -2 if eventful else 1 - visit_stats_d["next_position_offset"] = max( - 0, visit_stats_d["next_position_offset"] + increment + # Limit the next_position_offset for acceptable date computations + current_offset = min( + visit_stats_d["next_position_offset"] + increment, + MAX_NEXT_POSITION_OFFSET, ) + visit_stats_d["next_position_offset"] = max(0, current_offset) # increment the counter when last_visit_status is the same same_visit_status = last_visit_status == visit_stats_d["last_visit_status"] else: same_visit_status = False # Record current visit date as highest known date (we've rejected out of order # messages earlier). visit_stats_d["last_visit"] = msg_dict["date"] visit_stats_d["last_visit_status"] = last_visit_status # Record last successful visit date if last_visit_status == LastVisitStatus.successful: visit_stats_d["last_successful"] = max_date( msg_dict["date"], visit_stats_d.get("last_successful") ) visit_stats_d["last_snapshot"] = msg_dict["snapshot"] # Update the next visit queue position (which will be used solely for origin # without any last_update, cf. the dedicated scheduling policy # "origins_without_last_update") visit_stats_d["next_visit_queue_position"] = next_visit_queue_position( queue_position_per_visit_type, visit_stats_d ) visit_stats_d["successive_visits"] = ( visit_stats_d["successive_visits"] + 1 if same_visit_status else 1 ) # Disable recurring failing/not-found origins if ( visit_stats_d["last_visit_status"] in [LastVisitStatus.not_found, LastVisitStatus.failed] ) and visit_stats_d["successive_visits"] >= DISABLE_ORIGIN_THRESHOLD: disabled_urls.append(visit_stats_d["url"]) scheduler.origin_visit_stats_upsert( OriginVisitStats(**ovs) for ovs in origin_visit_stats.values() ) # Disable any origins if any if disabled_urls: disabled_origins = [] for url in disabled_urls: origins = scheduler.get_listed_origins(url=url).results if len(origins) > 0: origin = attr.evolve(origins[0], enabled=False) disabled_origins.append(origin) if disabled_origins: scheduler.record_listed_origins(disabled_origins) diff --git a/swh/scheduler/sql/50-data.sql b/swh/scheduler/sql/50-data.sql index 0fab9f0..25a86e4 100644 --- a/swh/scheduler/sql/50-data.sql +++ b/swh/scheduler/sql/50-data.sql @@ -1,182 +1,182 @@ insert into task_type( type, description, backend_name, default_interval, min_interval, max_interval, backoff_factor, max_queue_length) values ( 'load-svn-from-archive', 'Loading svn repositories from svn dump', 'swh.loader.svn.tasks.MountAndLoadSvnRepository', '1 day', '1 day', '1 day', 1, 1000); insert into task_type( type, description, backend_name, default_interval, min_interval, max_interval, backoff_factor, max_queue_length) values ( 'load-svn', 'Create dump of a remote svn repository, mount it and load it', 'swh.loader.svn.tasks.DumpMountAndLoadSvnRepository', '1 day', '1 day', '1 day', 1, 1000); insert into task_type( type, description, backend_name, default_interval, min_interval, max_interval, backoff_factor, num_retries, max_queue_length) values ( 'check-deposit', 'Pre-checking deposit step before loading into swh archive', 'swh.deposit.loader.tasks.ChecksDepositTsk', '1 day', '1 day', '1 day', 1, 3, 1000); insert into task_type( type, description, backend_name, default_interval, min_interval, max_interval, backoff_factor, max_queue_length) values ( 'cook-vault-bundle', 'Cook a Vault bundle', 'swh.vault.cooking_tasks.SWHCookingTask', '1 day', '1 day', '1 day', 1, 10000); insert into task_type( type, description, backend_name, default_interval, min_interval, max_interval, backoff_factor, max_queue_length) values ( 'load-hg', 'Loading mercurial repository swh-loader-mercurial', 'swh.loader.mercurial.tasks.LoadMercurial', '1 day', '1 day', '1 day', 1, 1000); insert into task_type( type, description, backend_name, default_interval, min_interval, max_interval, backoff_factor, max_queue_length) values ( 'load-hg-from-archive', 'Loading archive mercurial repository swh-loader-mercurial', 'swh.loader.mercurial.tasks.LoadArchiveMercurial', '1 day', '1 day', '1 day', 1, 1000); insert into task_type( type, description, backend_name, default_interval, min_interval, max_interval, backoff_factor, max_queue_length) values ( 'load-git', 'Update an origin of type git', 'swh.loader.git.tasks.UpdateGitRepository', '64 days', '12:00:00', '64 days', 2, 5000); insert into task_type( type, description, backend_name, default_interval, min_interval, max_interval, backoff_factor, max_queue_length) values ( 'index-mimetype', 'Mimetype indexer task', 'swh.indexer.tasks.ContentMimetype', '1 day', '12:00:00', '1 days', 2, 5000); insert into task_type( type, description, backend_name, default_interval, min_interval, max_interval, backoff_factor, max_queue_length) values ( 'index-mimetype-for-range', 'Mimetype Range indexer task', 'swh.indexer.tasks.ContentRangeMimetype', '1 day', '12:00:00', '1 days', 2, 5000); insert into task_type( type, description, backend_name, default_interval, min_interval, max_interval, backoff_factor, max_queue_length) values ( 'index-fossology-license', 'Fossology license indexer task', 'swh.indexer.tasks.ContentFossologyLicense', '1 day', '12:00:00', '1 days', 2, 5000); insert into task_type( type, description, backend_name, default_interval, min_interval, max_interval, backoff_factor, max_queue_length) values ( - 'index-fossology-license-for-range', - 'Fossology license range indexer task', - 'swh.indexer.tasks.ContentRangeFossologyLicense', + 'index-fossology-license-for-partition', + 'Fossology license partition indexer task', + 'swh.indexer.tasks.ContentFossologyLicensePartition', '1 day', '12:00:00', '1 days', 2, 5000); insert into task_type( type, description, backend_name, default_interval, min_interval, max_interval, backoff_factor, max_queue_length) values ( 'index-origin-head', 'Origin Head indexer task', 'swh.indexer.tasks.OriginHead', '1 day', '12:00:00', '1 days', 2, 5000); insert into task_type( type, description, backend_name, default_interval, min_interval, max_interval, backoff_factor, max_queue_length) values ( 'index-revision-metadata', 'Revision Metadata indexer task', 'swh.indexer.tasks.RevisionMetadata', '1 day', '12:00:00', '1 days', 2, 5000); insert into task_type( type, description, backend_name, default_interval, min_interval, max_interval, backoff_factor, max_queue_length) values ( 'index-origin-metadata', 'Origin Metadata indexer task', 'swh.indexer.tasks.OriginMetadata', '1 day', '12:00:00', '1 days', 2, 20000); diff --git a/swh/scheduler/tests/test_journal_client.py b/swh/scheduler/tests/test_journal_client.py index 17b42f7..b95d81e 100644 --- a/swh/scheduler/tests/test_journal_client.py +++ b/swh/scheduler/tests/test_journal_client.py @@ -1,996 +1,997 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime from datetime import timedelta import functools from itertools import permutations from typing import List from unittest.mock import Mock import attr import pytest from swh.model.hashutil import hash_to_bytes from swh.scheduler.journal_client import ( from_position_offset_to_days, max_date, next_visit_queue_position, process_journal_objects, ) from swh.scheduler.model import LastVisitStatus, ListedOrigin, OriginVisitStats from swh.scheduler.utils import utcnow def test_journal_client_origin_visit_status_from_journal_fail(swh_scheduler): process_fn = functools.partial(process_journal_objects, scheduler=swh_scheduler,) with pytest.raises(AssertionError, match="Got unexpected origin_visit"): process_fn({"origin_visit": [{"url": "http://foobar.baz"},]}) with pytest.raises(AssertionError, match="Expected origin_visit_status"): process_fn({}) ONE_DAY = datetime.timedelta(days=1) ONE_YEAR = datetime.timedelta(days=366) DATE3 = utcnow() DATE2 = DATE3 - ONE_DAY DATE1 = DATE2 - ONE_DAY assert DATE1 < DATE2 < DATE3 @pytest.mark.parametrize( "dates,expected_max_date", [ ((DATE1,), DATE1), ((None, DATE2), DATE2), ((DATE1, None), DATE1), ((DATE1, DATE2), DATE2), ((DATE2, DATE1), DATE2), ((DATE1, DATE2, DATE3), DATE3), ((None, DATE2, DATE3), DATE3), ((None, None, DATE3), DATE3), ((DATE1, None, DATE3), DATE3), ], ) def test_max_date(dates, expected_max_date): assert max_date(*dates) == expected_max_date def test_max_date_raise(): with pytest.raises(ValueError, match="valid datetime"): max_date() with pytest.raises(ValueError, match="valid datetime"): max_date(None) with pytest.raises(ValueError, match="valid datetime"): max_date(None, None) def test_journal_client_origin_visit_status_from_journal_ignored_status(swh_scheduler): """Only final statuses (full, partial) are important, the rest remain ignored. """ # Trace method calls on the swh_scheduler swh_scheduler = Mock(wraps=swh_scheduler) visit_statuses = [ { "origin": "foo", "visit": 1, "status": "created", "date": utcnow(), "type": "git", "snapshot": None, }, { "origin": "bar", "visit": 1, "status": "ongoing", "date": utcnow(), "type": "svn", "snapshot": None, }, ] process_journal_objects( {"origin_visit_status": visit_statuses}, scheduler=swh_scheduler ) # All messages have been ignored: no stats have been upserted swh_scheduler.origin_visit_stats_upsert.assert_not_called() def test_journal_client_ignore_missing_type(swh_scheduler): """Ignore statuses with missing type key""" # Trace method calls on the swh_scheduler swh_scheduler = Mock(wraps=swh_scheduler) date = utcnow() snapshot = hash_to_bytes("dddcc0710eb6cf9efd5b920a8453e1e07157bddd") visit_statuses = [ { "origin": "foo", "visit": 1, "status": "full", "date": date, "snapshot": snapshot, }, ] process_journal_objects( {"origin_visit_status": visit_statuses}, scheduler=swh_scheduler ) # The message has been ignored: no stats have been upserted swh_scheduler.origin_visit_stats_upsert.assert_not_called() def assert_visit_stats_ok( actual_visit_stats: OriginVisitStats, expected_visit_stats: OriginVisitStats, ignore_fields: List[str] = ["next_visit_queue_position"], ): """Utility test function to ensure visits stats read from the backend are in the right shape. The comparison on the next_visit_queue_position will be dealt with in dedicated tests so it's not tested in tests that are calling this function. """ fields = attr.fields_dict(OriginVisitStats) defaults = {field: fields[field].default for field in ignore_fields} actual_visit_stats = attr.evolve(actual_visit_stats, **defaults) assert actual_visit_stats == expected_visit_stats def test_journal_client_origin_visit_status_from_journal_last_not_found(swh_scheduler): visit_status = { "origin": "foo", "visit": 1, "status": "not_found", "date": DATE1, "type": "git", "snapshot": None, } process_journal_objects( {"origin_visit_status": [visit_status]}, scheduler=swh_scheduler ) actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([("foo", "git")]) assert_visit_stats_ok( actual_origin_visit_stats[0], OriginVisitStats( url="foo", visit_type="git", last_visit=visit_status["date"], last_visit_status=LastVisitStatus.not_found, next_position_offset=4, successive_visits=1, ), ) visit_statuses = [ { "origin": "foo", "visit": 3, "status": "not_found", "date": DATE2, "type": "git", "snapshot": None, }, { "origin": "foo", "visit": 4, "status": "not_found", "date": DATE3, "type": "git", "snapshot": None, }, ] process_journal_objects( {"origin_visit_status": visit_statuses}, scheduler=swh_scheduler ) actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([("foo", "git")]) assert_visit_stats_ok( actual_origin_visit_stats[0], OriginVisitStats( url="foo", visit_type="git", last_visit=DATE3, last_visit_status=LastVisitStatus.not_found, next_position_offset=6, successive_visits=3, ), ) def test_journal_client_origin_visit_status_from_journal_last_failed(swh_scheduler): visit_statuses = [ { "origin": "foo", "visit": 1, "status": "partial", "date": utcnow(), "type": "git", "snapshot": None, }, { "origin": "bar", "visit": 1, "status": "full", "date": DATE1, "type": "git", "snapshot": None, }, { "origin": "bar", "visit": 2, "status": "full", "date": DATE2, "type": "git", "snapshot": None, }, { "origin": "bar", "visit": 3, "status": "full", "date": DATE3, "type": "git", "snapshot": None, }, ] process_journal_objects( {"origin_visit_status": visit_statuses}, scheduler=swh_scheduler ) actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([("bar", "git")]) assert_visit_stats_ok( actual_origin_visit_stats[0], OriginVisitStats( url="bar", visit_type="git", last_visit=DATE3, last_visit_status=LastVisitStatus.failed, next_position_offset=6, successive_visits=3, ), ) def test_journal_client_origin_visit_status_from_journal_last_failed2(swh_scheduler): visit_statuses = [ { "origin": "bar", "visit": 2, "status": "failed", "date": DATE1, "type": "git", "snapshot": hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"), }, { "origin": "bar", "visit": 3, "status": "failed", "date": DATE2, "type": "git", "snapshot": None, }, ] process_journal_objects( {"origin_visit_status": visit_statuses}, scheduler=swh_scheduler ) actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([("bar", "git")]) assert_visit_stats_ok( actual_origin_visit_stats[0], OriginVisitStats( url="bar", visit_type="git", last_visit=DATE2, last_visit_status=LastVisitStatus.failed, next_position_offset=5, successive_visits=2, ), ) def test_journal_client_origin_visit_status_from_journal_last_successful(swh_scheduler): visit_statuses = [ { "origin": "bar", "visit": 1, "status": "partial", "date": utcnow(), "type": "git", "snapshot": hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"), }, { "origin": "foo", "visit": 1, "status": "full", "date": DATE1, "type": "git", "snapshot": hash_to_bytes("eeecc0710eb6cf9efd5b920a8453e1e07157bfff"), }, { "origin": "foo", "visit": 2, "status": "partial", "date": DATE2, "type": "git", "snapshot": hash_to_bytes("aaacc0710eb6cf9efd5b920a8453e1e07157baaa"), }, { "origin": "foo", "visit": 3, "status": "full", "date": DATE3, "type": "git", "snapshot": hash_to_bytes("dddcc0710eb6cf9efd5b920a8453e1e07157bddd"), }, ] process_journal_objects( {"origin_visit_status": visit_statuses}, scheduler=swh_scheduler ) actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([("foo", "git")]) assert_visit_stats_ok( actual_origin_visit_stats[0], OriginVisitStats( url="foo", visit_type="git", last_successful=DATE3, last_visit=DATE3, last_visit_status=LastVisitStatus.successful, last_snapshot=hash_to_bytes("dddcc0710eb6cf9efd5b920a8453e1e07157bddd"), next_position_offset=0, successive_visits=3, ), ) def test_journal_client_origin_visit_status_from_journal_last_uneventful(swh_scheduler): visit_status = { "origin": "foo", "visit": 1, "status": "full", "date": DATE3 + ONE_DAY, "type": "git", "snapshot": hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"), } # Let's insert some visit stats with some previous visit information swh_scheduler.origin_visit_stats_upsert( [ OriginVisitStats( url=visit_status["origin"], visit_type=visit_status["type"], last_successful=DATE2, last_visit=DATE3, last_visit_status=LastVisitStatus.failed, last_snapshot=visit_status["snapshot"], next_visit_queue_position=None, next_position_offset=4, successive_visits=1, ) ] ) process_journal_objects( {"origin_visit_status": [visit_status]}, scheduler=swh_scheduler ) actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( [(visit_status["origin"], visit_status["type"])] ) assert_visit_stats_ok( actual_origin_visit_stats[0], OriginVisitStats( url=visit_status["origin"], visit_type=visit_status["type"], last_visit=DATE3 + ONE_DAY, last_successful=DATE3 + ONE_DAY, last_visit_status=LastVisitStatus.successful, last_snapshot=visit_status["snapshot"], next_visit_queue_position=None, next_position_offset=5, successive_visits=1, ), ) VISIT_STATUSES = [ {**ovs, "date": DATE1 + n * ONE_DAY} for n, ovs in enumerate( [ { "origin": "foo", "type": "git", "visit": 1, "status": "created", "snapshot": None, }, { "origin": "foo", "type": "git", "visit": 1, "status": "full", "snapshot": hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"), }, { "origin": "foo", "type": "git", "visit": 2, "status": "created", "snapshot": None, }, { "origin": "foo", "type": "git", "visit": 2, "status": "full", "snapshot": hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"), }, ] ) ] @pytest.mark.parametrize( "visit_statuses", permutations(VISIT_STATUSES, len(VISIT_STATUSES)) ) def test_journal_client_origin_visit_status_permutation0(visit_statuses, swh_scheduler): """Ensure out of order topic subscription ends up in the same final state """ process_journal_objects( {"origin_visit_status": visit_statuses}, scheduler=swh_scheduler ) actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([("foo", "git")]) visit_stats = actual_origin_visit_stats[0] assert_visit_stats_ok( visit_stats, OriginVisitStats( url="foo", visit_type="git", last_successful=DATE1 + 3 * ONE_DAY, last_visit=DATE1 + 3 * ONE_DAY, last_visit_status=LastVisitStatus.successful, last_snapshot=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"), ), ignore_fields=[ "next_visit_queue_position", "next_position_offset", "successive_visits", ], ) # We ignore out of order messages, so the next_position_offset isn't exact # depending on the permutation. What matters is consistency of the final # dates (last_visit and last_successful). assert 4 <= visit_stats.next_position_offset <= 5 # same goes for successive_visits assert 1 <= visit_stats.successive_visits <= 2 VISIT_STATUSES_1 = [ {**ovs, "date": DATE1 + n * ONE_DAY} for n, ovs in enumerate( [ { "origin": "cavabarder", "type": "hg", "visit": 1, "status": "partial", "snapshot": hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"), }, { "origin": "cavabarder", "type": "hg", "visit": 2, "status": "full", "snapshot": hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"), }, { "origin": "cavabarder", "type": "hg", "visit": 3, "status": "full", "snapshot": hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"), }, { "origin": "cavabarder", "type": "hg", "visit": 4, "status": "full", "snapshot": hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"), }, ] ) ] @pytest.mark.parametrize( "visit_statuses", permutations(VISIT_STATUSES_1, len(VISIT_STATUSES_1)) ) def test_journal_client_origin_visit_status_permutation1(visit_statuses, swh_scheduler): """Ensure out of order topic subscription ends up in the same final state """ process_journal_objects( {"origin_visit_status": visit_statuses}, scheduler=swh_scheduler ) actual_visit_stats = swh_scheduler.origin_visit_stats_get([("cavabarder", "hg")]) visit_stats = actual_visit_stats[0] assert_visit_stats_ok( visit_stats, OriginVisitStats( url="cavabarder", visit_type="hg", last_successful=DATE1 + 3 * ONE_DAY, last_visit=DATE1 + 3 * ONE_DAY, last_visit_status=LastVisitStatus.successful, last_snapshot=hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"), ), ignore_fields=[ "next_visit_queue_position", "next_position_offset", "successive_visits", ], ) # We ignore out of order messages, so the next_position_offset isn't exact # depending on the permutation. What matters is consistency of the final # dates (last_visit and last_successful). assert 2 <= visit_stats.next_position_offset <= 5 # same goes for successive_visits assert 1 <= visit_stats.successive_visits <= 4 VISIT_STATUSES_2 = [ {**ovs, "date": DATE1 + n * ONE_DAY} for n, ovs in enumerate( [ { "origin": "cavabarder", "type": "hg", "visit": 1, "status": "full", "snapshot": hash_to_bytes("0000000000000000000000000000000000000000"), }, { "origin": "cavabarder", "type": "hg", "visit": 2, "status": "full", "snapshot": hash_to_bytes("1111111111111111111111111111111111111111"), }, { "origin": "iciaussi", "type": "hg", "visit": 1, "status": "full", "snapshot": hash_to_bytes("2222222222222222222222222222222222222222"), }, { "origin": "iciaussi", "type": "hg", "visit": 2, "status": "full", "snapshot": hash_to_bytes("3333333333333333333333333333333333333333"), }, { "origin": "cavabarder", "type": "git", "visit": 1, "status": "full", "snapshot": hash_to_bytes("4444444444444444444444444444444444444444"), }, { "origin": "cavabarder", "type": "git", "visit": 2, "status": "full", "snapshot": hash_to_bytes("5555555555555555555555555555555555555555"), }, { "origin": "iciaussi", "type": "git", "visit": 1, "status": "full", "snapshot": hash_to_bytes("6666666666666666666666666666666666666666"), }, { "origin": "iciaussi", "type": "git", "visit": 2, "status": "full", "snapshot": hash_to_bytes("7777777777777777777777777777777777777777"), }, ] ) ] def test_journal_client_origin_visit_status_after_grab_next_visits( swh_scheduler, stored_lister ): """Ensure OriginVisitStat entries created in the db as a result of calling grab_next_visits() do not mess the OriginVisitStats upsert mechanism. """ listed_origins = [ ListedOrigin(lister_id=stored_lister.id, url=url, visit_type=visit_type) for (url, visit_type) in set((v["origin"], v["type"]) for v in VISIT_STATUSES_2) ] swh_scheduler.record_listed_origins(listed_origins) before = utcnow() swh_scheduler.grab_next_visits( visit_type="git", count=10, policy="oldest_scheduled_first" ) after = utcnow() assert swh_scheduler.origin_visit_stats_get([("cavabarder", "hg")]) == [] assert swh_scheduler.origin_visit_stats_get([("cavabarder", "git")])[0] is not None process_journal_objects( {"origin_visit_status": VISIT_STATUSES_2}, scheduler=swh_scheduler ) for url in ("cavabarder", "iciaussi"): ovs = swh_scheduler.origin_visit_stats_get([(url, "git")])[0] assert before <= ovs.last_scheduled <= after ovs = swh_scheduler.origin_visit_stats_get([(url, "hg")])[0] assert ovs.last_scheduled is None ovs = swh_scheduler.origin_visit_stats_get([("cavabarder", "git")])[0] assert ovs.last_successful == DATE1 + 5 * ONE_DAY assert ovs.last_visit == DATE1 + 5 * ONE_DAY assert ovs.last_visit_status == LastVisitStatus.successful assert ovs.last_snapshot == hash_to_bytes( "5555555555555555555555555555555555555555" ) def test_journal_client_origin_visit_status_duplicated_messages(swh_scheduler): """A duplicated message must be ignored """ visit_status = { "origin": "foo", "visit": 1, "status": "full", "date": DATE1, "type": "git", "snapshot": hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"), } process_journal_objects( {"origin_visit_status": [visit_status]}, scheduler=swh_scheduler ) process_journal_objects( {"origin_visit_status": [visit_status]}, scheduler=swh_scheduler ) actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([("foo", "git")]) assert_visit_stats_ok( actual_origin_visit_stats[0], OriginVisitStats( url="foo", visit_type="git", last_successful=DATE1, last_visit=DATE1, last_visit_status=LastVisitStatus.successful, last_snapshot=hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"), successive_visits=1, ), ) def test_journal_client_origin_visit_status_several_upsert(swh_scheduler): """An old message updates old information """ visit_status1 = { "origin": "foo", "visit": 1, "status": "full", "date": DATE1, "type": "git", "snapshot": hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"), } visit_status2 = { "origin": "foo", "visit": 1, "status": "full", "date": DATE2, "type": "git", "snapshot": hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"), } process_journal_objects( {"origin_visit_status": [visit_status2]}, scheduler=swh_scheduler ) process_journal_objects( {"origin_visit_status": [visit_status1]}, scheduler=swh_scheduler ) actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([("foo", "git")]) assert_visit_stats_ok( actual_origin_visit_stats[0], OriginVisitStats( url="foo", visit_type="git", last_successful=DATE2, last_visit=DATE2, last_visit_status=LastVisitStatus.successful, last_snapshot=hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"), next_position_offset=4, successive_visits=1, ), ) VISIT_STATUSES_SAME_SNAPSHOT = [ {**ovs, "date": DATE1 + n * ONE_YEAR} for n, ovs in enumerate( [ { "origin": "cavabarder", "type": "hg", "visit": 3, "status": "full", "snapshot": hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"), }, { "origin": "cavabarder", "type": "hg", "visit": 4, "status": "full", "snapshot": hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"), }, { "origin": "cavabarder", "type": "hg", "visit": 4, "status": "full", "snapshot": hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"), }, ] ) ] @pytest.mark.parametrize( "visit_statuses", permutations(VISIT_STATUSES_SAME_SNAPSHOT, len(VISIT_STATUSES_SAME_SNAPSHOT)), ) def test_journal_client_origin_visit_statuses_same_snapshot_permutation( visit_statuses, swh_scheduler ): """Ensure out of order topic subscription ends up in the same final state """ process_journal_objects( {"origin_visit_status": visit_statuses}, scheduler=swh_scheduler ) actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( [("cavabarder", "hg")] ) visit_stats = actual_origin_visit_stats[0] assert_visit_stats_ok( visit_stats, OriginVisitStats( url="cavabarder", visit_type="hg", last_successful=DATE1 + 2 * ONE_YEAR, last_visit=DATE1 + 2 * ONE_YEAR, last_visit_status=LastVisitStatus.successful, last_snapshot=hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"), ), ignore_fields=[ "next_visit_queue_position", "next_position_offset", "successive_visits", ], ) # We ignore out of order messages, so the next_position_offset isn't exact # depending on the permutation. What matters is consistency of the final # dates (last_visit and last_successful). assert 4 <= visit_stats.next_position_offset <= 6 # same goes for successive_visits assert 1 <= visit_stats.successive_visits <= 3 @pytest.mark.parametrize( "position_offset, interval", [ (0, 1), (1, 1), (2, 2), (3, 2), (4, 2), (5, 4), (6, 16), (7, 64), (8, 256), (9, 1024), (10, 4096), + (11, 16384), ], ) def test_journal_client_from_position_offset_to_days(position_offset, interval): assert from_position_offset_to_days(position_offset) == interval def test_journal_client_from_position_offset_to_days_only_positive_input(): with pytest.raises(AssertionError): from_position_offset_to_days(-1) @pytest.mark.parametrize( "fudge_factor,next_position_offset", [(0.01, 1), (-0.01, 5), (0.1, 8), (-0.1, 10),] ) def test_next_visit_queue_position(mocker, fudge_factor, next_position_offset): mock_random = mocker.patch("swh.scheduler.journal_client.random.uniform") mock_random.return_value = fudge_factor date_now = utcnow() mock_now = mocker.patch("swh.scheduler.journal_client.utcnow") mock_now.return_value = date_now actual_position = next_visit_queue_position( {}, {"next_position_offset": next_position_offset, "visit_type": "svn",} ) assert actual_position == date_now + timedelta( days=from_position_offset_to_days(next_position_offset) * (1 + fudge_factor) ) assert mock_now.called assert mock_random.called @pytest.mark.parametrize( "fudge_factor,next_position_offset", [(0.02, 2), (-0.02, 3), (0, 7), (-0.09, 9),] ) def test_next_visit_queue_position_with_state( mocker, fudge_factor, next_position_offset ): mock_random = mocker.patch("swh.scheduler.journal_client.random.uniform") mock_random.return_value = fudge_factor date_now = utcnow() actual_position = next_visit_queue_position( {"git": date_now}, {"next_position_offset": next_position_offset, "visit_type": "git",}, ) assert actual_position == date_now + timedelta( days=from_position_offset_to_days(next_position_offset) * (1 + fudge_factor) ) assert mock_random.called @pytest.mark.parametrize( "fudge_factor,next_position_offset", [(0.03, 3), (-0.03, 4), (0.08, 7), (-0.08, 9),] ) def test_next_visit_queue_position_with_next_visit_queue( mocker, fudge_factor, next_position_offset ): mock_random = mocker.patch("swh.scheduler.journal_client.random.uniform") mock_random.return_value = fudge_factor date_now = utcnow() actual_position = next_visit_queue_position( {}, { "next_position_offset": next_position_offset, "visit_type": "hg", "next_visit_queue_position": date_now, }, ) assert actual_position == date_now + timedelta( days=from_position_offset_to_days(next_position_offset) * (1 + fudge_factor) ) assert mock_random.called def test_disable_failing_origins(swh_scheduler): """Origin with too many failed attempts ends up being deactivated in the scheduler. """ # actually store the origin in the scheduler so we can check it's deactivated in the # end. lister = swh_scheduler.get_or_create_lister( name="something", instance_name="something" ) origin = ListedOrigin( url="bar", enabled=True, visit_type="svn", lister_id=lister.id ) swh_scheduler.record_listed_origins([origin]) visit_statuses = [ { "origin": "bar", "visit": 2, "status": "failed", "date": DATE1, "type": "svn", "snapshot": None, }, { "origin": "bar", "visit": 3, "status": "failed", "date": DATE2, "type": "svn", "snapshot": None, }, { "origin": "bar", "visit": 3, "status": "failed", "date": DATE3, "type": "svn", "snapshot": None, }, ] process_journal_objects( {"origin_visit_status": visit_statuses}, scheduler=swh_scheduler ) actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([("bar", "svn")]) assert_visit_stats_ok( actual_origin_visit_stats[0], OriginVisitStats( url="bar", visit_type="svn", last_successful=None, last_visit=DATE3, last_visit_status=LastVisitStatus.failed, next_position_offset=6, successive_visits=3, ), ) # Now check that the origin in question is disabled actual_page = swh_scheduler.get_listed_origins(url="bar") assert len(actual_page.results) == 1 assert actual_page.next_page_token is None for origin in actual_page.results: assert origin.enabled is False assert origin.lister_id == lister.id assert origin.url == "bar" assert origin.visit_type == "svn"