diff --git a/PKG-INFO b/PKG-INFO index 9a92675..b74769d 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,33 +1,33 @@ Metadata-Version: 2.1 Name: swh.scheduler -Version: 1.2.1 +Version: 1.2.2 Summary: Software Heritage Scheduler Home-page: https://forge.softwareheritage.org/diffusion/DSCH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-scheduler Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-scheduler/ Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing Provides-Extra: journal Provides-Extra: simulator License-File: LICENSE License-File: LICENSE.Celery License-File: AUTHORS swh-scheduler ============= Job scheduler for the Software Heritage project. Task manager for asynchronous/delayed tasks, used for both recurrent (e.g., listing a forge, loading new stuff from a Git repository) and one-off activities (e.g., loading a specific version of a source package). diff --git a/sql/Makefile b/sql/Makefile index cc836d5..1a81be5 100644 --- a/sql/Makefile +++ b/sql/Makefile @@ -1,73 +1,68 @@ # Depends: postgresql-client, postgresql-autodoc DBNAME = softwareheritage-scheduler-dev DOCDIR = autodoc -SQL_SCHEMA = 30-swh-schema.sql -SQL_FUNC = 40-swh-func.sql -SQL_DATA = 50-swh-data.sql -SQL_INDEXES = 60-swh-indexes.sql -SQLS = $(SQL_SCHEMA) $(SQL_FUNC) $(SQL_DATA) $(SQL_INDEXES) -SQL_FILES = $(abspath $(addprefix $(CURDIR)/../swh/scheduler/sql/,$(SQLS))) +SQL_FILES = $(abspath $(shell ls $(CURDIR)/../swh/scheduler/sql/*.sql)) PSQL_BIN = psql PSQL_FLAGS = --echo-all -X -v ON_ERROR_STOP=1 PSQL = $(PSQL_BIN) $(PSQL_FLAGS) PIFPAF=$(findstring postgresql://,$(PIFPAF_URLS)) all: createdb: createdb-stamp createdb-stamp: $(SQL_FILES) ifeq ($(PIFPAF),) -dropdb $(DBNAME) endif createdb $(DBNAME) ifeq ($(PIFPAF),) touch $@ else rm -f $@ endif filldb: filldb-stamp filldb-stamp: createdb-stamp cat $(SQL_FILES) | $(PSQL) $(DBNAME) ifeq ($(PIFPAF),) touch $@ else rm -f $@ endif dropdb: -dropdb $(DBNAME) dumpdb: swh-scheduler.dump swh-scheduler.dump: filldb-stamp pg_dump -Fc $(DBNAME) > $@ $(DOCDIR): test -d $(DOCDIR)/ || mkdir $(DOCDIR) doc: autodoc-stamp $(DOCDIR)/swh-scheduler.pdf autodoc-stamp: filldb-stamp $(DOCDIR) postgresql_autodoc -d $(DBNAME) -f $(DOCDIR)/swh-scheduler cp -a $(DOCDIR)/swh-scheduler.dot $(DOCDIR)/swh-scheduler.dot.orig ifeq ($(PIFPAF),) touch $@ else rm -f $@ endif $(DOCDIR)/swh-scheduler.pdf: $(DOCDIR)/swh-scheduler.dot autodoc-stamp dot -T pdf $< > $@ $(DOCDIR)/swh-scheduler.svg: $(DOCDIR)/swh-scheduler.dot autodoc-stamp dot -T svg $< > $@ clean: rm -rf *-stamp $(DOCDIR)/ distclean: clean dropdb rm -f swh-scheduler.dump .PHONY: all initdb createdb dropdb doc clean diff --git a/swh.scheduler.egg-info/PKG-INFO b/swh.scheduler.egg-info/PKG-INFO index 9a92675..b74769d 100644 --- a/swh.scheduler.egg-info/PKG-INFO +++ b/swh.scheduler.egg-info/PKG-INFO @@ -1,33 +1,33 @@ Metadata-Version: 2.1 Name: swh.scheduler -Version: 1.2.1 +Version: 1.2.2 Summary: Software Heritage Scheduler Home-page: https://forge.softwareheritage.org/diffusion/DSCH/ Author: Software Heritage developers Author-email: swh-devel@inria.fr Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-scheduler Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-scheduler/ Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 5 - Production/Stable Requires-Python: >=3.7 Description-Content-Type: text/markdown Provides-Extra: testing Provides-Extra: journal Provides-Extra: simulator License-File: LICENSE License-File: LICENSE.Celery License-File: AUTHORS swh-scheduler ============= Job scheduler for the Software Heritage project. Task manager for asynchronous/delayed tasks, used for both recurrent (e.g., listing a forge, loading new stuff from a Git repository) and one-off activities (e.g., loading a specific version of a source package). diff --git a/swh/scheduler/api/server.py b/swh/scheduler/api/server.py index abc3ab8..a067589 100644 --- a/swh/scheduler/api/server.py +++ b/swh/scheduler/api/server.py @@ -1,150 +1,151 @@ -# Copyright (C) 2018-2019 The Software Heritage developers +# Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import logging import os +from typing import Dict from swh.core import config from swh.core.api import JSONFormatter, MsgpackFormatter, RPCServerApp from swh.core.api import encode_data_server as encode_data from swh.core.api import error_handler, negotiate from swh.scheduler import get_scheduler from swh.scheduler.exc import SchedulerException from swh.scheduler.interface import SchedulerInterface from .serializers import DECODERS, ENCODERS scheduler = None def get_global_scheduler(): global scheduler if not scheduler: scheduler = get_scheduler(**app.config["scheduler"]) return scheduler class SchedulerServerApp(RPCServerApp): extra_type_decoders = DECODERS extra_type_encoders = ENCODERS app = SchedulerServerApp( __name__, backend_class=SchedulerInterface, backend_factory=get_global_scheduler ) @app.errorhandler(SchedulerException) def argument_error_handler(exception): return error_handler(exception, encode_data, status_code=400) @app.errorhandler(Exception) def my_error_handler(exception): return error_handler(exception, encode_data) def has_no_empty_params(rule): return len(rule.defaults or ()) >= len(rule.arguments or ()) @app.route("/") def index(): return """ Software Heritage scheduler RPC server

You have reached the Software Heritage scheduler RPC server.
See its documentation and API for more information

""" @app.route("/site-map") @negotiate(MsgpackFormatter) @negotiate(JSONFormatter) def site_map(): links = [] for rule in app.url_map.iter_rules(): if has_no_empty_params(rule) and hasattr(SchedulerInterface, rule.endpoint): links.append( dict( rule=rule.rule, description=getattr(SchedulerInterface, rule.endpoint).__doc__, ) ) # links is now a list of url, endpoint tuples return links -def load_and_check_config(config_path, type="local"): +def load_and_check_config(config_path: str, type: str = "postgresql") -> Dict: """Check the minimal configuration is set to run the api or raise an error explanation. Args: - config_path (str): Path to the configuration file to load - type (str): configuration type. For 'local' type, more - checks are done. + config_path: Configuration file path to load + type: Configuration type, for 'postgresql' type (the default), more checks are + done. Raises: Error if the setup is not as expected Returns: configuration as a dict """ if not config_path: raise EnvironmentError("Configuration file must be defined") if not os.path.exists(config_path): raise FileNotFoundError(f"Configuration file {config_path} does not exist") cfg = config.read(config_path) vcfg = cfg.get("scheduler") if not vcfg: raise KeyError("Missing '%scheduler' configuration") - if type == "local": + if type == "postgresql": cls = vcfg.get("cls") if cls not in ("local", "postgresql"): raise ValueError( "The scheduler backend can only be started with a 'postgresql' " "configuration" ) db = vcfg.get("db") if not db: raise KeyError("Invalid configuration; missing 'db' config entry") return cfg api_cfg = None def make_app_from_configfile(): """Run the WSGI app from the webserver, loading the configuration from a configuration file. SWH_CONFIG_FILENAME environment variable defines the configuration path to load. """ global api_cfg if not api_cfg: config_path = os.environ.get("SWH_CONFIG_FILENAME") api_cfg = load_and_check_config(config_path) app.config.update(api_cfg) handler = logging.StreamHandler() app.logger.addHandler(handler) return app if __name__ == "__main__": print('Please use the "swh-scheduler api-server" command') diff --git a/swh/scheduler/celery_backend/recurrent_visits.py b/swh/scheduler/celery_backend/recurrent_visits.py index 0ec6110..62389f1 100644 --- a/swh/scheduler/celery_backend/recurrent_visits.py +++ b/swh/scheduler/celery_backend/recurrent_visits.py @@ -1,365 +1,369 @@ # Copyright (C) 2021-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """This schedules the recurrent visits, for listed origins, in Celery. For "oneshot" (save code now, lister) tasks, check the :mod:`swh.scheduler.celery_backend.runner` and :mod:`swh.scheduler.celery_backend.pika_listener` modules. """ from __future__ import annotations from itertools import chain import logging from queue import Empty, Queue import random from threading import Thread import time from typing import TYPE_CHECKING, Any, Dict, List, Tuple from kombu.utils.uuid import uuid from swh.scheduler.celery_backend.config import get_available_slots from swh.scheduler.utils import create_origin_task_dicts if TYPE_CHECKING: from ..interface import SchedulerInterface from ..model import ListedOrigin logger = logging.getLogger(__name__) DEFAULT_POLICY = [ {"policy": "already_visited_order_by_lag", "weight": 50}, {"policy": "never_visited_oldest_update_first", "weight": 50}, ] DEFAULT_DVCS_POLICY = [ {"policy": "already_visited_order_by_lag", "weight": 49}, {"policy": "never_visited_oldest_update_first", "weight": 49}, {"policy": "origins_without_last_update", "weight": 2}, ] DEFAULT_GIT_POLICY = [ {"policy": "already_visited_order_by_lag", "weight": 49, "tablesample": 0.1}, {"policy": "never_visited_oldest_update_first", "weight": 49, "tablesample": 0.1}, {"policy": "origins_without_last_update", "weight": 2, "tablesample": 0.1}, ] DEFAULT_POLICY_CONFIG: Dict[str, List[Dict[str, Any]]] = { "default": DEFAULT_POLICY, "hg": DEFAULT_DVCS_POLICY, "svn": DEFAULT_DVCS_POLICY, "cvs": DEFAULT_DVCS_POLICY, "bzr": DEFAULT_DVCS_POLICY, "git": DEFAULT_GIT_POLICY, } """Scheduling policies to use to retrieve visits for the given visit types, with their relative weights""" MIN_SLOTS_RATIO = 0.05 """Quantity of slots that need to be available (with respect to max_queue_length) for :py:func:`~swh.scheduler.interface.SchedulerInterface.grab_next_visits` to trigger""" QUEUE_FULL_BACKOFF = 60 """Backoff time (in seconds) if there's fewer than :py:data:`MIN_SLOTS_RATIO` slots available in the queue.""" -NO_ORIGINS_SCHEDULED_BACKOFF = 20 * 60 +DEFAULT_NO_ORIGINS_SCHEDULED_BACKOFF = 20 * 60 """Backoff time (in seconds) if no origins have been scheduled in the current iteration""" BACKOFF_SPLAY = 5.0 """Amplitude of the fuzziness between backoffs""" TERMINATE = object() """Termination request received from command queue (singleton used for identity comparison)""" def grab_next_visits_policy_weights( scheduler: SchedulerInterface, visit_type: str, num_visits: int, policy_cfg: List[Dict[str, Any]], ) -> List[ListedOrigin]: """Get the next ``num_visits`` for the given ``visit_type`` using the corresponding set of scheduling policies. The :py:data:`POLICY_CFG` list sets, for the current visit type, the scheduling policies used to pull the next tasks. Each policy config entry in the list should at least have a 'policy' (policy name) and a 'weight' entry. For each policy in this policy_cfg list, the number of returned origins to visit will be weighted using this weight config option so that the total number of returned origins is around num_visits. Any other key/value entry in the policy configuration will be passed to the scheduler.grab_next_visit() method. This function emits a warning if the ratio of retrieved origins is off of the requested ratio by more than 5%. Returns: at most ``num_visits`` :py:class:`~swh.scheduler.model.ListedOrigin` objects """ policies = [cfg["policy"] for cfg in policy_cfg] if len(set(policies)) != len(policies): raise ValueError( "A policy weights can only appear once; check your policy " f"configuration for visit type {visit_type}" ) weights = [cfg["weight"] for cfg in policy_cfg] total_weight = sum(weights) if not total_weight: raise ValueError(f"No policy weights set for visit type {visit_type}") ratios = [weight / total_weight for weight in weights] extra_kws = [ {k: v for k, v in cfg.items() if k not in ("weight", "policy")} for cfg in policy_cfg ] fetched_origins: Dict[str, List[ListedOrigin]] = {} for policy, ratio, extra_kw in zip(policies, ratios, extra_kws): num_tasks_to_send = int(num_visits * ratio) fetched_origins[policy] = scheduler.grab_next_visits( visit_type, num_tasks_to_send, policy=policy, **extra_kw, ) all_origins: List[ListedOrigin] = list( chain.from_iterable(fetched_origins.values()) ) if not all_origins: return [] # Check whether the ratios of origins fetched are skewed with respect to the # ones we requested fetched_origin_ratios = { policy: len(origins) / len(all_origins) for policy, origins in fetched_origins.items() } for policy, expected_ratio in zip(policies, ratios): # 5% of skew with respect to request if abs(fetched_origin_ratios[policy] - expected_ratio) / expected_ratio > 0.05: logger.info( "Skewed fetch for visit type %s with policy %s: fetched %s, " "requested %s", visit_type, policy, fetched_origin_ratios[policy], expected_ratio, ) return all_origins def splay(): """Return a random short interval by which to vary the backoffs for the visit scheduling threads""" return random.uniform(0, BACKOFF_SPLAY) def send_visits_for_visit_type( scheduler: SchedulerInterface, app, visit_type: str, task_type: Dict, policy_cfg: List[Dict[str, Any]], + no_origins_scheduled_backoff: int = DEFAULT_NO_ORIGINS_SCHEDULED_BACKOFF, ) -> float: """Schedule the next batch of visits for the given ``visit_type``. First, we determine the number of available slots by introspecting the RabbitMQ queue. If there's fewer than :py:data:`MIN_SLOTS_RATIO` slots available in the queue, we wait for :py:data:`QUEUE_FULL_BACKOFF` seconds. This avoids running the expensive :py:func:`~swh.scheduler.interface.SchedulerInterface.grab_next_visits` queries when there's not many jobs to queue. Once there's more than :py:data:`MIN_SLOTS_RATIO` slots available, we run :py:func:`grab_next_visits_policy_weights` to retrieve the next set of origin visits to schedule, and we send them to celery. - If the last scheduling attempt didn't return any origins, we sleep for - :py:data:`NO_ORIGINS_SCHEDULED_BACKOFF` seconds. This avoids running the expensive + If the last scheduling attempt didn't return any origins, we sleep by default for + :py:data:`DEFAULT_NO_ORIGINS_SCHEDULED_BACKOFF` seconds. This avoids running the expensive :py:func:`~swh.scheduler.interface.SchedulerInterface.grab_next_visits` queries too often if there's nothing left to schedule. The :py:data:`POLICY_CFG` argument is the policy configuration used to choose the next origins to visit. It is passed directly to the :py:func:`grab_next_visits_policy_weights()` function. Returns: the earliest :py:func:`time.monotonic` value at which to run the next iteration of the loop. """ queue_name = task_type["backend_name"] max_queue_length = task_type.get("max_queue_length") or 0 min_available_slots = max_queue_length * MIN_SLOTS_RATIO current_iteration_start = time.monotonic() # Check queue level available_slots = get_available_slots(app, queue_name, max_queue_length) logger.debug( "%s available slots for visit type %s in queue %s", available_slots, visit_type, queue_name, ) if available_slots < min_available_slots: return current_iteration_start + QUEUE_FULL_BACKOFF origins = grab_next_visits_policy_weights( scheduler, visit_type, available_slots, policy_cfg ) if not origins: logger.debug("No origins to visit for type %s", visit_type) - return current_iteration_start + NO_ORIGINS_SCHEDULED_BACKOFF + return current_iteration_start + no_origins_scheduled_backoff # Try to smooth the ingestion load, origins pulled by different # scheduling policies have different resource usage patterns random.shuffle(origins) for task_dict in create_origin_task_dicts(origins, scheduler): app.send_task( queue_name, task_id=uuid(), args=task_dict["arguments"]["args"], kwargs=task_dict["arguments"]["kwargs"], queue=queue_name, ) logger.info( "%s: %s visits scheduled in queue %s", visit_type, len(origins), queue_name, ) # When everything worked, we can try to schedule origins again ASAP. return time.monotonic() def visit_scheduler_thread( config: Dict, visit_type: str, command_queue: Queue[object], exc_queue: Queue[Tuple[str, BaseException]], ): """Target function for the visit sending thread, which initializes local connections and handles exceptions by sending them back to the main thread.""" from swh.scheduler import get_scheduler from swh.scheduler.celery_backend.config import build_app try: # We need to reinitialize these connections because they're not generally # thread-safe app = build_app(config.get("celery")) scheduler = get_scheduler(**config["scheduler"]) task_type = scheduler.get_task_type(f"load-{visit_type}") if task_type is None: raise ValueError(f"Unknown task type: load-{visit_type}") policy_cfg = config.get("scheduling_policy", DEFAULT_POLICY_CONFIG) for policies in policy_cfg.values(): for policy in policies: if "weight" not in policy or "policy" not in policy: raise ValueError( "Each policy configuration needs at least a 'policy' " "and a 'weight' entry" ) policy_cfg = {**DEFAULT_POLICY_CONFIG, **policy_cfg} next_iteration = time.monotonic() while True: # vary the next iteration time a little bit next_iteration = next_iteration + splay() while time.monotonic() < next_iteration: # Wait for next iteration to start. Listen for termination message. try: msg = command_queue.get(block=True, timeout=1) except Empty: continue if msg is TERMINATE: return else: logger.warn("Received unexpected message %s in command queue", msg) next_iteration = send_visits_for_visit_type( scheduler, app, visit_type, task_type, policy_cfg.get(visit_type, policy_cfg["default"]), + config.get( + "no_origins_scheduled_backoff", DEFAULT_NO_ORIGINS_SCHEDULED_BACKOFF + ), ) except BaseException as e: exc_queue.put((visit_type, e)) VisitSchedulerThreads = Dict[str, Tuple[Thread, Queue]] """Dict storing the visit scheduler threads and their command queues""" def spawn_visit_scheduler_thread( threads: VisitSchedulerThreads, exc_queue: Queue[Tuple[str, BaseException]], config: Dict[str, Any], visit_type: str, ): """Spawn a new thread to schedule the visits of type ``visit_type``.""" command_queue: Queue[object] = Queue() thread = Thread( target=visit_scheduler_thread, kwargs={ "config": config, "visit_type": visit_type, "command_queue": command_queue, "exc_queue": exc_queue, }, ) threads[visit_type] = (thread, command_queue) thread.start() def terminate_visit_scheduler_threads(threads: VisitSchedulerThreads) -> List[str]: """Terminate all visit scheduler threads""" logger.info("Termination requested...") for _, command_queue in threads.values(): command_queue.put(TERMINATE) loops = 0 while threads and loops < 10: logger.info( "Terminating visit scheduling threads: %s", ", ".join(sorted(threads)) ) loops += 1 for visit_type, (thread, _) in list(threads.items()): thread.join(timeout=1) if not thread.is_alive(): logger.debug("Thread %s terminated", visit_type) del threads[visit_type] if threads: logger.warn( "Could not reap the following threads after 10 attempts: %s", ", ".join(sorted(threads)), ) return list(sorted(threads)) diff --git a/swh/scheduler/tests/test_cli.py b/swh/scheduler/tests/test_cli.py index 5152656..6a55071 100644 --- a/swh/scheduler/tests/test_cli.py +++ b/swh/scheduler/tests/test_cli.py @@ -1,946 +1,946 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime from itertools import islice import logging import random import re import tempfile from unittest.mock import patch from click.testing import CliRunner import pytest from swh.core.api.classes import stream_results from swh.model.model import Origin from swh.scheduler.cli import cli from swh.scheduler.utils import create_task_dict, utcnow CLI_CONFIG = """ scheduler: cls: foo args: {} """ -def invoke(scheduler, catch_exceptions, args): +def invoke(scheduler, catch_exceptions, args, config=CLI_CONFIG): runner = CliRunner() with patch( "swh.scheduler.get_scheduler" ) as get_scheduler_mock, tempfile.NamedTemporaryFile( "a", suffix=".yml" ) as config_fd: - config_fd.write(CLI_CONFIG) + config_fd.write(config) config_fd.seek(0) get_scheduler_mock.return_value = scheduler args = [ "-C" + config_fd.name, ] + args result = runner.invoke(cli, args, obj={"log_level": logging.WARNING}) if not catch_exceptions and result.exception: print(result.output) raise result.exception return result def test_schedule_tasks(swh_scheduler): csv_data = ( b'swh-test-ping;[["arg1", "arg2"]];{"key": "value"};' + utcnow().isoformat().encode() + b"\n" + b'swh-test-ping;[["arg3", "arg4"]];{"key": "value"};' + utcnow().isoformat().encode() + b"\n" ) with tempfile.NamedTemporaryFile(suffix=".csv") as csv_fd: csv_fd.write(csv_data) csv_fd.seek(0) result = invoke( swh_scheduler, False, ["task", "schedule", "-d", ";", csv_fd.name] ) expected = r""" Created 2 tasks Task 1 Next run: today \(.*\) Interval: 1 day, 0:00:00 Type: swh-test-ping Policy: recurring Args: \['arg1', 'arg2'\] Keyword args: key: 'value' Task 2 Next run: today \(.*\) Interval: 1 day, 0:00:00 Type: swh-test-ping Policy: recurring Args: \['arg3', 'arg4'\] Keyword args: key: 'value' """.lstrip() assert result.exit_code == 0, result.output assert re.fullmatch(expected, result.output, re.MULTILINE), result.output def test_schedule_tasks_columns(swh_scheduler): with tempfile.NamedTemporaryFile(suffix=".csv") as csv_fd: csv_fd.write(b'swh-test-ping;oneshot;["arg1", "arg2"];{"key": "value"}\n') csv_fd.seek(0) result = invoke( swh_scheduler, False, [ "task", "schedule", "-c", "type", "-c", "policy", "-c", "args", "-c", "kwargs", "-d", ";", csv_fd.name, ], ) expected = r""" Created 1 tasks Task 1 Next run: today \(.*\) Interval: 1 day, 0:00:00 Type: swh-test-ping Policy: oneshot Args: 'arg1' 'arg2' Keyword args: key: 'value' """.lstrip() assert result.exit_code == 0, result.output assert re.fullmatch(expected, result.output, re.MULTILINE), result.output def test_schedule_task(swh_scheduler): result = invoke( swh_scheduler, False, [ "task", "add", "swh-test-ping", "arg1", "arg2", "key=value", ], ) expected = r""" Created 1 tasks Task 1 Next run: today \(.*\) Interval: 1 day, 0:00:00 Type: swh-test-ping Policy: recurring Args: 'arg1' 'arg2' Keyword args: key: 'value' """.lstrip() assert result.exit_code == 0, result.output assert re.fullmatch(expected, result.output, re.MULTILINE), result.output def test_list_pending_tasks_none(swh_scheduler): result = invoke( swh_scheduler, False, [ "task", "list-pending", "swh-test-ping", ], ) expected = r""" Found 0 swh-test-ping tasks """.lstrip() assert result.exit_code == 0, result.output assert re.fullmatch(expected, result.output, re.MULTILINE), result.output def test_list_pending_tasks(swh_scheduler): task1 = create_task_dict("swh-test-ping", "oneshot", key="value1") task2 = create_task_dict("swh-test-ping", "oneshot", key="value2") task2["next_run"] += datetime.timedelta(days=1) swh_scheduler.create_tasks([task1, task2]) result = invoke( swh_scheduler, False, [ "task", "list-pending", "swh-test-ping", ], ) expected = r""" Found 1 swh-test-ping tasks Task 1 Next run: today \(.*\) Interval: 1 day, 0:00:00 Type: swh-test-ping Policy: oneshot Args: Keyword args: key: 'value1' """.lstrip() assert result.exit_code == 0, result.output assert re.fullmatch(expected, result.output, re.MULTILINE), result.output swh_scheduler.grab_ready_tasks("swh-test-ping") result = invoke( swh_scheduler, False, [ "task", "list-pending", "swh-test-ping", ], ) expected = r""" Found 0 swh-test-ping tasks """.lstrip() assert result.exit_code == 0, result.output assert re.fullmatch(expected, result.output, re.MULTILINE), result.output def test_list_pending_tasks_filter(swh_scheduler): task = create_task_dict("swh-test-multiping", "oneshot", key="value") swh_scheduler.create_tasks([task]) result = invoke( swh_scheduler, False, [ "task", "list-pending", "swh-test-ping", ], ) expected = r""" Found 0 swh-test-ping tasks """.lstrip() assert result.exit_code == 0, result.output assert re.fullmatch(expected, result.output, re.MULTILINE), result.output def test_list_pending_tasks_filter_2(swh_scheduler): swh_scheduler.create_tasks( [ create_task_dict("swh-test-multiping", "oneshot", key="value"), create_task_dict("swh-test-ping", "oneshot", key="value2"), ] ) result = invoke( swh_scheduler, False, [ "task", "list-pending", "swh-test-ping", ], ) expected = r""" Found 1 swh-test-ping tasks Task 2 Next run: today \(.*\) Interval: 1 day, 0:00:00 Type: swh-test-ping Policy: oneshot Args: Keyword args: key: 'value2' """.lstrip() assert result.exit_code == 0, result.output assert re.fullmatch(expected, result.output, re.MULTILINE), result.output # Fails because "task list-pending --limit 3" only returns 2 tasks, because # of how compute_nb_tasks_from works. @pytest.mark.xfail def test_list_pending_tasks_limit(swh_scheduler): swh_scheduler.create_tasks( [ create_task_dict("swh-test-ping", "oneshot", key="value%d" % i) for i in range(10) ] ) result = invoke( swh_scheduler, False, [ "task", "list-pending", "swh-test-ping", "--limit", "3", ], ) expected = r""" Found 2 swh-test-ping tasks Task 1 Next run: today \(.*\) Interval: 1 day, 0:00:00 Type: swh-test-ping Policy: oneshot Args: Keyword args: key: 'value0' Task 2 Next run: today \(.*\) Interval: 1 day, 0:00:00 Type: swh-test-ping Policy: oneshot Args: Keyword args: key: 'value1' Task 3 Next run: today \(.*\) Interval: 1 day, 0:00:00 Type: swh-test-ping Policy: oneshot Args: Keyword args: key: 'value2' """.lstrip() assert result.exit_code == 0, result.output assert re.fullmatch(expected, result.output, re.MULTILINE), result.output def test_list_pending_tasks_before(swh_scheduler): task1 = create_task_dict("swh-test-ping", "oneshot", key="value") task2 = create_task_dict("swh-test-ping", "oneshot", key="value2") task1["next_run"] += datetime.timedelta(days=3) task2["next_run"] += datetime.timedelta(days=1) swh_scheduler.create_tasks([task1, task2]) result = invoke( swh_scheduler, False, [ "task", "list-pending", "swh-test-ping", "--before", (datetime.date.today() + datetime.timedelta(days=2)).isoformat(), ], ) expected = r""" Found 1 swh-test-ping tasks Task 2 Next run: tomorrow \(.*\) Interval: 1 day, 0:00:00 Type: swh-test-ping Policy: oneshot Args: Keyword args: key: 'value2' """.lstrip() assert result.exit_code == 0, result.output assert re.fullmatch(expected, result.output, re.MULTILINE), result.output def test_list_tasks(swh_scheduler): task1 = create_task_dict("swh-test-ping", "oneshot", key="value1") task2 = create_task_dict("swh-test-ping", "oneshot", key="value2") task1["next_run"] += datetime.timedelta(days=3, hours=2) swh_scheduler.create_tasks([task1, task2]) swh_scheduler.grab_ready_tasks("swh-test-ping") result = invoke( swh_scheduler, False, [ "task", "list", ], ) expected = r""" Found 2 tasks Task 1 Next run: .+ \(.*\) Interval: 1 day, 0:00:00 Type: swh-test-ping Policy: oneshot Status: next_run_not_scheduled Priority:\x20 Args: Keyword args: key: 'value1' Task 2 Next run: today \(.*\) Interval: 1 day, 0:00:00 Type: swh-test-ping Policy: oneshot Status: next_run_scheduled Priority:\x20 Args: Keyword args: key: 'value2' """.lstrip() assert result.exit_code == 0, result.output assert re.fullmatch(expected, result.output, re.MULTILINE), result.output def test_list_tasks_id(swh_scheduler): task1 = create_task_dict("swh-test-ping", "oneshot", key="value1") task2 = create_task_dict("swh-test-ping", "oneshot", key="value2") task3 = create_task_dict("swh-test-ping", "oneshot", key="value3") swh_scheduler.create_tasks([task1, task2, task3]) result = invoke( swh_scheduler, False, [ "task", "list", "--task-id", "2", ], ) expected = r""" Found 1 tasks Task 2 Next run: today \(.*\) Interval: 1 day, 0:00:00 Type: swh-test-ping Policy: oneshot Status: next_run_not_scheduled Priority:\x20 Args: Keyword args: key: 'value2' """.lstrip() assert result.exit_code == 0, result.output assert re.fullmatch(expected, result.output, re.MULTILINE), result.output def test_list_tasks_id_2(swh_scheduler): task1 = create_task_dict("swh-test-ping", "oneshot", key="value1") task2 = create_task_dict("swh-test-ping", "oneshot", key="value2") task3 = create_task_dict("swh-test-ping", "oneshot", key="value3") swh_scheduler.create_tasks([task1, task2, task3]) result = invoke( swh_scheduler, False, ["task", "list", "--task-id", "2", "--task-id", "3"] ) expected = r""" Found 2 tasks Task 2 Next run: today \(.*\) Interval: 1 day, 0:00:00 Type: swh-test-ping Policy: oneshot Status: next_run_not_scheduled Priority:\x20 Args: Keyword args: key: 'value2' Task 3 Next run: today \(.*\) Interval: 1 day, 0:00:00 Type: swh-test-ping Policy: oneshot Status: next_run_not_scheduled Priority:\x20 Args: Keyword args: key: 'value3' """.lstrip() assert result.exit_code == 0, result.output assert re.fullmatch(expected, result.output, re.MULTILINE), result.output def test_list_tasks_type(swh_scheduler): task1 = create_task_dict("swh-test-ping", "oneshot", key="value1") task2 = create_task_dict("swh-test-multiping", "oneshot", key="value2") task3 = create_task_dict("swh-test-ping", "oneshot", key="value3") swh_scheduler.create_tasks([task1, task2, task3]) result = invoke( swh_scheduler, False, ["task", "list", "--task-type", "swh-test-ping"] ) expected = r""" Found 2 tasks Task 1 Next run: today \(.*\) Interval: 1 day, 0:00:00 Type: swh-test-ping Policy: oneshot Status: next_run_not_scheduled Priority:\x20 Args: Keyword args: key: 'value1' Task 3 Next run: today \(.*\) Interval: 1 day, 0:00:00 Type: swh-test-ping Policy: oneshot Status: next_run_not_scheduled Priority:\x20 Args: Keyword args: key: 'value3' """.lstrip() assert result.exit_code == 0, result.output assert re.fullmatch(expected, result.output, re.MULTILINE), result.output def test_list_tasks_limit(swh_scheduler): task1 = create_task_dict("swh-test-ping", "oneshot", key="value1") task2 = create_task_dict("swh-test-ping", "oneshot", key="value2") task3 = create_task_dict("swh-test-ping", "oneshot", key="value3") swh_scheduler.create_tasks([task1, task2, task3]) result = invoke( swh_scheduler, False, [ "task", "list", "--limit", "2", ], ) expected = r""" Found 2 tasks Task 1 Next run: today \(.*\) Interval: 1 day, 0:00:00 Type: swh-test-ping Policy: oneshot Status: next_run_not_scheduled Priority:\x20 Args: Keyword args: key: 'value1' Task 2 Next run: today \(.*\) Interval: 1 day, 0:00:00 Type: swh-test-ping Policy: oneshot Status: next_run_not_scheduled Priority:\x20 Args: Keyword args: key: 'value2' """.lstrip() assert result.exit_code == 0, result.output assert re.fullmatch(expected, result.output, re.MULTILINE), result.output def test_list_tasks_before(swh_scheduler): task1 = create_task_dict("swh-test-ping", "oneshot", key="value1") task2 = create_task_dict("swh-test-ping", "oneshot", key="value2") task1["next_run"] += datetime.timedelta(days=3, hours=2) swh_scheduler.create_tasks([task1, task2]) swh_scheduler.grab_ready_tasks("swh-test-ping") result = invoke( swh_scheduler, False, [ "task", "list", "--before", (datetime.date.today() + datetime.timedelta(days=2)).isoformat(), ], ) expected = r""" Found 1 tasks Task 2 Next run: today \(.*\) Interval: 1 day, 0:00:00 Type: swh-test-ping Policy: oneshot Status: next_run_scheduled Priority:\x20 Args: Keyword args: key: 'value2' """.lstrip() assert result.exit_code == 0, result.output assert re.fullmatch(expected, result.output, re.MULTILINE), result.output def test_list_tasks_after(swh_scheduler): task1 = create_task_dict("swh-test-ping", "oneshot", key="value1") task2 = create_task_dict("swh-test-ping", "oneshot", key="value2") task1["next_run"] += datetime.timedelta(days=3, hours=2) swh_scheduler.create_tasks([task1, task2]) swh_scheduler.grab_ready_tasks("swh-test-ping") result = invoke( swh_scheduler, False, [ "task", "list", "--after", (datetime.date.today() + datetime.timedelta(days=2)).isoformat(), ], ) expected = r""" Found 1 tasks Task 1 Next run: .+ \(.*\) Interval: 1 day, 0:00:00 Type: swh-test-ping Policy: oneshot Status: next_run_not_scheduled Priority:\x20 Args: Keyword args: key: 'value1' """.lstrip() assert result.exit_code == 0, result.output assert re.fullmatch(expected, result.output, re.MULTILINE), result.output def _fill_storage_with_origins(storage, nb_origins): origins = [Origin(url=f"http://example.com/{i}") for i in range(nb_origins)] storage.origin_add(origins) return origins @patch("swh.scheduler.cli.utils.TASK_BATCH_SIZE", 3) def test_task_schedule_origins_dry_run(swh_scheduler, storage): """Tests the scheduling when origin_batch_size*task_batch_size is a divisor of nb_origins.""" _fill_storage_with_origins(storage, 90) result = invoke( swh_scheduler, False, [ "task", "schedule_origins", "--dry-run", "swh-test-ping", ], ) # Check the output expected = r""" Scheduled 3 tasks \(30 origins\). Scheduled 6 tasks \(60 origins\). Scheduled 9 tasks \(90 origins\). Done. """.lstrip() assert result.exit_code == 0, result.output assert re.fullmatch(expected, result.output, re.MULTILINE), repr(result.output) # Check scheduled tasks tasks = swh_scheduler.search_tasks() assert len(tasks) == 0 def _assert_origin_tasks_contraints(tasks, max_tasks, max_task_size, expected_origins): # check there are not too many tasks assert len(tasks) <= max_tasks # check tasks are not too large assert all(len(task["arguments"]["args"][0]) <= max_task_size for task in tasks) # check the tasks are exhaustive assert sum([len(task["arguments"]["args"][0]) for task in tasks]) == len( expected_origins ) assert set.union(*(set(task["arguments"]["args"][0]) for task in tasks)) == { origin.url for origin in expected_origins } @patch("swh.scheduler.cli.utils.TASK_BATCH_SIZE", 3) def test_task_schedule_origins(swh_scheduler, storage): """Tests the scheduling when neither origin_batch_size or task_batch_size is a divisor of nb_origins.""" origins = _fill_storage_with_origins(storage, 70) result = invoke( swh_scheduler, False, [ "task", "schedule_origins", "swh-test-ping", "--batch-size", "20", ], ) # Check the output expected = r""" Scheduled 3 tasks \(60 origins\). Scheduled 4 tasks \(70 origins\). Done. """.lstrip() assert result.exit_code == 0, result.output assert re.fullmatch(expected, result.output, re.MULTILINE), repr(result.output) # Check tasks tasks = swh_scheduler.search_tasks() _assert_origin_tasks_contraints(tasks, 4, 20, origins) assert all(task["arguments"]["kwargs"] == {} for task in tasks) def test_task_schedule_origins_kwargs(swh_scheduler, storage): """Tests support of extra keyword-arguments.""" origins = _fill_storage_with_origins(storage, 30) result = invoke( swh_scheduler, False, [ "task", "schedule_origins", "swh-test-ping", "--batch-size", "20", 'key1="value1"', 'key2="value2"', ], ) # Check the output expected = r""" Scheduled 2 tasks \(30 origins\). Done. """.lstrip() assert result.exit_code == 0, result.output assert re.fullmatch(expected, result.output, re.MULTILINE), repr(result.output) # Check tasks tasks = swh_scheduler.search_tasks() _assert_origin_tasks_contraints(tasks, 2, 20, origins) assert all( task["arguments"]["kwargs"] == {"key1": "value1", "key2": "value2"} for task in tasks ) def test_task_schedule_origins_with_limit(swh_scheduler, storage): """Tests support of extra keyword-arguments.""" _fill_storage_with_origins(storage, 50) limit = 20 expected_origins = list(islice(stream_results(storage.origin_list), limit)) nb_origins = len(expected_origins) assert nb_origins == limit max_task_size = 5 nb_tasks, remainder = divmod(nb_origins, max_task_size) assert remainder == 0 # made the numbers go round result = invoke( swh_scheduler, False, [ "task", "schedule_origins", "swh-test-ping", "--batch-size", max_task_size, "--limit", limit, ], ) # Check the output expected = rf""" Scheduled {nb_tasks} tasks \({nb_origins} origins\). Done. """.lstrip() assert result.exit_code == 0, result.output assert re.fullmatch(expected, result.output, re.MULTILINE), repr(result.output) tasks = swh_scheduler.search_tasks() _assert_origin_tasks_contraints(tasks, max_task_size, nb_origins, expected_origins) def test_task_schedule_origins_with_page_token(swh_scheduler, storage): """Tests support of extra keyword-arguments.""" nb_total_origins = 50 origins = _fill_storage_with_origins(storage, nb_total_origins) # prepare page_token and origins result expectancy page_result = storage.origin_list(limit=10) assert len(page_result.results) == 10 page_token = page_result.next_page_token assert page_token is not None # remove the first 10 origins listed as we won't see those in tasks expected_origins = [o for o in origins if o not in page_result.results] nb_origins = len(expected_origins) assert nb_origins == nb_total_origins - len(page_result.results) max_task_size = 10 nb_tasks, remainder = divmod(nb_origins, max_task_size) assert remainder == 0 result = invoke( swh_scheduler, False, [ "task", "schedule_origins", "swh-test-ping", "--batch-size", max_task_size, "--page-token", page_token, ], ) # Check the output expected = rf""" Scheduled {nb_tasks} tasks \({nb_origins} origins\). Done. """.lstrip() assert result.exit_code == 0, result.output assert re.fullmatch(expected, result.output, re.MULTILINE), repr(result.output) # Check tasks tasks = swh_scheduler.search_tasks() _assert_origin_tasks_contraints(tasks, max_task_size, nb_origins, expected_origins) def test_cli_task_runner_unknown_task_types(swh_scheduler, storage): """When passing at least one unknown task type, the runner should fail.""" task_types = swh_scheduler.get_task_types() task_type_names = [t["type"] for t in task_types] known_task_type = random.choice(task_type_names) unknown_task_type = "unknown-task-type" assert unknown_task_type not in task_type_names with pytest.raises(ValueError, match="Unknown"): invoke( swh_scheduler, False, [ "start-runner", "--task-type", known_task_type, "--task-type", unknown_task_type, ], ) @pytest.mark.parametrize("flag_priority", ["--with-priority", "--without-priority"]) def test_cli_task_runner_with_known_tasks( swh_scheduler, storage, caplog, flag_priority ): """Trigger runner with known tasks runs smoothly.""" task_types = swh_scheduler.get_task_types() task_type_names = [t["type"] for t in task_types] task_type_name = random.choice(task_type_names) task_type_name2 = random.choice(task_type_names) # The runner will just iterate over the following known tasks and do noop. We are # just checking the runner does not explode here. result = invoke( swh_scheduler, False, [ "start-runner", flag_priority, "--task-type", task_type_name, "--task-type", task_type_name2, ], ) assert result.exit_code == 0, result.output def test_cli_task_runner_no_task(swh_scheduler, storage): """Trigger runner with no parameter should run as before.""" # The runner will just iterate over the existing tasks from the scheduler and do # noop. We are just checking the runner does not explode here. result = invoke( swh_scheduler, False, [ "start-runner", ], ) assert result.exit_code == 0, result.output diff --git a/swh/scheduler/tests/test_recurrent_visits.py b/swh/scheduler/tests/test_recurrent_visits.py index ad3cfd2..484176f 100644 --- a/swh/scheduler/tests/test_recurrent_visits.py +++ b/swh/scheduler/tests/test_recurrent_visits.py @@ -1,216 +1,271 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from datetime import timedelta import logging from queue import Queue -from unittest.mock import MagicMock +import time import pytest +import yaml from swh.scheduler.celery_backend.recurrent_visits import ( DEFAULT_DVCS_POLICY, VisitSchedulerThreads, grab_next_visits_policy_weights, send_visits_for_visit_type, spawn_visit_scheduler_thread, terminate_visit_scheduler_threads, visit_scheduler_thread, ) from .test_cli import invoke TEST_MAX_QUEUE = 10000 MODULE_NAME = "swh.scheduler.celery_backend.recurrent_visits" def _compute_backend_name(visit_type: str) -> str: "Build a dummy reproducible backend name" return f"swh.loader.{visit_type}.tasks" @pytest.fixture def swh_scheduler(swh_scheduler): """Override default fixture of the scheduler to install some more task types.""" for visit_type in ["test-git", "test-hg", "test-svn"]: task_type = f"load-{visit_type}" swh_scheduler.create_task_type( { "type": task_type, "max_queue_length": TEST_MAX_QUEUE, "description": "The {} testing task".format(task_type), "backend_name": _compute_backend_name(visit_type), "default_interval": timedelta(days=1), "min_interval": timedelta(hours=6), "max_interval": timedelta(days=12), } ) return swh_scheduler +@pytest.fixture +def all_task_types(swh_scheduler): + return { + task_type_d["type"]: task_type_d + for task_type_d in swh_scheduler.get_task_types() + } + + def test_cli_schedule_recurrent_unknown_visit_type(swh_scheduler): """When passed an unknown visit type, the recurrent visit scheduler should refuse to start.""" with pytest.raises(ValueError, match="Unknown"): invoke( swh_scheduler, False, [ "schedule-recurrent", "--visit-type", "unknown", "--visit-type", "test-git", ], ) def test_cli_schedule_recurrent_noop(swh_scheduler, mocker): """When passing no visit types, the recurrent visit scheduler should start.""" spawn_visit_scheduler_thread = mocker.patch( f"{MODULE_NAME}.spawn_visit_scheduler_thread" ) spawn_visit_scheduler_thread.side_effect = SystemExit # The actual scheduling threads won't spawn, they'll immediately terminate. This # only exercises the logic to pull task types out of the database result = invoke(swh_scheduler, False, ["schedule-recurrent"]) assert result.exit_code == 0, result.output +def test_send_visits_for_type_no_origin_scheduled_backoff( + swh_scheduler, all_task_types, mocker +): + visit_type = "test-git" + task_type = f"load-{visit_type}" + mocker.patch.object(time, "monotonic", lambda: 0) + no_origins_scheduled_backoff = 60 + assert ( + send_visits_for_visit_type( + swh_scheduler, + mocker.MagicMock(), + visit_type, + all_task_types[task_type], + DEFAULT_DVCS_POLICY, + no_origins_scheduled_backoff, + ) + == no_origins_scheduled_backoff + ) + + +def test_cli_schedule_recurrent_no_origins_scheduled_backoff_in_config( + swh_scheduler, mocker +): + """When passing no visit types, the recurrent visit scheduler should start.""" + + config = """ + scheduler: + cls: foo + args: {} + no_origins_scheduled_backoff: 60 + """ + + spawn_visit_scheduler_thread = mocker.patch( + f"{MODULE_NAME}.spawn_visit_scheduler_thread" + ) + spawn_visit_scheduler_thread.side_effect = SystemExit + # The actual scheduling threads won't spawn, they'll immediately terminate. This + # only exercises the logic to pull task types out of the database + + result = invoke( + swh_scheduler, + False, + ["schedule-recurrent", "--visit-type", "test-git"], + config=config, + ) + assert result.exit_code == 0, result.output + + assert yaml.safe_load(config) in spawn_visit_scheduler_thread.call_args[0] + + def test_recurrent_visit_scheduling( swh_scheduler, caplog, listed_origins_by_type, + all_task_types, mocker, ): """Scheduling known tasks is ok.""" caplog.set_level(logging.DEBUG, MODULE_NAME) nb_origins = 1000 - mock_celery_app = MagicMock() + mock_celery_app = mocker.MagicMock() mock_available_slots = mocker.patch(f"{MODULE_NAME}.get_available_slots") mock_available_slots.return_value = nb_origins # Slots available in queue # Make sure the scheduler is properly configured in terms of visit/task types - all_task_types = { - task_type_d["type"]: task_type_d - for task_type_d in swh_scheduler.get_task_types() - } - visit_types = list(listed_origins_by_type.keys()) assert len(visit_types) > 0 task_types = [] origins = [] for visit_type, _origins in listed_origins_by_type.items(): origins.extend(swh_scheduler.record_listed_origins(_origins)) task_type_name = f"load-{visit_type}" assert task_type_name in all_task_types.keys() task_type = all_task_types[task_type_name] task_type["visit_type"] = visit_type # we'll limit the orchestrator to the origins' type we know task_types.append(task_type) for visit_type in ["test-git", "test-svn"]: task_type = f"load-{visit_type}" send_visits_for_visit_type( swh_scheduler, mock_celery_app, visit_type, all_task_types[task_type], DEFAULT_DVCS_POLICY, ) assert mock_available_slots.called, "The available slots functions should be called" records = [record.message for record in caplog.records] # Mapping over the dict ratio/policies entries can change overall order so let's # check the set of records expected_records = set() for task_type in task_types: visit_type = task_type["visit_type"] queue_name = task_type["backend_name"] msg = ( f"{nb_origins} available slots for visit type {visit_type} " f"in queue {queue_name}" ) expected_records.add(msg) for expected_record in expected_records: assert expected_record in set(records) @pytest.mark.parametrize( "visit_type, extras", [("test-hg", {}), ("test-git", {"tablesample": 0.1})], ) def test_recurrent_visit_additional_parameters( swh_scheduler, mocker, visit_type, extras ): """Testing additional policy parameters""" mock_grab_next_visits = mocker.patch.object(swh_scheduler, "grab_next_visits") mock_grab_next_visits.return_value = [] policy_cfg = DEFAULT_DVCS_POLICY[:] for policy in policy_cfg: policy.update(extras) grab_next_visits_policy_weights(swh_scheduler, visit_type, 10, policy_cfg) for call in mock_grab_next_visits.call_args_list: assert call[1].get("tablesample") == extras.get("tablesample") @pytest.fixture def scheduler_config(swh_scheduler_config): return {"scheduler": {"cls": "postgresql", **swh_scheduler_config}, "celery": {}} def test_visit_scheduler_thread_unknown_task( swh_scheduler, scheduler_config, ): """Starting a thread with unknown task type reports the error""" unknown_visit_type = "unknown" command_queue = Queue() exc_queue = Queue() visit_scheduler_thread( scheduler_config, unknown_visit_type, command_queue, exc_queue ) assert command_queue.empty() is True assert exc_queue.empty() is False assert len(exc_queue.queue) == 1 result = exc_queue.queue.pop() assert result[0] == unknown_visit_type assert isinstance(result[1], ValueError) def test_spawn_visit_scheduler_thread_noop(scheduler_config, visit_types, mocker): """Spawning and terminating threads runs smoothly""" threads: VisitSchedulerThreads = {} exc_queue = Queue() mock_build_app = mocker.patch("swh.scheduler.celery_backend.config.build_app") - mock_build_app.return_value = MagicMock() + mock_build_app.return_value = mocker.MagicMock() assert len(threads) == 0 for visit_type in visit_types: spawn_visit_scheduler_thread(threads, exc_queue, scheduler_config, visit_type) # This actually only checks the spawning and terminating logic is sound assert len(threads) == len(visit_types) actual_threads = terminate_visit_scheduler_threads(threads) assert not len(actual_threads) assert mock_build_app.called diff --git a/swh/scheduler/tests/test_server.py b/swh/scheduler/tests/test_server.py index 50c5b41..4156fac 100644 --- a/swh/scheduler/tests/test_server.py +++ b/swh/scheduler/tests/test_server.py @@ -1,100 +1,97 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import pytest import yaml from swh.scheduler.api.server import load_and_check_config def prepare_config_file(tmpdir, content, name="config.yml"): """Prepare configuration file in `$tmpdir/name` with content `content`. Args: tmpdir (LocalPath): root directory content (str/dict): Content of the file either as string or as a dict. If a dict, converts the dict into a yaml string. name (str): configuration filename Returns path (str) of the configuration file prepared. """ config_path = tmpdir / name if isinstance(content, dict): # convert if needed content = yaml.dump(content) config_path.write_text(content, encoding="utf-8") # pytest on python3.5 does not support LocalPath manipulation, so # convert path to string return str(config_path) @pytest.mark.parametrize("scheduler_class", [None, ""]) def test_load_and_check_config_no_configuration(scheduler_class): """Inexistent configuration files raises""" with pytest.raises(EnvironmentError, match="Configuration file must be defined"): load_and_check_config(scheduler_class) def test_load_and_check_config_inexistent_fil(): """Inexistent config filepath should raise""" config_path = "/some/inexistent/config.yml" expected_error = f"Configuration file {config_path} does not exist" with pytest.raises(FileNotFoundError, match=expected_error): load_and_check_config(config_path) def test_load_and_check_config_wrong_configuration(tmpdir): """Wrong configuration raises""" config_path = prepare_config_file(tmpdir, "something: useless") with pytest.raises(KeyError, match="Missing '%scheduler' configuration"): load_and_check_config(config_path) def test_load_and_check_config_remote_config_local_type_raise(tmpdir): - """Configuration without 'local' storage is rejected""" - config = {"scheduler": {"cls": "remote"}} - config_path = prepare_config_file(tmpdir, config) - expected_error = ( - "The scheduler backend can only be started with a 'postgresql'" " configuration" - ) - with pytest.raises(ValueError, match=expected_error): - load_and_check_config(config_path, type="local") + """Configuration without 'postgresql' storage is rejected""" + config_path = prepare_config_file(tmpdir, {"scheduler": {"cls": "remote"}}) + with pytest.raises(ValueError, match="'postgresql'"): + load_and_check_config(config_path) def test_load_and_check_config_local_incomplete_configuration(tmpdir): """Incomplete 'local' configuration should raise""" config = { "scheduler": { "cls": "postgresql", "something": "needed-for-test", } } config_path = prepare_config_file(tmpdir, config) expected_error = "Invalid configuration; missing 'db' config entry" with pytest.raises(KeyError, match=expected_error): load_and_check_config(config_path) -def test_load_and_check_config_local_config_fine(tmpdir): +@pytest.mark.parametrize("clazz", ["local", "postgresql"]) +def test_load_and_check_config_local_config_fine(tmpdir, clazz): """Local configuration is fine""" config = { "scheduler": { - "cls": "postgresql", + "cls": clazz, "db": "db", } } config_path = prepare_config_file(tmpdir, config) - cfg = load_and_check_config(config_path, type="local") + cfg = load_and_check_config(config_path, type="postgresql") assert cfg == config def test_load_and_check_config_remote_config_fine(tmpdir): """Remote configuration is fine""" config = {"scheduler": {"cls": "remote"}} config_path = prepare_config_file(tmpdir, config) cfg = load_and_check_config(config_path, type="any") assert cfg == config