diff --git a/docs/simulator.rst b/docs/simulator.rst
--- a/docs/simulator.rst
+++ b/docs/simulator.rst
@@ -63,3 +63,18 @@
 # Run the simulator itself, interacting with the scheduler database you've
 # just seeded.
 swh scheduler -d "dbname=$PGDATABASE" simulator run --scheduler origin_scheduler
+
+
+Origin model
+------------
+
+The origin model is how we represent the behavior of origins: when they are
+created/discovered, how many commits they get and when, and when they fail to load.
+
+For now it is only a coarse approximation designed to exercise the basic cases:
+origin creation/discovery, a continuous stream of commits, and failure when there
+are too many commits to load at once.
+For details, see :py:mod:`swh.scheduler.simulator.origins`.
+
+To keep the simulation fast enough, each origin's state is kept in memory, so the
+memory usage of the simulator process grows linearly as it runs.
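A minimal standalone sketch (not part of the patch) of the `last_update`
computation this model performs, mirroring `OriginModel.get_last_update` from
the patch below; the daily commit frequency here is an arbitrary example value:

    from datetime import datetime, timedelta, timezone

    EPOCH = datetime(2015, 9, 1, 0, 0, 0, tzinfo=timezone.utc)

    def last_update(now: datetime, seconds_between_commits: float) -> datetime:
        # Only the remainder of the division matters: it is the time
        # elapsed since the most recent simulated commit.
        _, since_last_commit = divmod(
            (now - EPOCH).total_seconds(), seconds_between_commits
        )
        return now - timedelta(seconds=since_last_commit)

    # An origin committing once a day (86400 seconds), listed at noon UTC:
    now = datetime(2021, 4, 1, 12, 0, 0, tzinfo=timezone.utc)
    print(last_update(now, 86400.0))  # 2021-04-01 00:00:00+00:00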
diff --git a/swh/scheduler/simulator/__init__.py b/swh/scheduler/simulator/__init__.py
--- a/swh/scheduler/simulator/__init__.py
+++ b/swh/scheduler/simulator/__init__.py
@@ -17,11 +17,10 @@
 from simpy import Event
 
 from swh.scheduler.interface import SchedulerInterface
-from swh.scheduler.model import ListedOrigin
 
 from . import origin_scheduler, task_scheduler
 from .common import Environment, Queue, SimulationReport, Task
-from .origins import load_task_process
+from .origins import generate_listed_origin, lister_process, load_task_process
 
 logger = logging.getLogger(__name__)
 
@@ -105,21 +104,18 @@
         worker_name = f"worker-{visit_type}-{i}"
         env.process(worker_process(env, worker_name, task_queue, status_queue))
 
+    lister = env.scheduler.get_or_create_lister(name="example")
+    assert lister.id
+    env.process(lister_process(env, lister.id))
+
 
 def fill_test_data(scheduler: SchedulerInterface, num_origins: int = 100000):
     """Fills the database with mock data to test the simulator."""
     stored_lister = scheduler.get_or_create_lister(name="example")
     assert stored_lister.id is not None
 
-    origins = [
-        ListedOrigin(
-            lister_id=stored_lister.id,
-            url=f"https://example.com/{i:04d}.git",
-            visit_type="git",
-            last_update=datetime(2020, 6, 15, 16, 0, 0, i, tzinfo=timezone.utc),
-        )
-        for i in range(num_origins)
-    ]
+    # Generate `num_origins` new origins
+    origins = [generate_listed_origin(stored_lister.id) for _ in range(num_origins)]
 
     scheduler.record_listed_origins(origins)
 
     scheduler.create_tasks(
diff --git a/swh/scheduler/simulator/origins.py b/swh/scheduler/simulator/origins.py
--- a/swh/scheduler/simulator/origins.py
+++ b/swh/scheduler/simulator/origins.py
@@ -4,25 +4,65 @@
 # See top-level LICENSE file for more information
 
 """This module implements a model of the frequency of updates of an origin
-and how long it takes to load it."""
+and how long it takes to load it.
 
-from datetime import timedelta
+For each origin, a commit frequency is chosen deterministically based on the
+hash of its URL, and all origins are assumed to have been created at an
+arbitrary epoch; the number of commits is the product of age and frequency.
+
+The run time of a load task is approximated as proportional to the number of
+commits since the previous visit of the origin (possibly 0)."""
+
+from datetime import datetime, timedelta, timezone
 import hashlib
 import logging
 import os
-from typing import Iterator, Optional, Tuple
+from typing import Generator, Iterator, List, Optional, Tuple
+import uuid
 
 import attr
 from simpy import Event
 
 from swh.model.model import OriginVisitStatus
-from swh.scheduler.model import OriginVisitStats
+from swh.scheduler.model import ListedOrigin, OriginVisitStats
 
 from .common import Environment, Queue, Task, TaskEvent
 
 logger = logging.getLogger(__name__)
 
+_nb_generated_origins = 0
+
+
+def generate_listed_origin(
+    lister_id: uuid.UUID, now: Optional[datetime] = None
+) -> ListedOrigin:
+    """Returns a globally unique new origin. Seeds the `last_update` value
+    according to the OriginModel and the passed timestamp.
+
+    Arguments:
+      lister_id: id of the lister that generated this origin
+      now: time of listing, to emulate last_update (defaults to :func:`datetime.now`)
+    """
+    global _nb_generated_origins
+    _nb_generated_origins += 1
+    assert _nb_generated_origins < 10 ** 6, "Too many origins!"
+
+    if now is None:
+        now = datetime.now(tz=timezone.utc)
+
+    url = f"https://example.com/{_nb_generated_origins:06d}.git"
+    visit_type = "git"
+    origin = OriginModel(visit_type, url)
+
+    return ListedOrigin(
+        lister_id=lister_id,
+        url=url,
+        visit_type=visit_type,
+        last_update=origin.get_last_update(now),
+    )
+
+
 class OriginModel:
     MIN_RUN_TIME = 0.5
     """Minimal run time for a visit (retrieved from production data)"""
@@ -33,6 +73,9 @@
     PER_COMMIT_RUN_TIME = 0.1
     """Run time per commit"""
 
+    EPOCH = datetime(2015, 9, 1, 0, 0, 0, tzinfo=timezone.utc)
+    """The origin of all origins (at least according to Software Heritage)"""
+
     def __init__(self, type: str, origin: str):
         self.type = type
         self.origin = origin
@@ -56,6 +99,19 @@
         return ten_y ** (bucket / num_buckets)
         # return 1 + (ten_y - 1) * (bucket / (num_buckets - 1))
 
+    def get_last_update(self, now: datetime) -> datetime:
+        """Get the last_update value for this origin.
+
+        We assume that the origin had its first commit at `EPOCH`, and that one
+        commit happened every `self.seconds_between_commits()`. This returns
+        the last commit date before or equal to `now`.
+        """
+        _, time_since_last_commit = divmod(
+            (now - self.EPOCH).total_seconds(), self.seconds_between_commits()
+        )
+
+        return now - timedelta(seconds=time_since_last_commit)
+
     def load_task_characteristics(
         self, env: Environment, stats: Optional[OriginVisitStats]
     ) -> Tuple[float, bool, str]:
@@ -84,6 +140,34 @@
         return (run_time, True, "full")
 
 
+def lister_process(
+    env: Environment, lister_id: uuid.UUID
+) -> Generator[Event, Event, None]:
+    """Every hour, generate new origins and update the `last_update` field for
+    the ones this process generated in the past."""
+    NUM_NEW_ORIGINS = 100
+
+    origins: List[ListedOrigin] = []
+
+    while True:
+        updated_origins = []
+        for origin in origins:
+            model = OriginModel(origin.visit_type, origin.url)
+            updated_origins.append(
+                attr.evolve(origin, last_update=model.get_last_update(env.time))
+            )
+
+        origins = updated_origins
+        origins.extend(
+            generate_listed_origin(lister_id, now=env.time)
+            for _ in range(NUM_NEW_ORIGINS)
+        )
+
+        env.scheduler.record_listed_origins(origins)
+
+        yield env.timeout(3600)
+
+
 def load_task_process(
     env: Environment, task: Task, status_queue: Queue
 ) -> Iterator[Event]:
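The hourly loop in `lister_process` is the standard simpy timeout pattern. A
standalone sketch of that pattern (not part of the patch; names and printed
output are illustrative):

    import simpy

    NUM_NEW_ORIGINS = 100  # same constant as lister_process

    def mock_lister(env):
        # Grow the recorded-origin count by NUM_NEW_ORIGINS every simulated hour
        total = 0
        while True:
            total += NUM_NEW_ORIGINS
            print(f"t={env.now:>5.0f}s: {total} origins recorded so far")
            yield env.timeout(3600)  # simulated time is counted in seconds

    env = simpy.Environment()
    env.process(mock_lister(env))
    env.run(until=4 * 3600)  # prints at t=0, 3600, 7200 and 10800

Since `generate_listed_origin` asserts a global cap of 10 ** 6 origins, a rate
of 100 new origins per hour supports at most 10 ** 6 / 100 = 10000 simulated
hours of listing (a bit over a simulated year, less if `fill_test_data` has
already consumed part of the budget), which also bounds the linear memory
growth mentioned in the documentation above.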
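The commit frequency itself comes from `seconds_between_commits`, whose return
expression `ten_y ** (bucket / num_buckets)` is visible as context above. One
consistent implementation, sketched standalone (the hash function and bucket
width are assumptions, not taken from the patch):

    import hashlib

    def seconds_between_commits(url: str) -> float:
        n_bytes = 2
        num_buckets = 2 ** (8 * n_bytes)
        # Hash the URL to pick a stable bucket in [0, num_buckets)
        bucket = int.from_bytes(
            hashlib.md5(url.encode()).digest()[:n_bytes], "little"
        )
        ten_y = 10 * 365 * 24 * 3600  # ten years, in seconds
        # Exponential spread: bucket 0 means one commit per second,
        # the last bucket roughly one commit every ten years
        return ten_y ** (bucket / num_buckets)

    print(seconds_between_commits("https://example.com/000001.git"))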