diff --git a/conftest.py b/conftest.py
--- a/conftest.py
+++ b/conftest.py
@@ -1 +1 @@
-pytest_plugins = ["swh.auth.pytest_plugin"]
+pytest_plugins = ["swh.auth.pytest_plugin", "swh.scheduler.pytest_plugin"]
diff --git a/requirements-test.txt b/requirements-test.txt
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -9,6 +9,7 @@
 requests-mock != 1.9.0, != 1.9.1
 swh.core[http] >= 0.0.95
 swh.loader.git >= 0.8.0
+swh.scheduler[testing] >= 0.5.0
 swh.storage >= 0.1.1
 types-docutils
 types-pyyaml
diff --git a/swh/web/common/management/commands/refresh_savecodenow_statuses.py b/swh/web/common/management/commands/refresh_savecodenow_statuses.py
--- a/swh/web/common/management/commands/refresh_savecodenow_statuses.py
+++ b/swh/web/common/management/commands/refresh_savecodenow_statuses.py
@@ -3,16 +3,57 @@
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+from typing import Set, Tuple
+
 from django.core.management.base import BaseCommand
 
+from swh.scheduler.model import ListedOrigin
+from swh.web.common.models import VISIT_STATUS_FULL, VISIT_STATUS_PARTIAL
 from swh.web.common.origin_save import refresh_save_origin_request_statuses
+from swh.web.config import get_config
+from swh.web.config import scheduler as get_scheduler
 
 
 class Command(BaseCommand):
     help = "Refresh save code now origin request statuses periodically"
 
     def handle(self, *args, **options):
+        """Refresh origin save code now requests.
+
+        For every refreshed request whose visit type is not "archives" and whose
+        visit status is partial or full, also record the origin as a listed origin.
+
+        """
         refreshed_statuses = refresh_save_origin_request_statuses()
+        scheduler = get_scheduler()
+
+        # then schedule the origins with meaningful status and type to be ingested
+        # regularly
+        lister = scheduler.get_or_create_lister(
+            name="save-code-now", instance_name=get_config()["instance_name"]
+        )
+
+        origins: Set[Tuple[str, str]] = set()  # seen (visit_type, url) pairs
+        listed_origins = []
+        for status in refreshed_statuses:
+            visit_type = status["visit_type"]
+            # only deal with git, svn, hg visit types
+            if visit_type == "archives":
+                continue
+            # only keep satisfying visit statuses
+            if status["visit_status"] not in (VISIT_STATUS_PARTIAL, VISIT_STATUS_FULL):
+                continue
+            origin = status["origin_url"]
+            # drop duplicates within the same batch
+            if (visit_type, origin) in origins:
+                continue
+            origins.add((visit_type, origin))
+            listed_origins.append(
+                ListedOrigin(lister_id=lister.id, visit_type=visit_type, url=origin)
+            )
+
+        if listed_origins:
+            scheduler.record_listed_origins(listed_origins)
 
         if len(refreshed_statuses) > 0:
             msg = f"Successfully updated {len(refreshed_statuses)} save request(s)."
diff --git a/swh/web/config.py b/swh/web/config.py
--- a/swh/web/config.py
+++ b/swh/web/config.py
@@ -131,9 +131,10 @@
     "metadata_search_backend": ("string", "swh-indexer-storage"),  # or "swh-search"
     "counters_backend": ("string", "swh-storage"),  # or "swh-counters"
     "staging_server_names": ("list", STAGING_SERVER_NAMES),
+    "instance_name": ("string", "archive-test.softwareheritage.org"),
 }
 
-swhweb_config = {}  # type: Dict[str, Any]
+swhweb_config: Dict[str, Any] = {}
 
 
 def get_config(config_file="web/web"):
diff --git a/swh/web/tests/common/test_django_command.py b/swh/web/tests/common/test_django_command.py
--- a/swh/web/tests/common/test_django_command.py
+++ b/swh/web/tests/common/test_django_command.py
@@ -3,33 +3,173 @@
 # License: GNU Affero General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+from datetime import datetime, timedelta, timezone
 from io import StringIO
 
 import pytest
 
 from django.core.management import call_command
 
+from swh.core.api.classes import stream_results
+from swh.web.common.models import (
+    SAVE_REQUEST_ACCEPTED,
+    SAVE_TASK_FAILED,
+    SAVE_TASK_SCHEDULED,
+    SAVE_TASK_SUCCEEDED,
+    VISIT_STATUS_FAILED,
+    VISIT_STATUS_FULL,
+    VISIT_STATUS_PARTIAL,
+)
+from swh.web.common.typing import SaveOriginRequestInfo
+from swh.web.config import get_config
+
+MODULE_FQDN = "swh.web.common.management.commands"
+COMMAND_NAME = "refresh_savecodenow_statuses"
+
+AUTHORIZED_ORIGIN_URL = "https://scm.ourproject.org/anonscm/%s"
+
+
+@pytest.fixture
+def mock_refresh(mocker):
+    return mocker.patch(
+        f"{MODULE_FQDN}.{COMMAND_NAME}.refresh_save_origin_request_statuses"
+    )
+
+
+@pytest.fixture
+def mock_scheduler(mocker, swh_scheduler):
+    mock_scheduler = mocker.patch(f"{MODULE_FQDN}.{COMMAND_NAME}.get_scheduler")
+    mock_scheduler.return_value = swh_scheduler
+
+    return mock_scheduler
+
 
 @pytest.mark.parametrize("nb_results", [0, 10, 20])
-def test_command_refresh__with_statuses_refreshed(mocker, nb_results):
-    """Refresh status command reported updated non-terminal statuses.
+def test_command_refresh__with_statuses_refreshed(
+    mock_scheduler, mock_refresh, nb_results
+):
+    """Refresh status command reports non-terminal statuses updates.
 
     """
-    command_name = "refresh_savecodenow_statuses"
-    module_fqdn = "swh.web.common.management.commands"
-    mock_refresh = mocker.patch(
-        f"{module_fqdn}.{command_name}.refresh_save_origin_request_statuses"
-    )
-    # fake returned refreshed status
-    mock_refresh.return_value = [{"": ""}] * nb_results
+    # fake returned refreshed status for 'archives' visit type
+    mock_refresh.return_value = [{"visit_type": "archives",}] * nb_results
 
     out = StringIO()
-    call_command(command_name, stdout=out)
-
-    assert mock_refresh.called
+    call_command(COMMAND_NAME, stdout=out)
 
     actual_output = out.getvalue()
     if nb_results > 0:
         assert f"updated {nb_results}" in actual_output
     else:
         assert "Nothing" in actual_output
+
+    assert mock_scheduler.called
+    assert mock_refresh.called
+
+
+@pytest.fixture
+def fake_refreshed_data():
+    """Build a list of fake refreshed save origin request statuses
+
+    """
+    duplicated_origin_url = AUTHORIZED_ORIGIN_URL % "specific-origin"
+    entries = (
+        [
+            {
+                "visit_type": "archives",  # ignored from recurring task scheduling
+                "visit_status": VISIT_STATUS_FULL,
+                "task_status": SAVE_TASK_SUCCEEDED,
+            },
+            {
+                "visit_type": "hg",  # scheduled as recurring task
+                "visit_status": VISIT_STATUS_PARTIAL,
+                "task_status": SAVE_TASK_SUCCEEDED,
+            },
+            {
+                "visit_type": "svn",  # scheduled as recurring task
+                "visit_status": VISIT_STATUS_PARTIAL,
+                "task_status": SAVE_TASK_SCHEDULED,
+            },
+            {
+                "visit_type": "svn",  # ignored from recurring task scheduling
+                "visit_status": VISIT_STATUS_FAILED,
+                "task_status": SAVE_TASK_FAILED,
+            },
+            {
+                "visit_type": "hg",  # ignored from recurring task scheduling
+                "visit_status": "created",
+                "task_status": SAVE_TASK_SCHEDULED,
+            },
+        ]
+        + [
+            {
+                "visit_type": "git",
+                "visit_status": VISIT_STATUS_FULL,
+                "task_status": SAVE_TASK_SUCCEEDED,
+                "origin": duplicated_origin_url,
+            }
+        ]
+        * 3
+    )  # only 1 of the origin duplicates will be scheduled as recurring task
+
+    time_now = datetime.now(tz=timezone.utc) - timedelta(days=len(entries))
+    return [
+        SaveOriginRequestInfo(
+            visit_type=meta["visit_type"],
+            visit_status=meta["visit_status"],
+            origin_url=(
+                meta["origin"] if "origin" in meta else AUTHORIZED_ORIGIN_URL % i
+            ),
+            save_request_date=time_now + timedelta(days=i - 1),
+            save_request_status=SAVE_REQUEST_ACCEPTED,
+            visit_date=time_now + timedelta(days=i),
+            save_task_status=meta["task_status"],
+            id=i,
+            loading_task_id=i,
+        )
+        for i, meta in enumerate(entries)
+    ]
+
+
+def test_command_refresh__with_recurrent_tasks_scheduling(
+    mock_scheduler, mock_refresh, fake_refreshed_data, swh_scheduler
+):
+    """Refresh status command reports updates of statuses. The successful ones whose
+    visit type is not 'archives' are also scheduled as recurring visits.
+
+    """
+    mock_refresh.return_value = fake_refreshed_data
+
+    # only visit types (git, hg, svn) with status (full, partial) are taken into
+    # account for scheduling, so only 3 of those match in the fake data set.
+    origins = set()
+    expected_nb_scheduled = 0
+    for entry in fake_refreshed_data:
+        visit_type = entry["visit_type"]
+        if visit_type == "archives":  # only deal with git, svn, hg
+            continue
+        if entry["visit_status"] not in ("partial", "full"):
+            continue
+        origin = entry["origin_url"]
+        if (visit_type, origin) in origins:
+            continue
+        origins.add((visit_type, origin))
+        expected_nb_scheduled += 1
+
+    assert expected_nb_scheduled == 3
+
+    out = StringIO()
+    call_command(COMMAND_NAME, stdout=out)
+
+    actual_output = out.getvalue()
+    assert f"Successfully updated {len(fake_refreshed_data)}" in actual_output
+
+    lister = swh_scheduler.get_or_create_lister(
+        name="save-code-now", instance_name=get_config()["instance_name"]
+    )
+
+    result = list(stream_results(swh_scheduler.get_listed_origins, lister.id))
+    assert len(result) == expected_nb_scheduled
+
+    assert mock_scheduler.called
+    assert mock_refresh.called