Page MenuHomeSoftware Heritage

D5858.diff
No OneTemporary

D5858.diff

diff --git a/conftest.py b/conftest.py
--- a/conftest.py
+++ b/conftest.py
@@ -1 +1 @@
-pytest_plugins = ["swh.auth.pytest_plugin"]
+pytest_plugins = ["swh.auth.pytest_plugin", "swh.scheduler.pytest_plugin"]
diff --git a/requirements-test.txt b/requirements-test.txt
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -9,6 +9,7 @@
requests-mock != 1.9.0, != 1.9.1
swh.core[http] >= 0.0.95
swh.loader.git >= 0.8.0
+swh-scheduler[testing] >= 0.5.0
swh.storage >= 0.1.1
types-docutils
types-pyyaml
diff --git a/swh/web/common/management/commands/refresh_savecodenow_statuses.py b/swh/web/common/management/commands/refresh_savecodenow_statuses.py
--- a/swh/web/common/management/commands/refresh_savecodenow_statuses.py
+++ b/swh/web/common/management/commands/refresh_savecodenow_statuses.py
@@ -3,16 +3,57 @@
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from typing import Set
+
from django.core.management.base import BaseCommand
+from swh.scheduler.model import ListedOrigin
+from swh.web.common.models import VISIT_STATUS_FULL, VISIT_STATUS_PARTIAL
from swh.web.common.origin_save import refresh_save_origin_request_statuses
+from swh.web.config import get_config
+from swh.web.config import scheduler as get_scheduler
class Command(BaseCommand):
help = "Refresh save code now origin request statuses periodically"
def handle(self, *args, **options):
+ """Refresh origin save code now requests.
+
+ For origins with visit type svn, git or hg, this also records them in the
+ scheduler as recurring origins to visit.
+
+ """
refreshed_statuses = refresh_save_origin_request_statuses()
+ scheduler = get_scheduler()
+
+ # then schedule the origins with meaningful status and type to be ingested
+ # regularly
+ lister = scheduler.get_or_create_lister(
+ name="save-code-now", instance_name=get_config()["instance_name"]
+ )
+
+ origins: Set[str, str] = set()
+ listed_origins = []
+ for status in refreshed_statuses:
+ visit_type = status["visit_type"]
+ # only deal with git, svn, hg visit types
+ if visit_type == "archives":
+ continue
+ # only keep satisfying visit statuses
+ if status["visit_status"] not in (VISIT_STATUS_PARTIAL, VISIT_STATUS_FULL):
+ continue
+ origin = status["origin_url"]
+ # drop duplicates within the same batch
+ if (visit_type, origin) in origins:
+ continue
+ origins.add((visit_type, origin))
+ listed_origins.append(
+ ListedOrigin(lister_id=lister.id, visit_type=visit_type, url=origin)
+ )
+
+ if listed_origins:
+ scheduler.record_listed_origins(listed_origins)
if len(refreshed_statuses) > 0:
msg = f"Successfully updated {len(refreshed_statuses)} save request(s)."
diff --git a/swh/web/config.py b/swh/web/config.py
--- a/swh/web/config.py
+++ b/swh/web/config.py
@@ -131,9 +131,10 @@
"metadata_search_backend": ("string", "swh-indexer-storage"), # or "swh-search"
"counters_backend": ("string", "swh-storage"), # or "swh-counters"
"staging_server_names": ("list", STAGING_SERVER_NAMES),
+ "instance_name": ("str", "archive-test.softwareheritage.org"),
}
-swhweb_config = {} # type: Dict[str, Any]
+swhweb_config: Dict[str, Any] = {}
def get_config(config_file="web/web"):
diff --git a/swh/web/tests/common/test_django_command.py b/swh/web/tests/common/test_django_command.py
--- a/swh/web/tests/common/test_django_command.py
+++ b/swh/web/tests/common/test_django_command.py
@@ -3,33 +3,175 @@
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from datetime import datetime, timedelta, timezone
from io import StringIO
import pytest
from django.core.management import call_command
+from swh.core.api.classes import stream_results
+from swh.web.common.models import (
+ SAVE_REQUEST_ACCEPTED,
+ SAVE_TASK_FAILED,
+ SAVE_TASK_SCHEDULED,
+ SAVE_TASK_SUCCEEDED,
+ VISIT_STATUS_FAILED,
+ VISIT_STATUS_FULL,
+ VISIT_STATUS_PARTIAL,
+)
+from swh.web.common.typing import SaveOriginRequestInfo
+from swh.web.config import get_config
+
+MODULE_FQDN = "swh.web.common.management.commands"
+COMMAND_NAME = "refresh_savecodenow_statuses"
+
+AUTHORIZED_ORIGIN_URL = "https://scm.ourproject.org/anonscm/%s"
+
+
+@pytest.fixture
+def mock_refresh(mocker):
+ return mocker.patch(
+ f"{MODULE_FQDN}.{COMMAND_NAME}.refresh_save_origin_request_statuses"
+ )
+
+
+@pytest.fixture
+def mock_scheduler(mocker, swh_scheduler):
+ mock_scheduler = mocker.patch(f"{MODULE_FQDN}.{COMMAND_NAME}.get_scheduler")
+ mock_scheduler.return_value = swh_scheduler
+
+ return mock_scheduler
+
@pytest.mark.parametrize("nb_results", [0, 10, 20])
-def test_command_refresh__with_statuses_refreshed(mocker, nb_results):
- """Refresh status command reported updated non-terminal statuses.
+def test_command_refresh__with_statuses_refreshed(
+ mock_scheduler, mock_refresh, nb_results
+):
+ """Refresh status command reports non-terminal status updates.
"""
- command_name = "refresh_savecodenow_statuses"
- module_fqdn = "swh.web.common.management.commands"
- mock_refresh = mocker.patch(
- f"{module_fqdn}.{command_name}.refresh_save_origin_request_statuses"
- )
- # fake returned refreshed status
- mock_refresh.return_value = [{"": ""}] * nb_results
+ # fake returned refreshed status for 'archives' visit type
+ mock_refresh.return_value = [{"visit_type": "archives",}] * nb_results
out = StringIO()
- call_command(command_name, stdout=out)
-
- assert mock_refresh.called
+ call_command(COMMAND_NAME, stdout=out)
actual_output = out.getvalue()
if nb_results > 0:
assert f"updated {nb_results}" in actual_output
else:
assert "Nothing" in actual_output
+
+ assert mock_scheduler.called
+ assert mock_refresh.called
+
+
+@pytest.fixture
+def fake_refreshed_data():
+ """Build fake refreshed save request data covering the scheduling cases
+
+ """
+ duplicated_origin_url = AUTHORIZED_ORIGIN_URL % "specific-origin"
+ entries = (
+ [
+ {
+ "visit_type": "archives", # ignored from recurring task scheduling
+ "visit_status": VISIT_STATUS_FULL,
+ "task_status": SAVE_TASK_SUCCEEDED,
+ },
+ {
+ "visit_type": "hg", # scheduled as recurring task
+ "visit_status": VISIT_STATUS_PARTIAL,
+ "task_status": SAVE_TASK_SUCCEEDED,
+ },
+ {
+ "visit_type": "svn", # scheduled as recurring task
+ "visit_status": VISIT_STATUS_PARTIAL,
+ "task_status": SAVE_TASK_SCHEDULED,
+ },
+ {
+ "visit_type": "svn", # ignored from recurring task scheduling
+ "visit_status": VISIT_STATUS_FAILED,
+ "task_status": SAVE_TASK_FAILED,
+ },
+ {
+ "visit_type": "hg", # ignored from recurring task scheduling
+ "visit_status": "created",
+ "task_status": SAVE_TASK_SCHEDULED,
+ },
+ ]
+ + [
+ {
+ "visit_type": "git",
+ "visit_status": VISIT_STATUS_FULL,
+ "task_status": SAVE_TASK_SUCCEEDED,
+ "origin": duplicated_origin_url,
+ }
+ ]
+ * 3
+ ) # only 1 of the origin duplicates will be scheduled as recurring task
+
+ time_now = datetime.now(tz=timezone.utc) - timedelta(days=len(entries))
+ return [
+ SaveOriginRequestInfo(
+ visit_type=meta["visit_type"],
+ visit_status=meta["visit_status"],
+ origin_url=(
+ meta["origin"] if "origin" in meta else AUTHORIZED_ORIGIN_URL % i
+ ),
+ save_request_date=time_now + timedelta(days=i - 1),
+ save_request_status=SAVE_REQUEST_ACCEPTED,
+ visit_date=time_now + timedelta(days=i),
+ save_task_status=meta["task_status"],
+ id=i,
+ loading_task_id=i,
+ )
+ for i, meta in enumerate(entries)
+ ]
+
+
+def test_command_refresh__with_recurrent_tasks_scheduling(
+ mock_scheduler, mock_refresh, fake_refreshed_data, swh_scheduler
+):
+ """Refresh status command reports status updates. The successful ones whose
+ visit type is not 'archives' are also scheduled as recurring visits.
+
+ """
+ mock_refresh.return_value = fake_refreshed_data
+
+ # only visit types (git, hg, svn) with status (full, partial) are taken into
+ # account for scheduling, so only 3 entries match in the fake data set.
+ expected_nb_scheduled = 0
+
+ origins = set()
+ expected_nb_scheduled = 0
+ for entry in fake_refreshed_data:
+ visit_type = entry["visit_type"]
+ if visit_type == "archives": # only deal with git, svn, hg
+ continue
+ if entry["visit_status"] not in ("partial", "full"):
+ continue
+ origin = entry["origin_url"]
+ if (visit_type, origin) in origins:
+ continue
+ origins.add((visit_type, origin))
+ expected_nb_scheduled += 1
+
+ assert expected_nb_scheduled == 3
+
+ out = StringIO()
+ call_command(COMMAND_NAME, stdout=out)
+
+ actual_output = out.getvalue()
+ assert f"Successfully updated {len(fake_refreshed_data)}" in actual_output
+
+ lister = swh_scheduler.get_or_create_lister(
+ name="save-code-now", instance_name=get_config()["instance_name"]
+ )
+
+ result = list(stream_results(swh_scheduler.get_listed_origins, lister.id))
+ assert len(result) == expected_nb_scheduled
+
+ assert mock_scheduler.called
+ assert mock_refresh.called

File Metadata

Mime Type
text/plain
Expires
Tue, Dec 17, 10:03 PM (2 d, 13 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3220371

Event Timeline