Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7123115
D5858.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
9 KB
Subscribers
None
D5858.diff
View Options
diff --git a/conftest.py b/conftest.py
--- a/conftest.py
+++ b/conftest.py
@@ -1 +1 @@
-pytest_plugins = ["swh.auth.pytest_plugin"]
+pytest_plugins = ["swh.auth.pytest_plugin", "swh.scheduler.pytest_plugin"]
diff --git a/requirements-test.txt b/requirements-test.txt
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -9,6 +9,7 @@
requests-mock != 1.9.0, != 1.9.1
swh.core[http] >= 0.0.95
swh.loader.git >= 0.8.0
+swh-scheduler[testing] >= 0.5.0
swh.storage >= 0.1.1
types-docutils
types-pyyaml
diff --git a/swh/web/common/management/commands/refresh_savecodenow_statuses.py b/swh/web/common/management/commands/refresh_savecodenow_statuses.py
--- a/swh/web/common/management/commands/refresh_savecodenow_statuses.py
+++ b/swh/web/common/management/commands/refresh_savecodenow_statuses.py
@@ -3,16 +3,57 @@
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from typing import Set
+
from django.core.management.base import BaseCommand
+from swh.scheduler.model import ListedOrigin
+from swh.web.common.models import VISIT_STATUS_FULL, VISIT_STATUS_PARTIAL
from swh.web.common.origin_save import refresh_save_origin_request_statuses
+from swh.web.config import get_config
+from swh.web.config import scheduler as get_scheduler
class Command(BaseCommand):
help = "Refresh save code now origin request statuses periodically"
def handle(self, *args, **options):
+ """Refresh origin save code now requests.
+
+ For the svn, git and hg visit types, origins whose visit status is full or
+ partial are also recorded as recurring origins to visit.
+
+ """
refreshed_statuses = refresh_save_origin_request_statuses()
+ scheduler = get_scheduler()
+
+ # then schedule the origins with meaningful status and type to be ingested
+ # regularly
+ lister = scheduler.get_or_create_lister(
+ name="save-code-now", instance_name=get_config()["instance_name"]
+ )
+
+ origins: Set[str, str] = set()
+ listed_origins = []
+ for status in refreshed_statuses:
+ visit_type = status["visit_type"]
+ # only deal with git, svn, hg visit types
+ if visit_type == "archives":
+ continue
+ # only keep satisfying visit statuses
+ if status["visit_status"] not in (VISIT_STATUS_PARTIAL, VISIT_STATUS_FULL):
+ continue
+ origin = status["origin_url"]
+ # drop duplicates within the same batch
+ if (visit_type, origin) in origins:
+ continue
+ origins.add((visit_type, origin))
+ listed_origins.append(
+ ListedOrigin(lister_id=lister.id, visit_type=visit_type, url=origin)
+ )
+
+ if listed_origins:
+ scheduler.record_listed_origins(listed_origins)
if len(refreshed_statuses) > 0:
msg = f"Successfully updated {len(refreshed_statuses)} save request(s)."
diff --git a/swh/web/config.py b/swh/web/config.py
--- a/swh/web/config.py
+++ b/swh/web/config.py
@@ -131,9 +131,10 @@
"metadata_search_backend": ("string", "swh-indexer-storage"), # or "swh-search"
"counters_backend": ("string", "swh-storage"), # or "swh-counters"
"staging_server_names": ("list", STAGING_SERVER_NAMES),
+ "instance_name": ("str", "archive-test.softwareheritage.org"),
}
-swhweb_config = {} # type: Dict[str, Any]
+swhweb_config: Dict[str, Any] = {}
def get_config(config_file="web/web"):
diff --git a/swh/web/tests/common/test_django_command.py b/swh/web/tests/common/test_django_command.py
--- a/swh/web/tests/common/test_django_command.py
+++ b/swh/web/tests/common/test_django_command.py
@@ -3,33 +3,175 @@
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from datetime import datetime, timedelta, timezone
from io import StringIO
import pytest
from django.core.management import call_command
+from swh.core.api.classes import stream_results
+from swh.web.common.models import (
+ SAVE_REQUEST_ACCEPTED,
+ SAVE_TASK_FAILED,
+ SAVE_TASK_SCHEDULED,
+ SAVE_TASK_SUCCEEDED,
+ VISIT_STATUS_FAILED,
+ VISIT_STATUS_FULL,
+ VISIT_STATUS_PARTIAL,
+)
+from swh.web.common.typing import SaveOriginRequestInfo
+from swh.web.config import get_config
+
+MODULE_FQDN = "swh.web.common.management.commands"
+COMMAND_NAME = "refresh_savecodenow_statuses"
+
+AUTHORIZED_ORIGIN_URL = "https://scm.ourproject.org/anonscm/%s"
+
+
+@pytest.fixture
+def mock_refresh(mocker):
+ return mocker.patch(
+ f"{MODULE_FQDN}.{COMMAND_NAME}.refresh_save_origin_request_statuses"
+ )
+
+
+@pytest.fixture
+def mock_scheduler(mocker, swh_scheduler):
+ mock_scheduler = mocker.patch(f"{MODULE_FQDN}.{COMMAND_NAME}.get_scheduler")
+ mock_scheduler.return_value = swh_scheduler
+
+ return mock_scheduler
+
@pytest.mark.parametrize("nb_results", [0, 10, 20])
-def test_command_refresh__with_statuses_refreshed(mocker, nb_results):
- """Refresh status command reported updated non-terminal statuses.
+def test_command_refresh__with_statuses_refreshed(
+ mock_scheduler, mock_refresh, nb_results
+):
+ """Refresh status command reports updates of non-terminal statuses.
"""
- command_name = "refresh_savecodenow_statuses"
- module_fqdn = "swh.web.common.management.commands"
- mock_refresh = mocker.patch(
- f"{module_fqdn}.{command_name}.refresh_save_origin_request_statuses"
- )
- # fake returned refreshed status
- mock_refresh.return_value = [{"": ""}] * nb_results
+ # fake returned refreshed status for 'archives' visit type
+ mock_refresh.return_value = [{"visit_type": "archives",}] * nb_results
out = StringIO()
- call_command(command_name, stdout=out)
-
- assert mock_refresh.called
+ call_command(COMMAND_NAME, stdout=out)
actual_output = out.getvalue()
if nb_results > 0:
assert f"updated {nb_results}" in actual_output
else:
assert "Nothing" in actual_output
+
+ assert mock_scheduler.called
+ assert mock_refresh.called
+
+
+@pytest.fixture
+def fake_refreshed_data():
+ """Build the fake list of refreshed save origin requests used by the tests
+
+ """
+ duplicated_origin_url = AUTHORIZED_ORIGIN_URL % "specific-origin"
+ entries = (
+ [
+ {
+ "visit_type": "archives", # ignored from recurring task scheduling
+ "visit_status": VISIT_STATUS_FULL,
+ "task_status": SAVE_TASK_SUCCEEDED,
+ },
+ {
+ "visit_type": "hg", # scheduled as recurring task
+ "visit_status": VISIT_STATUS_PARTIAL,
+ "task_status": SAVE_TASK_SUCCEEDED,
+ },
+ {
+ "visit_type": "svn", # scheduled as recurring task
+ "visit_status": VISIT_STATUS_PARTIAL,
+ "task_status": SAVE_TASK_SCHEDULED,
+ },
+ {
+ "visit_type": "svn", # ignored from recurring task scheduling
+ "visit_status": VISIT_STATUS_FAILED,
+ "task_status": SAVE_TASK_FAILED,
+ },
+ {
+ "visit_type": "hg", # ignored from recurring task scheduling
+ "visit_status": "created",
+ "task_status": SAVE_TASK_SCHEDULED,
+ },
+ ]
+ + [
+ {
+ "visit_type": "git",
+ "visit_status": VISIT_STATUS_FULL,
+ "task_status": SAVE_TASK_SUCCEEDED,
+ "origin": duplicated_origin_url,
+ }
+ ]
+ * 3
+ ) # only 1 of the origin duplicates will be scheduled as recurring task
+
+ time_now = datetime.now(tz=timezone.utc) - timedelta(days=len(entries))
+ return [
+ SaveOriginRequestInfo(
+ visit_type=meta["visit_type"],
+ visit_status=meta["visit_status"],
+ origin_url=(
+ meta["origin"] if "origin" in meta else AUTHORIZED_ORIGIN_URL % i
+ ),
+ save_request_date=time_now + timedelta(days=i - 1),
+ save_request_status=SAVE_REQUEST_ACCEPTED,
+ visit_date=time_now + timedelta(days=i),
+ save_task_status=meta["task_status"],
+ id=i,
+ loading_task_id=i,
+ )
+ for i, meta in enumerate(entries)
+ ]
+
+
+def test_command_refresh__with_recurrent_tasks_scheduling(
+ mock_scheduler, mock_refresh, fake_refreshed_data, swh_scheduler
+):
+ """Refresh status command reports status updates. The successful ones whose visit
+ type is not 'archives' are also scheduled as recurring origins.
+
+ """
+ mock_refresh.return_value = fake_refreshed_data
+
+ # only visit types (git, hg, svn) with status (full, partial) are taken into
+ # account for scheduling, so only 3 entries of the fake data set match.
+ expected_nb_scheduled = 0
+
+ origins = set()
+ expected_nb_scheduled = 0
+ for entry in fake_refreshed_data:
+ visit_type = entry["visit_type"]
+ if visit_type == "archives": # only deal with git, svn, hg
+ continue
+ if entry["visit_status"] not in ("partial", "full"):
+ continue
+ origin = entry["origin_url"]
+ if (visit_type, origin) in origins:
+ continue
+ origins.add((visit_type, origin))
+ expected_nb_scheduled += 1
+
+ assert expected_nb_scheduled == 3
+
+ out = StringIO()
+ call_command(COMMAND_NAME, stdout=out)
+
+ actual_output = out.getvalue()
+ assert f"Successfully updated {len(fake_refreshed_data)}" in actual_output
+
+ lister = swh_scheduler.get_or_create_lister(
+ name="save-code-now", instance_name=get_config()["instance_name"]
+ )
+
+ result = list(stream_results(swh_scheduler.get_listed_origins, lister.id))
+ assert len(result) == expected_nb_scheduled
+
+ assert mock_scheduler.called
+ assert mock_refresh.called
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Tue, Dec 17, 10:03 PM (2 d, 8 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3220371
Attached To
D5858: Schedule save code now as recurring origins to ingest when successful
Event Timeline
Log In to Comment