diff --git a/assets/src/bundles/save/artifact-form-row.ejs b/assets/src/bundles/save/artifact-form-row.ejs
new file mode 100644
index 00000000..f9d426eb
--- /dev/null
+++ b/assets/src/bundles/save/artifact-form-row.ejs
@@ -0,0 +1,34 @@
+<%#
+ Copyright (C) 2021 The Software Heritage developers
+ See the AUTHORS file at the top-level directory of this distribution
+ License: GNU Affero General Public License version 3, or any later version
+ See top-level LICENSE file for more information
+%>
+
+
You can contribute to extending the content of the Software Heritage archive by submitting an origin
save request. To do so, fill in the required information in the form below:
Processing "save code now" request ...
A "Save code now" request takes the following parameters:
- Origin type: the type of version control system the software origin is using.
Currently, the supported types are:
- Origin url: the url of the remote repository for the software origin.
In order to avoid saving errors from Software Heritage, you should provide the clone/checkout url
as given by the provider hosting the software origin.
It can easily be found in the
web interface used to browse the software origin.
For instance, if you want to save a git
origin into the archive, you should check that the command $ git clone <origin_url>
does not return an error before submitting a request.
Once submitted, your save request can either be:
- accepted: a visit to the provided origin will then be scheduled by Software Heritage in order to
load its content into the archive as soon as possible
- rejected: the provided origin url is blacklisted and no visit will be scheduled
- put in pending state: a manual review will then be performed in order to determine if the
origin can be safely loaded or not into the archive
Once a save request has been accepted, you can follow its current status in the
submitted save requests list.
If you submitted requests while authenticated, you will be able
to display only your own requests.
Date |
Type |
Url |
Request |
Status |
Info |
|
{% endblock %}
diff --git a/swh/web/tests/api/views/test_origin_save.py b/swh/web/tests/api/views/test_origin_save.py
index be2f87b8..9705c6b5 100644
--- a/swh/web/tests/api/views/test_origin_save.py
+++ b/swh/web/tests/api/views/test_origin_save.py
@@ -1,595 +1,625 @@
# Copyright (C) 2018-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timedelta
import pytest
from django.contrib.auth.models import User
from django.core.exceptions import ObjectDoesNotExist
from django.utils import timezone
from swh.web.auth.utils import SWH_AMBASSADOR_PERMISSION
from swh.web.common.models import (
SAVE_REQUEST_ACCEPTED,
SAVE_REQUEST_PENDING,
SAVE_REQUEST_REJECTED,
SAVE_TASK_FAILED,
SAVE_TASK_NOT_CREATED,
SAVE_TASK_NOT_YET_SCHEDULED,
SAVE_TASK_SCHEDULED,
SAVE_TASK_SUCCEEDED,
VISIT_STATUS_FAILED,
VISIT_STATUS_FULL,
SaveAuthorizedOrigin,
SaveOriginRequest,
SaveUnauthorizedOrigin,
)
from swh.web.common.typing import OriginExistenceCheckInfo
from swh.web.common.utils import reverse
from swh.web.settings.tests import save_origin_rate_post
from swh.web.tests.utils import (
check_api_get_responses,
check_api_post_response,
check_api_post_responses,
)
pytestmark = pytest.mark.django_db
@pytest.fixture(autouse=True)
def populated_db():
    """Create the authorized and unauthorized origin URL entries used by the
    tests of this module (applied automatically to every test)."""
    # Plain statements: no trailing comma, which would otherwise build and
    # discard a one-element tuple around each created object.
    for authorized_url in ("https://github.com/", "https://gitlab.com/"):
        SaveAuthorizedOrigin.objects.create(url=authorized_url)
    for unauthorized_url in (
        "https://github.com/user/illegal_repo",
        "https://gitlab.com/user_to_exclude",
    ):
        SaveUnauthorizedOrigin.objects.create(url=unauthorized_url)
def test_invalid_visit_type(api_client):
    """GET on the save endpoint with visit type "foo" must answer 400."""
    url_args = {
        "visit_type": "foo",
        "origin_url": "https://github.com/torvalds/linux",
    }
    save_url = reverse("api-1-save-origin", url_args=url_args)
    check_api_get_responses(api_client, save_url, status_code=400)
def test_invalid_origin_url(api_client):
    """GET on the save endpoint with origin url "bar" must answer 400."""
    url_args = {"visit_type": "git", "origin_url": "bar"}
    save_url = reverse("api-1-save-origin", url_args=url_args)
    check_api_get_responses(api_client, save_url, status_code=400)
def check_created_save_request_status(
    api_client,
    mocker,
    origin_url,
    expected_request_status,
    scheduler_task_status=None,
    scheduler_task_run_status=None,
    expected_task_status=None,
    visit_date=None,
):
    """POST a git save request for ``origin_url`` against a mocked scheduler
    and check the response.

    The scheduler, the origin existence check and the visit info lookup of
    ``swh.web.common.origin_save`` are patched. When
    ``expected_request_status`` is SAVE_REQUEST_REJECTED a 403 is expected;
    otherwise a 200 whose request and task statuses match the expected ones.
    """

    def _load_git_task(status):
        # canned scheduler task for the origin, in the given status
        return {
            "priority": "high",
            "policy": "oneshot",
            "type": "load-git",
            "arguments": {"kwargs": {"repo_url": origin_url}, "args": []},
            "status": status,
            "id": 1,
        }

    scheduler = mocker.patch("swh.web.common.origin_save.scheduler")
    existence_check = mocker.patch("swh.web.common.origin_save.origin_exists")
    existence_check.return_value = OriginExistenceCheckInfo(
        origin_url=origin_url, exists=True, last_modified=None, content_length=None
    )

    known_tasks = []
    if scheduler_task_status is not None:
        known_tasks.append(_load_git_task(scheduler_task_status))
    scheduler.get_tasks.return_value = known_tasks

    known_task_runs = []
    if scheduler_task_run_status is not None:
        known_task_runs.append(
            {
                "backend_id": "f00c712c-e820-41ce-a07c-9bf8df914205",
                "ended": datetime.now(tz=timezone.utc) + timedelta(minutes=5),
                "id": 1,
                "metadata": {},
                "scheduled": datetime.now(tz=timezone.utc),
                "started": None,
                "status": scheduler_task_run_status,
                "task": 1,
            }
        )
    scheduler.get_task_runs.return_value = known_task_runs

    scheduler.create_tasks.return_value = [_load_git_task("next_run_not_scheduled")]

    save_url = reverse(
        "api-1-save-origin", url_args={"visit_type": "git", "origin_url": origin_url}
    )

    visit_info_lookup = mocker.patch(
        "swh.web.common.origin_save._get_visit_info_for_save_request"
    )
    visit_info_lookup.return_value = (visit_date, None)

    if expected_request_status == SAVE_REQUEST_REJECTED:
        check_api_post_responses(api_client, save_url, data=None, status_code=403)
        return

    response = check_api_post_responses(
        api_client, save_url, data=None, status_code=200
    )
    assert response.data["save_request_status"] == expected_request_status
    assert response.data["save_task_status"] == expected_task_status
def check_save_request_status(
    api_client,
    mocker,
    origin_url,
    expected_request_status,
    expected_task_status,
    scheduler_task_status="next_run_not_scheduled",
    scheduler_task_run_status=None,
    visit_date=None,
    visit_status=None,
):
    """GET the git save requests of ``origin_url`` against a mocked scheduler
    and check the statuses of the first returned request.

    The check is done twice: once with a scheduler task available, then with
    the scheduler returning no task at all.
    """
    scheduler = mocker.patch("swh.web.common.origin_save.scheduler")
    scheduler.get_tasks.return_value = [
        {
            "priority": "high",
            "policy": "oneshot",
            "type": "load-git",
            "arguments": {"kwargs": {"repo_url": origin_url}, "args": []},
            "status": scheduler_task_status,
            "id": 1,
        }
    ]

    known_task_runs = []
    if scheduler_task_run_status is not None:
        known_task_runs.append(
            {
                "backend_id": "f00c712c-e820-41ce-a07c-9bf8df914205",
                "ended": datetime.now(tz=timezone.utc) + timedelta(minutes=5),
                "id": 1,
                "metadata": {},
                "scheduled": datetime.now(tz=timezone.utc),
                "started": None,
                "status": scheduler_task_run_status,
                "task": 1,
            }
        )
    scheduler.get_task_runs.return_value = known_task_runs

    save_url = reverse(
        "api-1-save-origin", url_args={"visit_type": "git", "origin_url": origin_url}
    )

    visit_info_lookup = mocker.patch(
        "swh.web.common.origin_save._get_visit_info_for_save_request"
    )
    visit_info_lookup.return_value = (visit_date, visit_status)

    first_request = check_api_get_responses(
        api_client, save_url, status_code=200
    ).data[0]
    assert first_request["save_request_status"] == expected_request_status
    assert first_request["save_task_status"] == expected_task_status
    assert first_request["visit_status"] == visit_status

    # Check that save task status is still available when
    # the scheduler task has been archived
    scheduler.get_tasks.return_value = []
    first_request = check_api_get_responses(
        api_client, save_url, status_code=200
    ).data[0]
    assert first_request["save_task_status"] == expected_task_status
    assert first_request["visit_status"] == visit_status
def test_save_request_rejected(api_client, mocker):
    """A request on an origin listed as unauthorized is rejected and no task
    is created for it."""
    target = (api_client, mocker, "https://github.com/user/illegal_repo")
    check_created_save_request_status(
        *target, expected_request_status=SAVE_REQUEST_REJECTED
    )
    check_save_request_status(
        *target,
        expected_request_status=SAVE_REQUEST_REJECTED,
        expected_task_status=SAVE_TASK_NOT_CREATED,
    )
def test_save_request_pending(api_client, mocker):
    """A request on an origin that is neither authorized nor unauthorized is
    put in pending state, without any task created."""
    target = (api_client, mocker, "https://unkwownforge.com/user/repo")
    expectations = {
        "expected_request_status": SAVE_REQUEST_PENDING,
        "expected_task_status": SAVE_TASK_NOT_CREATED,
    }
    check_created_save_request_status(*target, **expectations)
    check_save_request_status(*target, **expectations)
def test_save_request_succeed(api_client, mocker):
    """Follow an accepted save request from creation, through scheduling, to
    a succeeded task (without, then with, a visit date and a full visit)."""
    target = (api_client, mocker, "https://github.com/Kitware/CMake")

    check_created_save_request_status(
        *target,
        expected_request_status=SAVE_REQUEST_ACCEPTED,
        expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED,
    )
    check_save_request_status(
        *target,
        expected_request_status=SAVE_REQUEST_ACCEPTED,
        expected_task_status=SAVE_TASK_SCHEDULED,
        scheduler_task_status="next_run_scheduled",
        scheduler_task_run_status="scheduled",
    )

    # settings shared by the two "task completed" checks below
    completed = {
        "expected_request_status": SAVE_REQUEST_ACCEPTED,
        "expected_task_status": SAVE_TASK_SUCCEEDED,
        "scheduler_task_status": "completed",
        "scheduler_task_run_status": "eventful",
    }
    check_save_request_status(*target, visit_date=None, **completed)

    one_hour_ahead = datetime.now(tz=timezone.utc) + timedelta(hours=1)
    check_save_request_status(
        *target,
        visit_date=one_hour_ahead,
        visit_status=VISIT_STATUS_FULL,
        **completed,
    )
def test_save_request_failed(api_client, mocker):
    """Follow an accepted save request from creation, through scheduling, to
    a failed task with a failed visit status."""
    target = (api_client, mocker, "https://gitlab.com/inkscape/inkscape")

    check_created_save_request_status(
        *target,
        expected_request_status=SAVE_REQUEST_ACCEPTED,
        expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED,
    )

    # (expected task status, scheduler task status, task run status, visit status)
    later_steps = [
        (SAVE_TASK_SCHEDULED, "next_run_scheduled", "scheduled", None),
        (SAVE_TASK_FAILED, "disabled", "failed", VISIT_STATUS_FAILED),
    ]
    for task_status, scheduler_status, run_status, visit_status in later_steps:
        check_save_request_status(
            *target,
            expected_request_status=SAVE_REQUEST_ACCEPTED,
            expected_task_status=task_status,
            scheduler_task_status=scheduler_status,
            scheduler_task_run_status=run_status,
            visit_status=visit_status,
        )
def test_create_save_request_only_when_needed(api_client, mocker):
    """Check how many save requests are stored after successive POSTs,
    depending on the status reported by the scheduler for the existing task.

    The stored count stays at 1 while the task is not yet scheduled or
    scheduled, and grows once it is reported completed or disabled.
    """
    webpack_url = "https://github.com/webpack/webpack"

    def stored_requests():
        matching = SaveOriginRequest.objects.filter(
            visit_type="git", origin_url=webpack_url
        )
        return len(list(matching))

    SaveOriginRequest.objects.create(
        visit_type="git",
        origin_url=webpack_url,
        status=SAVE_REQUEST_ACCEPTED,
        loading_task_id=56,
    )

    check_created_save_request_status(
        api_client,
        mocker,
        webpack_url,
        scheduler_task_status="next_run_not_scheduled",
        expected_request_status=SAVE_REQUEST_ACCEPTED,
        expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED,
    )
    assert stored_requests() == 1

    check_created_save_request_status(
        api_client,
        mocker,
        webpack_url,
        scheduler_task_status="next_run_scheduled",
        scheduler_task_run_status="scheduled",
        expected_request_status=SAVE_REQUEST_ACCEPTED,
        expected_task_status=SAVE_TASK_SCHEDULED,
    )
    assert stored_requests() == 1

    one_hour_ahead = datetime.now(tz=timezone.utc) + timedelta(hours=1)
    check_created_save_request_status(
        api_client,
        mocker,
        webpack_url,
        scheduler_task_status="completed",
        expected_request_status=SAVE_REQUEST_ACCEPTED,
        expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED,
        visit_date=one_hour_ahead,
    )
    # check_api_post_responses sends two POST requests to check YAML and JSON response
    assert stored_requests() == 3

    check_created_save_request_status(
        api_client,
        mocker,
        webpack_url,
        scheduler_task_status="disabled",
        expected_request_status=SAVE_REQUEST_ACCEPTED,
        expected_task_status=SAVE_TASK_NOT_YET_SCHEDULED,
    )
    assert stored_requests() == 5
def test_get_save_requests_unknown_origin(api_client):
    """GET on an origin without any save request answers 404 with a
    NotFoundExc payload naming the origin."""
    origin = "https://gitlab.com/foo/bar"
    save_url = reverse(
        "api-1-save-origin", url_args={"visit_type": "git", "origin_url": origin}
    )
    response = check_api_get_responses(api_client, save_url, status_code=404)

    reason_template = (
        "No save requests found for visit of type git on origin with url %s."
    )
    expected_payload = {
        "exception": "NotFoundExc",
        "reason": reason_template % origin,
    }
    assert response.data == expected_payload
# Visit type and origin url shared by the tests below that patch
# create_save_origin_request.
_visit_type = "git"
_origin_url = "https://github.com/python/cpython"
def test_save_requests_rate_limit(api_client, mocker):
    """After ``save_origin_rate_post`` successful POSTs, the next one must be
    answered with a 429."""

    def fake_save_request(*args, **kwargs):
        # stand-in result for the patched create_save_origin_request
        return {
            "id": 1,
            "visit_type": _visit_type,
            "origin_url": _origin_url,
            "save_request_date": datetime.now().isoformat(),
            "save_request_status": SAVE_REQUEST_ACCEPTED,
            "save_task_status": SAVE_TASK_NOT_YET_SCHEDULED,
            "visit_date": None,
            "visit_status": None,
        }

    patched_create = mocker.patch(
        "swh.web.api.views.origin_save.create_save_origin_request"
    )
    patched_create.side_effect = fake_save_request

    save_url = reverse(
        "api-1-save-origin",
        url_args={"visit_type": _visit_type, "origin_url": _origin_url},
    )

    allowed_posts = 0
    while allowed_posts < save_origin_rate_post:
        check_api_post_response(api_client, save_url, status_code=200)
        allowed_posts += 1

    check_api_post_response(api_client, save_url, status_code=429)
def test_save_request_form_server_error(api_client, mocker):
    """An exception raised by create_save_origin_request yields a 500."""
    mocker.patch(
        "swh.web.api.views.origin_save.create_save_origin_request",
        side_effect=Exception("Server error"),
    )
    url_args = {"visit_type": _visit_type, "origin_url": _origin_url}
    save_url = reverse("api-1-save-origin", url_args=url_args)
    check_api_post_responses(api_client, save_url, status_code=500)
@pytest.fixture
def origin_to_review():
    """Origin url used by the tests exercising the pending-review path."""
    review_url = "https://git.example.org/user/project"
    return review_url
def test_create_save_request_pending_review_anonymous_user(
    api_client, origin_to_review
):
    """An unauthenticated POST on the origin to review is put in pending state
    and does not register the origin as authorized."""
    url_args = {"visit_type": "git", "origin_url": origin_to_review}
    save_url = reverse("api-1-save-origin", url_args=url_args)

    response = check_api_post_responses(api_client, save_url, status_code=200)
    assert response.data["save_request_status"] == SAVE_REQUEST_PENDING

    with pytest.raises(ObjectDoesNotExist):
        SaveAuthorizedOrigin.objects.get(url=origin_to_review)
# POST an "archives" save request as a user holding the ambassador permission
# and check that it is accepted and that the origin gets registered as
# authorized.
def test_create_save_request_archives_with_ambassador_user(
api_client, origin_to_review, keycloak_oidc, mocker, requests_mock,
):
# log in with the ambassador permission and send the token with each request
keycloak_oidc.realm_permissions = [SWH_AMBASSADOR_PERMISSION]
oidc_profile = keycloak_oidc.login()
api_client.credentials(HTTP_AUTHORIZATION=f"Bearer {oidc_profile['refresh_token']}")
originUrl = "https://somewhere.org/simple"
artifact_version = "1.2.3"
artifact_filename = f"tarball-{artifact_version}.tar.gz"
artifact_url = f"{originUrl}/{artifact_filename}"
content_length = "100"
last_modified = "Sun, 21 Aug 2011 16:26:32 GMT"
# register a mocked HEAD response for the artifact url
requests_mock.head(
artifact_url,
status_code=200,
headers={"content-length": content_length, "last-modified": last_modified,},
)
# the scheduler is patched: no task run, and a canned task on creation
mock_scheduler = mocker.patch("swh.web.common.origin_save.scheduler")
mock_scheduler.get_task_runs.return_value = []
mock_scheduler.create_tasks.return_value = [
{
"id": 10,
"priority": "high",
"policy": "oneshot",
"status": "next_run_not_scheduled",
"type": "load-archive-files",
"arguments": {
"args": [],
"kwargs": {
"url": originUrl,
"artifacts": [
{
"url": artifact_url,
"version": artifact_version,
"time": last_modified,
"length": content_length,
}
],
},
},
},
]
- # then
url = reverse(
"api-1-save-origin",
url_args={"visit_type": "archives", "origin_url": originUrl,},
)
response = check_api_post_response(
api_client,
url,
status_code=200,
- data={"artifact_url": artifact_url, "artifact_version": artifact_version,},
+ data={
+ "archives_data": [
+ {"artifact_url": artifact_url, "artifact_version": artifact_version,}
+ ]
+ },
)
assert response.data["save_request_status"] == SAVE_REQUEST_ACCEPTED
assert SaveAuthorizedOrigin.objects.get(url=originUrl)
+def test_create_save_request_archives_missing_artifacts_data(
+ api_client, origin_to_review, keycloak_oidc, mocker, requests_mock,
+):
+
+ keycloak_oidc.realm_permissions = [SWH_AMBASSADOR_PERMISSION]
+ oidc_profile = keycloak_oidc.login()
+ api_client.credentials(HTTP_AUTHORIZATION=f"Bearer {oidc_profile['refresh_token']}")
+
+ originUrl = "https://somewhere.org/simple"
+
+ url = reverse(
+ "api-1-save-origin",
+ url_args={"visit_type": "archives", "origin_url": originUrl,},
+ )
+
+ response = check_api_post_response(api_client, url, status_code=400, data={},)
+ assert "Artifacts data are missing" in response.data["reason"]
+
+ response = check_api_post_response(
+ api_client,
+ url,
+ status_code=400,
+ data={"archives_data": [{"artifact_url": "", "arttifact_version": "1.0"}]},
+ )
+ assert "Missing url or version for an artifact to load" in response.data["reason"]
+
+
def test_create_save_request_archives_accepted_ambassador_user(
    api_client, origin_to_review, keycloak_oidc, mocker
):
    """A save request posted by a user holding the ambassador permission is
    accepted directly and the origin is registered as authorized."""
    keycloak_oidc.realm_permissions = [SWH_AMBASSADOR_PERMISSION]
    profile = keycloak_oidc.login()
    api_client.credentials(HTTP_AUTHORIZATION=f"Bearer {profile['refresh_token']}")

    expectations = {
        "expected_request_status": SAVE_REQUEST_ACCEPTED,
        "expected_task_status": SAVE_TASK_NOT_YET_SCHEDULED,
    }
    check_created_save_request_status(
        api_client, mocker, origin_to_review, **expectations
    )

    assert SaveAuthorizedOrigin.objects.get(url=origin_to_review)
def test_create_save_request_anonymous_user_no_user_id(api_client):
    """A save request posted without authentication stores no user id."""
    repo_url = "https://some.git.hosters/user/repo"
    url_args = {"visit_type": "git", "origin_url": repo_url}
    save_url = reverse("api-1-save-origin", url_args=url_args)

    check_api_post_responses(api_client, save_url, status_code=200)

    stored_request = SaveOriginRequest.objects.get(origin_url=repo_url)
    assert stored_request.user_ids is None
def test_create_save_request_authenticated_user_id(
    api_client, origin_to_review, keycloak_oidc, mocker
):
    """A save request posted by an authenticated user stores that user's id,
    wrapped in double quotes, in ``user_ids``."""
    profile = keycloak_oidc.login()
    api_client.credentials(HTTP_AUTHORIZATION=f"Bearer {profile['refresh_token']}")

    url_args = {
        "visit_type": "git",
        "origin_url": "https://some.git.hosters/user/repo2",
    }
    save_url = reverse("api-1-save-origin", url_args=url_args)
    response = check_api_post_response(api_client, save_url, status_code=200)

    requester_id = response.wsgi_request.user.id
    assert requester_id is not None

    quoted_id = f'"{requester_id}"'
    stored_request = SaveOriginRequest.objects.get(user_ids=quoted_id)
    assert stored_request.user_ids == quoted_id
def test_create_pending_save_request_multiple_authenticated_users(api_client):
    """Two different logged-in users posting the same save request must both
    be found through the ``user_ids`` field of a single stored request each."""
    users = [
        User.objects.create_user(username=name, password="")
        for name in ("first_user", "second_user")
    ]
    url_args = {
        "visit_type": "git",
        "origin_url": "https://some.git.hosters/user/repo3",
    }
    save_url = reverse("api-1-save-origin", url_args=url_args)

    for user in users:
        api_client.force_login(user)
        check_api_post_response(api_client, save_url, status_code=200)

    for user in users:
        assert SaveOriginRequest.objects.get(user_ids__contains=f'"{user.id}"')
diff --git a/swh/web/tests/common/test_origin_save.py b/swh/web/tests/common/test_origin_save.py
index 1c6a0294..59329394 100644
--- a/swh/web/tests/common/test_origin_save.py
+++ b/swh/web/tests/common/test_origin_save.py
@@ -1,600 +1,594 @@
# Copyright (C) 2019-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timedelta, timezone
from functools import partial
import re
from typing import Optional
import iso8601
import pytest
import requests
from swh.core.pytest_plugin import get_response_cb
from swh.web.common.exc import BadInputExc
from swh.web.common.models import (
SAVE_REQUEST_ACCEPTED,
SAVE_TASK_FAILED,
SAVE_TASK_RUNNING,
SAVE_TASK_SCHEDULED,
SAVE_TASK_SUCCEEDED,
VISIT_STATUS_FULL,
SaveOriginRequest,
)
from swh.web.common.origin_save import (
_check_origin_exists,
_check_visit_type_savable,
_visit_type_task,
_visit_type_task_privileged,
get_savable_visit_types,
get_save_origin_requests,
get_save_origin_task_info,
origin_exists,
refresh_save_origin_request_statuses,
)
from swh.web.common.typing import (
OriginExistenceCheckInfo,
OriginVisitInfo,
SaveOriginRequestInfo,
)
from swh.web.config import get_config
# Module-level values shared by the tests below: the url used for the
# "es_workers_index_url" config entry, and the origin url, visit type and
# scheduler task id used when creating save requests and mocked tasks.
_es_url = "http://esnode1.internal.softwareheritage.org:9200"
_es_workers_index_url = "%s/swh_workers-*" % _es_url
_origin_url = "https://gitlab.com/inkscape/inkscape"
_visit_type = "git"
_task_id = 203525448
@pytest.fixture(autouse=True)
def requests_mock_datadir(datadir, requests_mock_datadir):
    """Extend the inherited fixture so that POST requests on any http(s) url
    are answered through ``get_response_cb`` bound to ``datadir``."""
    any_http_url = re.compile("https?://")
    respond_from_datadir = partial(get_response_cb, datadir=datadir)
    requests_mock_datadir.post(any_http_url, body=respond_from_datadir)
    return requests_mock_datadir
@pytest.mark.django_db
def test_get_save_origin_archived_task_info(mocker):
    """Task info check with the scheduler task flagged as archived."""
    scenario = {"task_archived": True}
    _get_save_origin_task_info_test(mocker, **scenario)
@pytest.mark.django_db
def test_get_save_origin_task_full_info_with_es(mocker):
    """Task info check with the es index url configured (full info)."""
    scenario = {"es_available": True}
    _get_save_origin_task_info_test(mocker, **scenario)
@pytest.mark.django_db
def test_get_save_origin_task_info_with_es(mocker):
    """Task info check with the es index url configured (not full info)."""
    scenario = {"es_available": True, "full_info": False}
    _get_save_origin_task_info_test(mocker, **scenario)
@pytest.mark.django_db
def test_get_save_origin_task_info_without_es(mocker):
    """Task info check with an empty es index url in the configuration."""
    scenario = {"es_available": False}
    _get_save_origin_task_info_test(mocker, **scenario)
def _mock_scheduler(
    mocker, task_status="completed", task_run_status="eventful", task_archived=False
):
    """Patch the scheduler of ``swh.web.common.origin_save`` with one canned
    task and one canned task run.

    When ``task_archived`` is true, ``get_tasks`` and ``get_task_runs`` each
    return ``[None]`` instead of a copy of the canned data.

    Returns:
        the (task, task_run) dicts that were built.
    """
    scheduler = mocker.patch("swh.web.common.origin_save.scheduler")

    task = {
        "arguments": {"args": [], "kwargs": {"repo_url": _origin_url},},
        "current_interval": timedelta(days=64),
        "id": _task_id,
        "next_run": datetime.now(tz=timezone.utc) + timedelta(days=64),
        "policy": "oneshot",
        "priority": "high",
        "retries_left": 0,
        "status": task_status,
        "type": "load-git",
    }
    scheduler.get_tasks.return_value = [None if task_archived else dict(task)]

    task_run = {
        "backend_id": "f00c712c-e820-41ce-a07c-9bf8df914205",
        "ended": datetime.now(tz=timezone.utc) + timedelta(minutes=5),
        "id": 654270631,
        "metadata": {},
        "scheduled": datetime.now(tz=timezone.utc),
        "started": None,
        "status": task_run_status,
        "task": _task_id,
    }
    scheduler.get_task_runs.return_value = [
        None if task_archived else dict(task_run)
    ]

    return task, task_run
@pytest.mark.parametrize(
    "wrong_type,privileged_user",
    [
        ("dummy", True),
        ("dumb", False),
        ("archives", False),  # when no privilege, this is rejected
    ],
)
def test__check_visit_type_savable(wrong_type, privileged_user):
    """Unsupported (type, privilege) pairs raise BadInputExc, while "archives"
    passes for a privileged user."""
    expected_failure = pytest.raises(BadInputExc, match="Allowed types")
    with expected_failure:
        _check_visit_type_savable(wrong_type, privileged_user)

    # when privileged_user, the following is accepted though
    _check_visit_type_savable("archives", True)
def test_get_savable_visit_types():
    """Savable visit types are the keys of ``_visit_type_task``, extended with
    those of ``_visit_type_task_privileged`` for privileged users."""
    regular_types = set(_visit_type_task)
    assert set(get_savable_visit_types()) == regular_types

    privileged_types = regular_types | set(_visit_type_task_privileged)
    assert set(get_savable_visit_types(privileged_user=True)) == privileged_types
# Shared body of the get_save_origin_task_info tests: creates an accepted save
# request, mocks the scheduler, then compares the function result with a
# dict rebuilt from the mocked task, task run and search response.
def _get_save_origin_task_info_test(
mocker, task_archived=False, es_available=True, full_info=True
):
# set or blank the "es_workers_index_url" config entry
swh_web_config = get_config()
if es_available:
swh_web_config.update({"es_workers_index_url": _es_workers_index_url})
else:
swh_web_config.update({"es_workers_index_url": ""})
sor = SaveOriginRequest.objects.create(
request_date=datetime.now(tz=timezone.utc),
visit_type=_visit_type,
origin_url="https://gitlab.com/inkscape/inkscape",
status=SAVE_REQUEST_ACCEPTED,
visit_date=datetime.now(tz=timezone.utc) + timedelta(hours=1),
loading_task_id=_task_id,
)
task, task_run = _mock_scheduler(mocker, task_archived=task_archived)
# the last hit of the search response provides the expected execution data
es_response = requests.post("%s/_search" % _es_workers_index_url).json()
task_exec_data = es_response["hits"]["hits"][-1]["_source"]
sor_task_info = get_save_origin_task_info(sor.id, full_info=full_info)
# an archived task is expected to yield an empty dict
expected_result = (
{
"type": task["type"],
"arguments": task["arguments"],
"id": task["id"],
"backend_id": task_run["backend_id"],
"scheduled": task_run["scheduled"],
"started": task_run["started"],
"ended": task_run["ended"],
"status": task_run["status"],
"visit_status": sor.visit_status,
}
if not task_archived
else {}
)
if es_available and not task_archived:
expected_result.update(
{
"message": task_exec_data["message"],
"name": task_exec_data["swh_task_name"],
"worker": task_exec_data["hostname"],
}
)
if not full_info:
expected_result.pop("id", None)
expected_result.pop("backend_id", None)
expected_result.pop("worker", None)
# NOTE(review): indentation is lost in this view; the message rewrite below is
# presumably nested under "if not full_info" - confirm against the real file.
# It keeps the message lines preceding the first one starting with
# "Traceback", then appends the last line of the message.
if "message" in expected_result:
message = ""
message_lines = expected_result["message"].split("\n")
for line in message_lines:
if line.startswith("Traceback"):
break
message += f"{line}\n"
message += message_lines[-1]
expected_result["message"] = message
assert sor_task_info == expected_result
# Checks that get_save_origin_requests fills in the visit date from the mocked
# origin visits, does not look visits up again once the date is known, and
# does not look them up for a save request dated 31 days in the past.
@pytest.mark.django_db
def test_get_save_origin_requests_find_visit_date(mocker):
# create a save request
SaveOriginRequest.objects.create(
request_date=datetime.now(tz=timezone.utc),
visit_type=_visit_type,
origin_url=_origin_url,
status=SAVE_REQUEST_ACCEPTED,
visit_date=None,
loading_task_id=_task_id,
)
# mock scheduler and archive
_mock_scheduler(mocker)
mock_archive = mocker.patch("swh.web.common.origin_save.archive")
mock_archive.lookup_origin.return_value = {"url": _origin_url}
mock_get_origin_visits = mocker.patch(
"swh.web.common.origin_save.get_origin_visits"
)
# create a visit for the save request
visit_date = datetime.now(tz=timezone.utc).isoformat()
visit_info = OriginVisitInfo(
date=visit_date,
formatted_date="",
metadata={},
origin=_origin_url,
snapshot="",
status="full",
type=_visit_type,
url="",
visit=34,
)
mock_get_origin_visits.return_value = [visit_info]
# check visit date has been correctly found
sors = get_save_origin_requests(_visit_type, _origin_url)
assert len(sors) == 1
assert sors[0]["save_task_status"] == SAVE_TASK_SUCCEEDED
assert sors[0]["visit_date"] == visit_date
mock_get_origin_visits.assert_called_once()
# check visit is not searched again when it has been found
get_save_origin_requests(_visit_type, _origin_url)
mock_get_origin_visits.assert_called_once()
# check visit date are not searched for save requests older than
# one month
sor = SaveOriginRequest.objects.create(
visit_type=_visit_type,
origin_url=_origin_url,
status=SAVE_REQUEST_ACCEPTED,
loading_task_id=_task_id,
visit_date=None,
)
# the request date is overwritten after creation, then saved
sor.request_date = datetime.now(tz=timezone.utc) - timedelta(days=31)
sor.save()
_mock_scheduler(mocker, task_status="disabled", task_run_status="failed")
sors = get_save_origin_requests(_visit_type, _origin_url)
assert len(sors) == 2
assert sors[0]["save_task_status"] == SAVE_TASK_FAILED
assert sors[0]["visit_date"] is None
mock_get_origin_visits.assert_called_once()
def _get_save_origin_requests(
    mocker, load_status, visit_status, request_date: Optional[datetime] = None
):
    """Create an accepted save request, mock the scheduler, the archive and a
    single origin visit, then return what ``get_save_origin_requests`` yields.

    The mocked visit lookup is asserted to have been called exactly once.
    ``request_date`` is accepted but not read by this helper.
    """
    SaveOriginRequest.objects.create(
        request_date=datetime.now(tz=timezone.utc),
        visit_type=_visit_type,
        visit_status=visit_status,
        origin_url=_origin_url,
        status=SAVE_REQUEST_ACCEPTED,
        visit_date=None,
        loading_task_id=_task_id,
    )

    # mock scheduler and archives
    _mock_scheduler(
        mocker, task_status="next_run_scheduled", task_run_status=load_status
    )
    archive = mocker.patch("swh.web.common.origin_save.archive")
    archive.lookup_origin.return_value = {"url": _origin_url}
    visits_lookup = mocker.patch("swh.web.common.origin_save.get_origin_visits")

    # single visit for the save request, carrying the requested visit status
    visits_lookup.return_value = [
        OriginVisitInfo(
            date=datetime.now(tz=timezone.utc).isoformat(),
            formatted_date="",
            metadata={},
            origin=_origin_url,
            snapshot="",  # make mypy happy
            status=visit_status,
            type=_visit_type,
            url="",
            visit=34,
        )
    ]

    save_requests = get_save_origin_requests(_visit_type, _origin_url)
    visits_lookup.assert_called_once()

    return save_requests
@pytest.mark.parametrize("visit_date", [None, "some-date"])
def test_from_save_origin_request_to_save_request_info_dict(visit_date):
    """Ensure save request to json serializable dict is fine

    The parametrized ``visit_date`` only toggles whether the request carries
    a visit date (set five minutes after the request date) or none.
    """
    # computed once, so the visit date derives from the request date in use
    request_date = datetime.now(tz=timezone.utc)
    _visit_date = request_date + timedelta(minutes=5) if visit_date else None
    sor = SaveOriginRequest(
        request_date=request_date,
        visit_type=_visit_type,
        visit_status=VISIT_STATUS_FULL,
        origin_url=_origin_url,
        status=SAVE_REQUEST_ACCEPTED,
        loading_task_status=None,
        visit_date=_visit_date,
        loading_task_id=1,
    )

    assert sor.to_dict() == SaveOriginRequestInfo(
        id=sor.id,
        origin_url=sor.origin_url,
        visit_type=sor.visit_type,
        save_request_date=sor.request_date.isoformat(),
        save_request_status=sor.status,
        save_task_status=sor.loading_task_status,
        visit_status=sor.visit_status,
        visit_date=_visit_date.isoformat() if _visit_date else None,
        loading_task_id=sor.loading_task_id,
    )
def test__check_origin_exists_404(requests_mock):
    """A url answering 404 to HEAD makes the check raise BadInputExc."""
    missing_url = "https://example.org/some-inexistant-url"
    requests_mock.head(missing_url, status_code=404)

    expected_failure = pytest.raises(BadInputExc, match="not exist")
    with expected_failure:
        _check_origin_exists(missing_url)
-@pytest.mark.parametrize("invalid_origin", [None, ""])
-def test__check_origin_invalid_input(invalid_origin):
- with pytest.raises(BadInputExc, match="must be set"):
- _check_origin_exists(invalid_origin)
-
-
def test__check_origin_exists_200(requests_mock):
    """A url answering 200 to HEAD passes the check, which returns the same
    value as ``origin_exists`` for that url."""
    existing_url = "https://example.org/url"
    requests_mock.head(existing_url, status_code=200)

    # passes the check
    checked_metadata = _check_origin_exists(existing_url)

    # and we actually may have retrieved some metadata on the origin
    direct_metadata = origin_exists(existing_url)
    assert checked_metadata == direct_metadata
def test_origin_exists_404(requests_mock):
    """Origin which does not exist should be reported as inexistent"""
    missing_url = "https://example.org/some-inexistant-url"
    requests_mock.head(missing_url, status_code=404)

    expected_info = OriginExistenceCheckInfo(
        origin_url=missing_url, exists=False, last_modified=None, content_length=None,
    )
    assert origin_exists(missing_url) == expected_info
def test_origin_exists_200_no_data(requests_mock):
    """Existing origin should be reported as such (no extra information)"""
    existing_url = "http://example.org/real-url"
    requests_mock.head(existing_url, status_code=200)

    expected_info = OriginExistenceCheckInfo(
        origin_url=existing_url, exists=True, last_modified=None, content_length=None,
    )
    assert origin_exists(existing_url) == expected_info
def test_origin_exists_200_with_data(requests_mock):
    """Existing origin should be reported as such (+ extra information)"""
    existing_url = "http://example.org/real-url"
    response_headers = {
        "content-length": "10",
        "last-modified": "Sun, 21 Aug 2011 16:26:32 GMT",
    }
    requests_mock.head(existing_url, status_code=200, headers=response_headers)

    expected_info = OriginExistenceCheckInfo(
        origin_url=existing_url,
        exists=True,
        content_length=10,
        last_modified="2011-08-21T16:26:32",
    )
    assert origin_exists(existing_url) == expected_info
def test_origin_exists_200_with_data_unexpected_date_format(requests_mock):
    """Existing origin should be ok, unexpected last modif time result in no time"""
    existing_url = "http://example.org/real-url2"
    # this is parsable but not as expected
    response_headers = {"last-modified": "Sun, 21 Aug 2021 16:26:32"}
    requests_mock.head(existing_url, status_code=200, headers=response_headers)

    # so the resulting date is None
    expected_info = OriginExistenceCheckInfo(
        origin_url=existing_url, exists=True, content_length=None, last_modified=None,
    )
    assert origin_exists(existing_url) == expected_info
@pytest.mark.django_db
@pytest.mark.parametrize("visit_status", ["created", "ongoing",])
def test_get_save_origin_requests_no_visit_date_found(mocker, visit_status):
    """Visits still in progress (created, ongoing) leave the save task running
    and the save request without any visit date

    """
    sors = _get_save_origin_requests(
        mocker, load_status="scheduled", visit_status=visit_status,
    )
    # check no visit date has been found
    assert len(sors) == 1
    assert sors[0]["save_task_status"] == SAVE_TASK_RUNNING
    assert sors[0]["visit_date"] is None
    assert sors[0]["visit_status"] == visit_status
@pytest.mark.django_db
@pytest.mark.parametrize("visit_status", ["not_found", "failed"])
def test_get_save_origin_requests_no_failed_status_override(mocker, visit_status):
    """An uneventful load whose visit status is failed or not found is marked failed
    """
    save_requests = _get_save_origin_requests(
        mocker, load_status="uneventful", visit_status=visit_status
    )
    assert len(save_requests) == 1
    first = save_requests[0]
    assert first["save_task_status"] == SAVE_TASK_FAILED

    # only the "failed" visit status comes without a visit date
    visit_date = first["visit_date"]
    if visit_status != "failed":
        assert visit_date is not None
    else:
        assert visit_date is None

    # listing the save requests of that origin must report the same outcome
    save_requests = get_save_origin_requests(_visit_type, _origin_url)
    assert len(save_requests) == 1
    listed = save_requests[0]
    assert listed["save_task_status"] == SAVE_TASK_FAILED
    assert listed["visit_status"] == visit_status
@pytest.mark.django_db
@pytest.mark.parametrize(
    "load_status,visit_status",
    [("eventful", "full"), ("eventful", "partial"), ("uneventful", "partial"),],
)
def test_get_visit_info_for_save_request_succeeded(mocker, load_status, visit_status):
    """Nominal scenario, below 30 days, returns something

    Each (load status, visit status) pair must end up as a succeeded save task
    with a visit date and the given visit status.
    """
    sors = _get_save_origin_requests(
        mocker, load_status=load_status, visit_status=visit_status
    )
    assert len(sors) == 1
    assert sors[0]["save_task_status"] == SAVE_TASK_SUCCEEDED
    assert sors[0]["visit_date"] is not None
    assert sors[0]["visit_status"] == visit_status

    # listing the save requests of that origin must report the same outcome
    sors = get_save_origin_requests(_visit_type, _origin_url)
    # check the size before indexing, so an empty listing fails on a clear
    # assertion instead of an IndexError
    assert len(sors) == 1
    assert sors[0]["save_task_status"] == SAVE_TASK_SUCCEEDED
    assert sors[0]["visit_status"] == visit_status
@pytest.mark.django_db
@pytest.mark.parametrize("load_status", ["eventful", "uneventful"])
def test_get_visit_info_incomplete_visit_still_successful(mocker, load_status):
    """A task lacking visit information is still marked as succeeded
    """

    def check_succeeded_without_visit_info(save_requests):
        # a single succeeded entry, with neither visit date nor visit status
        assert len(save_requests) == 1
        entry = save_requests[0]
        assert entry["save_task_status"] == SAVE_TASK_SUCCEEDED
        assert entry["visit_date"] is None
        assert entry["visit_status"] is None

    check_succeeded_without_visit_info(
        _get_save_origin_requests(mocker, load_status=load_status, visit_status=None)
    )

    # the refresh routine still picks the entry up, in the very same state
    check_succeeded_without_visit_info(refresh_save_origin_request_statuses())
@pytest.mark.django_db
def test_refresh_save_request_statuses(mocker, api_client, archive_data):
    """Refresh filters save origins requests and update if changes

    The refresh routine is called four times in a row: without any change,
    once the task is reported as succeeded but no visit is available, once a
    full visit is available, and finally when nothing is left to update.
    """
    # NOTE(review): api_client and archive_data are not referenced in the body;
    # presumably they are requested for their fixture side effects - confirm
    date_now = datetime.now(tz=timezone.utc)
    date_pivot = date_now - timedelta(days=30)

    # initial state: load task scheduled, no visit status
    sors = _get_save_origin_requests(
        mocker, load_status=SAVE_TASK_SCHEDULED, visit_status=None,
    )
    assert len(sors) == 1

    # no changes so refresh does detect the entry but does nothing
    sors = refresh_save_origin_request_statuses()
    assert len(sors) == 1
    for sor in sors:
        assert iso8601.parse_date(sor["save_request_date"]) >= date_pivot
        # the task status is left untouched
        assert sor["save_task_status"] == SAVE_TASK_SCHEDULED
        # and the visit information is still empty
        assert sor["visit_date"] is None
        assert sor["visit_status"] is None

    # make the scheduler return eventful event for that origin
    _mock_scheduler(mocker)

    # updates will be detected, entry should be updated but we are still missing info
    sors = refresh_save_origin_request_statuses()
    assert len(sors) == 1
    for sor in sors:
        assert iso8601.parse_date(sor["save_request_date"]) >= date_pivot
        # the task status is updated
        assert sor["save_task_status"] == SAVE_TASK_SUCCEEDED
        # but the visit information is still missing
        assert sor["visit_date"] is None
        assert sor["visit_status"] is None

    # This time around, the origin returned will have all information updated
    mock_get_origin_visits = mocker.patch(
        "swh.web.common.origin_save.get_origin_visits"
    )
    # provide a visit with status full for the save request
    visit_date = datetime.now(tz=timezone.utc).isoformat()
    visit_info = OriginVisitInfo(
        date=visit_date,
        formatted_date="",
        metadata={},
        origin=_origin_url,
        snapshot="",  # make mypy happy
        status="full",
        type=_visit_type,
        url="",
        visit=34,
    )
    mock_get_origin_visits.return_value = [visit_info]

    # Detected entry, this time it should be updated
    sors = refresh_save_origin_request_statuses()
    assert len(sors) == 1
    for sor in sors:
        assert iso8601.parse_date(sor["save_request_date"]) >= date_pivot
        # the task is still succeeded and the visit information is now filled in
        assert sor["save_task_status"] == SAVE_TASK_SUCCEEDED
        assert sor["visit_date"] == visit_date
        assert sor["visit_status"] == "full"

    # This time, nothing left to update
    sors = refresh_save_origin_request_statuses()
    assert len(sors) == 0