diff --git a/swh/web/common/origin_save.py b/swh/web/common/origin_save.py --- a/swh/web/common/origin_save.py +++ b/swh/web/common/origin_save.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2019 The Software Heritage developers +# Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -148,19 +148,25 @@ def _get_visit_info_for_save_request(save_request): visit_date = None visit_status = None - try: - origin = {"url": save_request.origin_url} - origin_info = service.lookup_origin(origin) - origin_visits = get_origin_visits(origin_info) - visit_dates = [parse_timestamp(v["date"]) for v in origin_visits] - i = bisect_right(visit_dates, save_request.request_date) - if i != len(visit_dates): - visit_date = visit_dates[i] - visit_status = origin_visits[i]["status"] - if origin_visits[i]["status"] == "ongoing": - visit_date = None - except Exception as exc: - sentry_sdk.capture_exception(exc) + time_now = datetime.now(tz=timezone.utc) + time_delta = time_now - save_request.request_date + # stop trying to find a visit date one month after save request submission + # as those requests to storage are expensive and the associated loading task + # surely ended up with errors + if time_delta.days <= 30: + try: + origin = {"url": save_request.origin_url} + origin_info = service.lookup_origin(origin) + origin_visits = get_origin_visits(origin_info) + visit_dates = [parse_timestamp(v["date"]) for v in origin_visits] + i = bisect_right(visit_dates, save_request.request_date) + if i != len(visit_dates): + visit_date = visit_dates[i] + visit_status = origin_visits[i]["status"] + if origin_visits[i]["status"] == "ongoing": + visit_date = None + except Exception as exc: + sentry_sdk.capture_exception(exc) return visit_date, visit_status diff --git a/swh/web/tests/common/test_origin_save.py 
b/swh/web/tests/common/test_origin_save.py --- a/swh/web/tests/common/test_origin_save.py +++ b/swh/web/tests/common/test_origin_save.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019 The Software Heritage developers +# Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -14,13 +14,21 @@ from swh.core.pytest_plugin import get_response_cb from swh.web.common.models import SaveOriginRequest -from swh.web.common.origin_save import get_save_origin_task_info +from swh.web.common.origin_save import ( + get_save_origin_task_info, + get_save_origin_requests, +) +from swh.web.common.typing import OriginVisitInfo from swh.web.config import get_config _es_url = "http://esnode1.internal.softwareheritage.org:9200" _es_workers_index_url = "%s/swh_workers-*" % _es_url +_origin_url = "https://gitlab.com/inkscape/inkscape" +_visit_type = "git" +_task_id = 203525448 + @pytest.fixture(autouse=True) def requests_mock_datadir(datadir, requests_mock_datadir): @@ -47,36 +55,14 @@ _get_save_origin_task_info_test(mocker, es_available=False) -def _get_save_origin_task_info_test(mocker, task_archived=False, es_available=True): - swh_web_config = get_config() - - if es_available: - swh_web_config.update({"es_workers_index_url": _es_workers_index_url}) - else: - swh_web_config.update({"es_workers_index_url": ""}) - - sor_id = 4473 - - SaveOriginRequest.objects.create( - id=sor_id, - request_date=datetime(2019, 8, 30, 23, 7, 3, 474294, tzinfo=timezone.utc), - visit_type="git", - origin_url="https://gitlab.com/inkscape/inkscape", - status="accepted", - loading_task_id=203525448, - visit_date=datetime(2019, 8, 30, 23, 18, 11, 54341, tzinfo=timezone.utc), - ) - +def _mock_scheduler(mocker, task_status="succeed", task_archived=False): mock_scheduler = mocker.patch("swh.web.common.origin_save.scheduler") 
task = ( { - "arguments": { - "args": [], - "kwargs": {"repo_url": "https://gitlab.com/inkscape/inkscape"}, - }, + "arguments": {"args": [], "kwargs": {"repo_url": _origin_url},}, "current_interval": timedelta(days=64), - "id": 203525448, - "next_run": datetime(2019, 8, 30, 23, 7, 1, 614823), + "id": _task_id, + "next_run": datetime.now(tz=timezone.utc) + timedelta(days=64), "policy": "oneshot", "priority": "high", "retries_left": 0, @@ -90,21 +76,42 @@ task_run = { "backend_id": "f00c712c-e820-41ce-a07c-9bf8df914205", - "ended": datetime(2019, 8, 30, 23, 18, 13, 770800), + "ended": datetime.now(tz=timezone.utc) + timedelta(minutes=5), "id": 654270631, "metadata": {}, - "scheduled": datetime(2019, 8, 30, 23, 8, 34, 282021), + "scheduled": datetime.now(tz=timezone.utc), "started": None, - "status": "failed", - "task": 203525448, + "status": task_status, + "task": _task_id, } mock_scheduler.get_task_runs.return_value = [task_run] + return task, task_run + + +def _get_save_origin_task_info_test(mocker, task_archived=False, es_available=True): + swh_web_config = get_config() + + if es_available: + swh_web_config.update({"es_workers_index_url": _es_workers_index_url}) + else: + swh_web_config.update({"es_workers_index_url": ""}) + + sor = SaveOriginRequest.objects.create( + request_date=datetime.now(tz=timezone.utc), + visit_type=_visit_type, + origin_url="https://gitlab.com/inkscape/inkscape", + status="accepted", + visit_date=datetime.now(tz=timezone.utc) + timedelta(hours=1), + loading_task_id=_task_id, + ) + + task, task_run = _mock_scheduler(mocker, task_archived=task_archived) es_response = requests.post("%s/_search" % _es_workers_index_url).json() task_exec_data = es_response["hits"]["hits"][-1]["_source"] - sor_task_info = get_save_origin_task_info(sor_id) + sor_task_info = get_save_origin_task_info(sor.id) expected_result = ( { @@ -130,3 +137,68 @@ ) assert sor_task_info == expected_result + + +@pytest.mark.django_db +def 
test_get_save_origin_requests_find_visit_date(mocker): + # create a save request + SaveOriginRequest.objects.create( + request_date=datetime.now(tz=timezone.utc), + visit_type=_visit_type, + origin_url=_origin_url, + status="accepted", + visit_date=None, + loading_task_id=_task_id, + ) + + # mock scheduler and services + _mock_scheduler(mocker) + mock_service = mocker.patch("swh.web.common.origin_save.service") + mock_service.lookup_origin.return_value = {"url": _origin_url} + mock_get_origin_visits = mocker.patch( + "swh.web.common.origin_save.get_origin_visits" + ) + # create a visit for the save request + visit_date = datetime.now(tz=timezone.utc).isoformat() + visit_info = OriginVisitInfo( + date=visit_date, + formatted_date="", + metadata={}, + origin=_origin_url, + snapshot="", + status="full", + type=_visit_type, + url="", + visit=34, + ) + mock_get_origin_visits.return_value = [visit_info] + + # check visit date has been correctly found + sors = get_save_origin_requests(_visit_type, _origin_url) + assert len(sors) == 1 + assert sors[0]["visit_date"] == visit_date + mock_get_origin_visits.assert_called_once() + + # check visit is not searched again when it has been found + get_save_origin_requests(_visit_type, _origin_url) + mock_get_origin_visits.assert_called_once() + + # check visit dates are not searched for save requests older than + # one month + sor = SaveOriginRequest.objects.create( + visit_type=_visit_type, + origin_url=_origin_url, + status="accepted", + loading_task_id=_task_id, + visit_date=None, + ) + sor.request_date = datetime.now(tz=timezone.utc) - timedelta(days=31) + sor.save() + + _mock_scheduler(mocker, task_status="failed") + + sors = get_save_origin_requests(_visit_type, _origin_url) + + assert len(sors) == 2 + assert sors[0]["visit_date"] is None + mock_get_origin_visits.assert_called_once()