diff --git a/swh/web/common/origin_save.py b/swh/web/common/origin_save.py --- a/swh/web/common/origin_save.py +++ b/swh/web/common/origin_save.py @@ -33,6 +33,8 @@ SAVE_TASK_RUNNING, SAVE_TASK_SCHEDULED, SAVE_TASK_SUCCEEDED, + VISIT_STATUS_CREATED, + VISIT_STATUS_ONGOING, SaveAuthorizedOrigin, SaveOriginRequest, SaveUnauthorizedOrigin, @@ -52,6 +54,12 @@ # Number of days in the past to lookup for information MAX_THRESHOLD_DAYS = 30 +# Non terminal visit statuses which needs updates +NON_TERMINAL_STATUSES = [ + VISIT_STATUS_CREATED, + VISIT_STATUS_ONGOING, +] + def get_origin_save_authorized_urls() -> List[str]: """ @@ -268,45 +276,55 @@ if i != len(visit_dates): visit_date = visit_dates[i] visit_status = origin_visits[i]["status"] - if visit_status not in ("full", "partial", "not_found"): - visit_date = None except Exception as exc: sentry_sdk.capture_exception(exc) return visit_date, visit_status def _check_visit_update_status( - save_request: SaveOriginRequest, save_task_status: str -) -> Tuple[Optional[datetime], str]: - """Given a save request and a save task status, determine whether a save request was - successful or failed. + save_request: SaveOriginRequest, +) -> Tuple[Optional[datetime], Optional[str], Optional[str]]: + """Given a save request, determine whether a save request was successful or failed. Args: save_request: Input save origin request to retrieve information for. Returns: - Tuple of (optional visit date, save task status) for such save request origin + Tuple of (optional visit date, optional visit status, optional save task status) + for such save request origin """ visit_date, visit_status = _get_visit_info_for_save_request(save_request) - save_request.visit_date = visit_date - save_request.visit_status = visit_status + loading_task_status = None if visit_date and visit_status in ("full", "partial"): # visit has been performed, mark the saving task as succeeded - save_task_status = SAVE_TASK_SUCCEEDED + loading_task_status = SAVE_TASK_SUCCEEDED elif visit_status in ("created", "ongoing"): # visit is currently running - save_task_status = SAVE_TASK_RUNNING + loading_task_status = SAVE_TASK_RUNNING elif visit_status in ("not_found", "failed"): - save_task_status = SAVE_TASK_FAILED + loading_task_status = SAVE_TASK_FAILED else: time_now = datetime.now(tz=timezone.utc) time_delta = time_now - save_request.request_date # consider the task as failed if it is still in scheduled state # 30 days after its submission if time_delta.days > MAX_THRESHOLD_DAYS: - save_task_status = SAVE_TASK_FAILED - return visit_date, save_task_status + loading_task_status = SAVE_TASK_FAILED + return visit_date, visit_status, loading_task_status + + +def _compute_task_loading_status( + task: Optional[Dict[str, Any]] = None, task_run: Optional[Dict[str, Any]] = None, +) -> Optional[str]: + loading_task_status: Optional[str] = None + # First determine the loading task status out of task information + if task: + loading_task_status = _save_task_status[task["status"]] + if task_run: + loading_task_status = _save_task_run_status[task_run["status"]] + + return loading_task_status def _update_save_request_info( @@ -314,7 +332,8 @@ task: Optional[Dict[str, Any]] = None, task_run: Optional[Dict[str, Any]] = None, ) -> SaveOriginRequestInfo: - """Update save request information out of task and task_run information. + """Update save request information out of the visit status and fallback to the task and + task_run information if the visit status is missing. Args: save_request: Save request @@ -326,53 +345,36 @@ """ must_save = False - visit_date = save_request.visit_date - # save task still in scheduler db - if task: - save_task_status = _save_task_status[task["status"]] - if task_run: - save_task_status = _save_task_run_status[task_run["status"]] + # To determine the save code now request's final status, the visit date must be set + # and the visit status must be a final one. Once they do, the save code now is + # definitely done. + if ( + not save_request.visit_date + or not save_request.visit_status + or save_request.visit_status in NON_TERMINAL_STATUSES + ): + visit_date, visit_status, loading_task_status = _check_visit_update_status( + save_request + ) - # Consider request from which a visit date has already been found - # as succeeded to avoid retrieving it again - if save_task_status == SAVE_TASK_SCHEDULED and visit_date: - save_task_status = SAVE_TASK_SUCCEEDED - if ( - save_task_status in (SAVE_TASK_FAILED, SAVE_TASK_SUCCEEDED) - and not visit_date - ): - visit_date, visit_status = _get_visit_info_for_save_request(save_request) - save_request.visit_date = visit_date - save_request.visit_status = visit_status - if visit_status in ("failed", "not_found"): - save_task_status = SAVE_TASK_FAILED - must_save = True - # Check tasks still marked as scheduled / not yet scheduled - if save_task_status in (SAVE_TASK_SCHEDULED, SAVE_TASK_NOT_YET_SCHEDULED): - visit_date, save_task_status = _check_visit_update_status( - save_request, save_task_status - ) + if not loading_task_status: # fallback when not provided + loading_task_status = _compute_task_loading_status(task, task_run) - # save task may have been archived - else: - save_task_status = save_request.loading_task_status - if save_task_status in (SAVE_TASK_SCHEDULED, SAVE_TASK_NOT_YET_SCHEDULED): - visit_date, save_task_status = _check_visit_update_status( - save_request, save_task_status - ) + if visit_date != save_request.visit_date: + must_save = True + save_request.visit_date = visit_date - else: - save_task_status = save_request.loading_task_status + if visit_status != save_request.visit_status: + must_save = True + save_request.visit_status = visit_status - if ( - # avoid to override final loading task status when already found - # as visit status is no longer checked once a visit date has been found - save_request.loading_task_status not in (SAVE_TASK_FAILED, SAVE_TASK_SUCCEEDED) - and save_request.loading_task_status != save_task_status - ): - save_request.loading_task_status = save_task_status - must_save = True + if ( + loading_task_status is not None + and loading_task_status != save_request.loading_task_status + ): + must_save = True + save_request.loading_task_status = loading_task_status if must_save: save_request.save() @@ -467,11 +469,11 @@ ) task_kwargs = dict(**task_kwargs, artifacts=artifacts, snapshot_append=True) sor = None - # get list of previously sumitted save requests + # get list of previously submitted save requests (most recent first) current_sors = list( SaveOriginRequest.objects.filter( visit_type=visit_type, origin_url=origin_url - ) + ).order_by("-request_date") ) can_create_task = False @@ -619,10 +621,13 @@ # Retrieve accepted request statuses (all statuses) Q(status=SAVE_REQUEST_ACCEPTED), # those without the required information we need to update - Q(visit_date__isnull=True) | Q(visit_status__isnull=True), + Q(visit_date__isnull=True) + | Q(visit_status__isnull=True) + | Q(visit_status__in=NON_TERMINAL_STATUSES), # limit results to recent ones (that is roughly 30 days old at best) Q(request_date__gte=pivot_date), ) + return ( update_save_origin_requests_from_queryset(save_requests) if save_requests.count() > 0 diff --git a/swh/web/tests/common/test_origin_save.py b/swh/web/tests/common/test_origin_save.py --- a/swh/web/tests/common/test_origin_save.py +++ b/swh/web/tests/common/test_origin_save.py @@ -20,6 +20,7 @@ SAVE_TASK_RUNNING, SAVE_TASK_SCHEDULED, SAVE_TASK_SUCCEEDED, + VISIT_STATUS_CREATED, VISIT_STATUS_FULL, SaveOriginRequest, ) @@ -480,7 +481,7 @@ # check no visit date has been found assert len(sors) == 1 assert sors[0]["save_task_status"] == SAVE_TASK_RUNNING - assert sors[0]["visit_date"] is None + assert sors[0]["visit_date"] is not None assert sors[0]["visit_status"] == visit_status @@ -497,10 +498,7 @@ assert sors[0]["save_task_status"] == SAVE_TASK_FAILED visit_date = sors[0]["visit_date"] - if visit_status == "failed": - assert visit_date is None - else: - assert visit_date is not None + assert visit_date is not None sors = get_save_origin_requests(_visit_type, _origin_url) assert len(sors) == 1 @@ -542,7 +540,7 @@ assert sors[0]["save_task_status"] == SAVE_TASK_SUCCEEDED # As the entry is missing the following information though - assert sors[0]["visit_date"] is None + assert sors[0]["visit_date"] is not None assert sors[0]["visit_status"] is None # It's still detected as to be updated by the refresh routine @@ -550,7 +548,7 @@ assert len(sors) == 1 assert sors[0]["save_task_status"] == SAVE_TASK_SUCCEEDED - assert sors[0]["visit_date"] is None + assert sors[0]["visit_date"] is not None assert sors[0]["visit_status"] is None @@ -563,10 +561,39 @@ visit_started_date = date_now - timedelta(minutes=1) # returned visit status - sors = _get_save_origin_requests( - mocker, load_status=SAVE_TASK_SCHEDULED, visit_status="created", + SaveOriginRequest.objects.create( + request_date=datetime.now(tz=timezone.utc), + visit_type=_visit_type, + visit_status=VISIT_STATUS_CREATED, + origin_url=_origin_url, + status=SAVE_REQUEST_ACCEPTED, + visit_date=None, + loading_task_id=_task_id, ) - assert len(sors) == 1 + + # mock scheduler and archives + _mock_scheduler( + mocker, task_status="next_run_scheduled", task_run_status=SAVE_TASK_SCHEDULED + ) + mock_archive = mocker.patch("swh.web.common.origin_save.archive") + mock_archive.lookup_origin.return_value = {"url": _origin_url} + mock_get_origin_visits = mocker.patch( + "swh.web.common.origin_save.get_origin_visits" + ) + # create a visit for the save request with status created + visit_date = datetime.now(tz=timezone.utc).isoformat() + visit_info = OriginVisitInfo( + date=visit_date, + formatted_date="", + metadata={}, + origin=_origin_url, + snapshot="", # make mypy happy + status=VISIT_STATUS_CREATED, + type=_visit_type, + url="", + visit=34, + ) + mock_get_origin_visits.return_value = [visit_info] # make the scheduler return a running event _mock_scheduler( @@ -578,6 +605,8 @@ # The visit is detected but still running sors = refresh_save_origin_request_statuses() + + assert mock_get_origin_visits.called and mock_get_origin_visits.call_count == 1 assert len(sors) == 1 for sor in sors: @@ -585,8 +614,8 @@ # The status is updated assert sor["save_task_status"] == SAVE_TASK_RUNNING # but the following entries are missing so it's not updated - assert sor["visit_date"] is None - assert sor["visit_status"] == "created" + assert sor["visit_date"] is not None + assert sor["visit_status"] == VISIT_STATUS_CREATED # make the visit status completed # make the scheduler return a running event @@ -597,36 +626,25 @@ visit_started_date=visit_started_date, ) - # This time around, the origin returned will have all information updated - mock_get_origin_visits = mocker.patch( - "swh.web.common.origin_save.get_origin_visits" - ) - # create a visit for the save request with status created + # This time around, the origin returned will have all required information updated + # (visit date and visit status in final state) visit_date = datetime.now(tz=timezone.utc).isoformat() - visit_info = OriginVisitInfo( - date=visit_date, - formatted_date="", - metadata={}, - origin=_origin_url, - snapshot="", # make mypy happy - status="full", - type=_visit_type, - url="", - visit=34, - ) + visit_info.update({"date": visit_date, "status": VISIT_STATUS_FULL}) mock_get_origin_visits.return_value = [visit_info] # Detected entry, this time it should be updated sors = refresh_save_origin_request_statuses() assert len(sors) == 1 + assert mock_get_origin_visits.called and mock_get_origin_visits.call_count == 1 + 1 + for sor in sors: assert iso8601.parse_date(sor["save_request_date"]) >= date_pivot # as it turns out, in this test, this won't update anything as no new status got # returned by the scheduler assert sor["save_task_status"] == SAVE_TASK_SUCCEEDED assert sor["visit_date"] == visit_date - assert sor["visit_status"] == "full" + assert sor["visit_status"] == VISIT_STATUS_FULL # Once in final state, a sor should not be updated anymore sors = refresh_save_origin_request_statuses() @@ -641,10 +659,39 @@ date_now = datetime.now(tz=timezone.utc) date_pivot = date_now - timedelta(days=30) # returned visit status - sors = _get_save_origin_requests( - mocker, load_status=SAVE_TASK_SCHEDULED, visit_status=None, + SaveOriginRequest.objects.create( + request_date=datetime.now(tz=timezone.utc), + visit_type=_visit_type, + visit_status=None, + origin_url=_origin_url, + status=SAVE_REQUEST_ACCEPTED, + visit_date=None, + loading_task_id=_task_id, ) - assert len(sors) == 1 + + # mock scheduler and archives + _mock_scheduler( + mocker, task_status="next_run_scheduled", task_run_status=SAVE_TASK_SCHEDULED + ) + mock_archive = mocker.patch("swh.web.common.origin_save.archive") + mock_archive.lookup_origin.return_value = {"url": _origin_url} + mock_get_origin_visits = mocker.patch( + "swh.web.common.origin_save.get_origin_visits" + ) + # create a visit for the save request with status created + visit_date = datetime.now(tz=timezone.utc).isoformat() + visit_info = OriginVisitInfo( + date=visit_date, + formatted_date="", + metadata={}, + origin=_origin_url, + snapshot="", # make mypy happy + status=VISIT_STATUS_CREATED, + type=_visit_type, + url="", + visit=34, + ) + mock_get_origin_visits.return_value = [visit_info] # no changes so refresh does detect the entry but does nothing sors = refresh_save_origin_request_statuses() @@ -654,29 +701,26 @@ assert iso8601.parse_date(sor["save_request_date"]) >= date_pivot # as it turns out, in this test, this won't update anything as no new status got # returned by the scheduler - assert sor["save_task_status"] == SAVE_TASK_SCHEDULED + assert sor["save_task_status"] == SAVE_TASK_RUNNING # Information is empty - assert sor["visit_date"] is None - assert sor["visit_status"] is None + assert sor["visit_date"] == visit_date + assert sor["visit_status"] == VISIT_STATUS_CREATED - # make the scheduler return eventful event for that origin - _mock_scheduler(mocker) - # updates will be detected, entry should be updated but we are still missing info + # A save code now entry is detected for update, but as nothing changes, the entry + # remains in the same state sors = refresh_save_origin_request_statuses() assert len(sors) == 1 for sor in sors: assert iso8601.parse_date(sor["save_request_date"]) >= date_pivot - # The status is updated - assert sor["save_task_status"] == SAVE_TASK_SUCCEEDED - # but the following entries are missing so it's not updated - assert sor["visit_date"] is None - assert sor["visit_status"] is None + # Status is not updated as no new information is available on the visit status + # and the task status has not moved + assert sor["save_task_status"] == SAVE_TASK_RUNNING + # Information is empty + assert sor["visit_date"] == visit_date + assert sor["visit_status"] == VISIT_STATUS_CREATED # This time around, the origin returned will have all information updated - mock_get_origin_visits = mocker.patch( - "swh.web.common.origin_save.get_origin_visits" - ) # create a visit for the save request with status created visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info = OriginVisitInfo( @@ -685,7 +729,7 @@ metadata={}, origin=_origin_url, snapshot="", # make mypy happy - status="full", + status=VISIT_STATUS_FULL, type=_visit_type, url="", visit=34,