diff --git a/swh/web/common/origin_save.py b/swh/web/common/origin_save.py --- a/swh/web/common/origin_save.py +++ b/swh/web/common/origin_save.py @@ -268,45 +268,57 @@ if i != len(visit_dates): visit_date = visit_dates[i] visit_status = origin_visits[i]["status"] - if visit_status not in ("full", "partial", "not_found"): - visit_date = None + # if visit_status not in ("full", "partial", "not_found"): + # visit_date = None except Exception as exc: sentry_sdk.capture_exception(exc) return visit_date, visit_status def _check_visit_update_status( - save_request: SaveOriginRequest, save_task_status: str -) -> Tuple[Optional[datetime], str]: - """Given a save request and a save task status, determine whether a save request was - successful or failed. + save_request: SaveOriginRequest, +) -> Tuple[Optional[datetime], Optional[str], Optional[str]]: + """Given a save request, determine whether a save request was successful or failed. Args: save_request: Input save origin request to retrieve information for. Returns: - Tuple of (optional visit date, save task status) for such save request origin + Tuple of (optional visit date, optional visit status, optional save task status) + for such save request origin """ visit_date, visit_status = _get_visit_info_for_save_request(save_request) - save_request.visit_date = visit_date - save_request.visit_status = visit_status + loading_task_status = None if visit_date and visit_status in ("full", "partial"): # visit has been performed, mark the saving task as succeeded - save_task_status = SAVE_TASK_SUCCEEDED + loading_task_status = SAVE_TASK_SUCCEEDED elif visit_status in ("created", "ongoing"): # visit is currently running - save_task_status = SAVE_TASK_RUNNING + loading_task_status = SAVE_TASK_RUNNING elif visit_status in ("not_found", "failed"): - save_task_status = SAVE_TASK_FAILED + loading_task_status = SAVE_TASK_FAILED else: time_now = datetime.now(tz=timezone.utc) time_delta = time_now - save_request.request_date # consider the task as failed if it is still in scheduled state # 30 days after its submission if time_delta.days > MAX_THRESHOLD_DAYS: - save_task_status = SAVE_TASK_FAILED - return visit_date, save_task_status + loading_task_status = SAVE_TASK_FAILED + return visit_date, visit_status, loading_task_status + + +def _compute_task_loading_status( + task: Optional[Dict[str, Any]] = None, task_run: Optional[Dict[str, Any]] = None, +) -> Optional[str]: + loading_task_status: Optional[str] = None + # First determine the loading task status out of task information + if task: + loading_task_status = _save_task_status[task["status"]] + if task_run: + loading_task_status = _save_task_run_status[task_run["status"]] + + return loading_task_status def _update_save_request_info( @@ -326,53 +338,32 @@ """ must_save = False - visit_date = save_request.visit_date - # save task still in scheduler db - if task: - save_task_status = _save_task_status[task["status"]] - if task_run: - save_task_status = _save_task_run_status[task_run["status"]] + # To determine the save code now request's full status, the visit date and visit + # status must be retrieved and set. Once they do, the save code now is definitely + # done. + if not save_request.visit_date or not save_request.visit_status: + visit_date, visit_status, loading_task_status = _check_visit_update_status( + save_request + ) - # Consider request from which a visit date has already been found - # as succeeded to avoid retrieving it again - if save_task_status == SAVE_TASK_SCHEDULED and visit_date: - save_task_status = SAVE_TASK_SUCCEEDED - if ( - save_task_status in (SAVE_TASK_FAILED, SAVE_TASK_SUCCEEDED) - and not visit_date - ): - visit_date, visit_status = _get_visit_info_for_save_request(save_request) - save_request.visit_date = visit_date - save_request.visit_status = visit_status - if visit_status in ("failed", "not_found"): - save_task_status = SAVE_TASK_FAILED + if not loading_task_status: # fallback when not provided + loading_task_status = _compute_task_loading_status(task, task_run) + + if visit_date != save_request.visit_date: must_save = True - # Check tasks still marked as scheduled / not yet scheduled - if save_task_status in (SAVE_TASK_SCHEDULED, SAVE_TASK_NOT_YET_SCHEDULED): - visit_date, save_task_status = _check_visit_update_status( - save_request, save_task_status - ) + save_request.visit_date = visit_date - # save task may have been archived - else: - save_task_status = save_request.loading_task_status - if save_task_status in (SAVE_TASK_SCHEDULED, SAVE_TASK_NOT_YET_SCHEDULED): - visit_date, save_task_status = _check_visit_update_status( - save_request, save_task_status - ) + if visit_status != save_request.visit_status: + must_save = True + save_request.visit_status = visit_status - else: - save_task_status = save_request.loading_task_status - - if ( - # avoid to override final loading task status when already found - # as visit status is no longer checked once a visit date has been found - save_request.loading_task_status not in (SAVE_TASK_FAILED, SAVE_TASK_SUCCEEDED) - and save_request.loading_task_status != save_task_status - ): - save_request.loading_task_status = save_task_status - must_save = True + if ( + loading_task_status is not None + and loading_task_status != save_request.loading_task_status + ): + must_save = True + save_request.loading_task_status = loading_task_status if must_save: save_request.save()