diff --git a/swh/web/common/origin_save.py b/swh/web/common/origin_save.py --- a/swh/web/common/origin_save.py +++ b/swh/web/common/origin_save.py @@ -33,6 +33,8 @@ SAVE_TASK_RUNNING, SAVE_TASK_SCHEDULED, SAVE_TASK_SUCCEEDED, + VISIT_STATUS_CREATED, + VISIT_STATUS_ONGOING, SaveAuthorizedOrigin, SaveOriginRequest, SaveUnauthorizedOrigin, @@ -268,45 +270,57 @@ if i != len(visit_dates): visit_date = visit_dates[i] visit_status = origin_visits[i]["status"] - if visit_status not in ("full", "partial", "not_found"): - visit_date = None + # if visit_status not in ("full", "partial", "not_found"): + # visit_date = None except Exception as exc: sentry_sdk.capture_exception(exc) return visit_date, visit_status def _check_visit_update_status( - save_request: SaveOriginRequest, save_task_status: str -) -> Tuple[Optional[datetime], str]: - """Given a save request and a save task status, determine whether a save request was - successful or failed. + save_request: SaveOriginRequest, +) -> Tuple[Optional[datetime], Optional[str], Optional[str]]: + """Given a save request, determine whether a save request was successful or failed. Args: save_request: Input save origin request to retrieve information for. Returns: - Tuple of (optional visit date, save task status) for such save request origin + Tuple of (optional visit date, optional visit status, optional save task status) + for such save request origin """ visit_date, visit_status = _get_visit_info_for_save_request(save_request) - save_request.visit_date = visit_date - save_request.visit_status = visit_status + loading_task_status = None if visit_date and visit_status in ("full", "partial"): # visit has been performed, mark the saving task as succeeded - save_task_status = SAVE_TASK_SUCCEEDED + loading_task_status = SAVE_TASK_SUCCEEDED elif visit_status in ("created", "ongoing"): # visit is currently running - save_task_status = SAVE_TASK_RUNNING + loading_task_status = SAVE_TASK_RUNNING elif visit_status in ("not_found", "failed"): - save_task_status = SAVE_TASK_FAILED + loading_task_status = SAVE_TASK_FAILED else: time_now = datetime.now(tz=timezone.utc) time_delta = time_now - save_request.request_date # consider the task as failed if it is still in scheduled state # 30 days after its submission if time_delta.days > MAX_THRESHOLD_DAYS: - save_task_status = SAVE_TASK_FAILED - return visit_date, save_task_status + loading_task_status = SAVE_TASK_FAILED + return visit_date, visit_status, loading_task_status + + +def _compute_task_loading_status( + task: Optional[Dict[str, Any]] = None, task_run: Optional[Dict[str, Any]] = None, +) -> Optional[str]: + loading_task_status: Optional[str] = None + # First determine the loading task status out of task information + if task: + loading_task_status = _save_task_status[task["status"]] + if task_run: + loading_task_status = _save_task_run_status[task_run["status"]] + + return loading_task_status def _update_save_request_info( @@ -326,53 +340,32 @@ """ must_save = False - visit_date = save_request.visit_date - # save task still in scheduler db - if task: - save_task_status = _save_task_status[task["status"]] - if task_run: - save_task_status = _save_task_run_status[task_run["status"]] + # To determine the save code now request's full status, the visit date and visit + # status must be retrieved and set. Once they do, the save code now is definitely + # done. + if not save_request.visit_date or not save_request.visit_status: + visit_date, visit_status, loading_task_status = _check_visit_update_status( + save_request + ) - # Consider request from which a visit date has already been found - # as succeeded to avoid retrieving it again - if save_task_status == SAVE_TASK_SCHEDULED and visit_date: - save_task_status = SAVE_TASK_SUCCEEDED - if ( - save_task_status in (SAVE_TASK_FAILED, SAVE_TASK_SUCCEEDED) - and not visit_date - ): - visit_date, visit_status = _get_visit_info_for_save_request(save_request) - save_request.visit_date = visit_date - save_request.visit_status = visit_status - if visit_status in ("failed", "not_found"): - save_task_status = SAVE_TASK_FAILED - must_save = True - # Check tasks still marked as scheduled / not yet scheduled - if save_task_status in (SAVE_TASK_SCHEDULED, SAVE_TASK_NOT_YET_SCHEDULED): - visit_date, save_task_status = _check_visit_update_status( - save_request, save_task_status - ) + if not loading_task_status: # fallback when not provided + loading_task_status = _compute_task_loading_status(task, task_run) - # save task may have been archived - else: - save_task_status = save_request.loading_task_status - if save_task_status in (SAVE_TASK_SCHEDULED, SAVE_TASK_NOT_YET_SCHEDULED): - visit_date, save_task_status = _check_visit_update_status( - save_request, save_task_status - ) + if visit_date != save_request.visit_date: + must_save = True + save_request.visit_date = visit_date - else: - save_task_status = save_request.loading_task_status + if visit_status != save_request.visit_status: + must_save = True + save_request.visit_status = visit_status - if ( - # avoid to override final loading task status when already found - # as visit status is no longer checked once a visit date has been found - save_request.loading_task_status not in (SAVE_TASK_FAILED, SAVE_TASK_SUCCEEDED) - and save_request.loading_task_status != save_task_status - ): - save_request.loading_task_status = save_task_status - must_save = True + if ( + loading_task_status is not None + and loading_task_status != save_request.loading_task_status + ): + must_save = True + save_request.loading_task_status = loading_task_status if must_save: save_request.save() @@ -619,7 +612,9 @@ # Retrieve accepted request statuses (all statuses) Q(status=SAVE_REQUEST_ACCEPTED), # those without the required information we need to update - Q(visit_date__isnull=True) | Q(visit_status__isnull=True), + Q(visit_date__isnull=True) + | Q(visit_status__isnull=True) + | Q(visit_status__in=[VISIT_STATUS_CREATED, VISIT_STATUS_ONGOING,]), # limit results to recent ones (that is roughly 30 days old at best) Q(request_date__gte=pivot_date), ) diff --git a/swh/web/tests/common/test_origin_save.py b/swh/web/tests/common/test_origin_save.py --- a/swh/web/tests/common/test_origin_save.py +++ b/swh/web/tests/common/test_origin_save.py @@ -480,7 +480,7 @@ # check no visit date has been found assert len(sors) == 1 assert sors[0]["save_task_status"] == SAVE_TASK_RUNNING - assert sors[0]["visit_date"] is None + assert sors[0]["visit_date"] is not None assert sors[0]["visit_status"] == visit_status @@ -497,10 +497,7 @@ assert sors[0]["save_task_status"] == SAVE_TASK_FAILED visit_date = sors[0]["visit_date"] - if visit_status == "failed": - assert visit_date is None - else: - assert visit_date is not None + assert visit_date is not None sors = get_save_origin_requests(_visit_type, _origin_url) assert len(sors) == 1 @@ -542,7 +539,7 @@ assert sors[0]["save_task_status"] == SAVE_TASK_SUCCEEDED # As the entry is missing the following information though - assert sors[0]["visit_date"] is None + assert sors[0]["visit_date"] is not None assert sors[0]["visit_status"] is None # It's still detected as to be updated by the refresh routine @@ -550,7 +547,7 @@ assert len(sors) == 1 assert sors[0]["save_task_status"] == SAVE_TASK_SUCCEEDED - assert sors[0]["visit_date"] is None + assert sors[0]["visit_date"] is not None assert sors[0]["visit_status"] is None @@ -585,7 +582,7 @@ # The status is updated assert sor["save_task_status"] == SAVE_TASK_RUNNING # but the following entries are missing so it's not updated - assert sor["visit_date"] is None + assert sor["visit_date"] is not None assert sor["visit_status"] == "created" # make the visit status completed @@ -624,7 +621,7 @@ assert iso8601.parse_date(sor["save_request_date"]) >= date_pivot # as it turns out, in this test, this won't update anything as no new status got # returned by the scheduler - assert sor["save_task_status"] == SAVE_TASK_SUCCEEDED + assert sor["save_task_status"] == SAVE_TASK_RUNNING assert sor["visit_date"] == visit_date assert sor["visit_status"] == "full" @@ -656,7 +653,7 @@ # returned by the scheduler assert sor["save_task_status"] == SAVE_TASK_SCHEDULED # Information is empty - assert sor["visit_date"] is None + assert sor["visit_date"] is not None assert sor["visit_status"] is None # make the scheduler return eventful event for that origin @@ -670,7 +667,7 @@ # The status is updated assert sor["save_task_status"] == SAVE_TASK_SUCCEEDED # but the following entries are missing so it's not updated - assert sor["visit_date"] is None + assert sor["visit_date"] is not None assert sor["visit_status"] is None # This time around, the origin returned will have all information updated