diff --git a/swh/web/common/archive.py b/swh/web/common/archive.py --- a/swh/web/common/archive.py +++ b/swh/web/common/archive.py @@ -2,8 +2,8 @@ # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information - from collections import defaultdict +import datetime import itertools import os import re @@ -1066,6 +1066,26 @@ return converters.from_origin_visit({**visit_status.to_dict(), "type": visit.type}) +def origin_visit_status_find_by_date( + origin_url: str, visit_date: datetime.datetime +) -> Optional[OriginVisitInfo]: + """Retrieve the origin visit status whose date is closest to the provided timestamp. + + Args: + origin_url: origin concerned by the visit + visit_date: provided timestamp + + Returns: + The dict origin_visit_status concerned if any. + + """ + visit = storage.origin_visit_find_by_date(origin_url, visit_date) + if not visit: + return None + visit_status = storage.origin_visit_status_get_latest(origin_url, visit.visit_id) + return converters.from_origin_visit({**visit_status.to_dict(), "type": visit.type}) + + def lookup_snapshot_sizes( snapshot_id: str, branch_name_exclude_prefix: Optional[str] = "refs/pull/" ) -> Dict[str, int]: diff --git a/swh/web/common/origin_save.py b/swh/web/common/origin_save.py --- a/swh/web/common/origin_save.py +++ b/swh/web/common/origin_save.py @@ -1,9 +1,8 @@ -# Copyright (C) 2018-2021 The Software Heritage developers +# Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information -from bisect import bisect_right from datetime import datetime, timedelta, timezone from functools import lru_cache from itertools import product @@ -39,12 +38,7 @@ SaveOriginRequest, SaveUnauthorizedOrigin, ) -from swh.web.common.origin_visits import get_origin_visits -from swh.web.common.typing import ( - OriginExistenceCheckInfo, - OriginInfo, - SaveOriginRequestInfo, -) +from swh.web.common.typing import OriginExistenceCheckInfo, SaveOriginRequestInfo from swh.web.common.utils import SWH_WEB_METRICS_REGISTRY, parse_iso8601_date_to_utc from swh.web.config import get_config, scheduler @@ -285,16 +279,11 @@ # as those requests to storage are expensive and associated loading task # surely ended up with errors if time_delta.days <= MAX_THRESHOLD_DAYS: - try: - origin_info = archive.lookup_origin(OriginInfo(url=save_request.origin_url)) - origin_visits = get_origin_visits(origin_info) - visit_dates = [parse_iso8601_date_to_utc(v["date"]) for v in origin_visits] - i = bisect_right(visit_dates, save_request.request_date) - if i != len(visit_dates): - visit_date = visit_dates[i] - visit_status = origin_visits[i]["status"] - except Exception as exc: - sentry_sdk.capture_exception(exc) + origin = save_request.origin_url + ovs = archive.origin_visit_status_find_by_date(origin, time_now) + if ovs: + visit_date = parse_iso8601_date_to_utc(ovs["date"]) + visit_status = ovs["status"] return visit_date, visit_status diff --git a/swh/web/tests/common/test_origin_save.py b/swh/web/tests/common/test_origin_save.py --- a/swh/web/tests/common/test_origin_save.py +++ b/swh/web/tests/common/test_origin_save.py @@ -215,9 +215,6 @@ _fill_scheduler_db(swh_scheduler) mock_archive = mocker.patch("swh.web.common.origin_save.archive") mock_archive.lookup_origin.return_value = {"url": _origin_url} - mock_get_origin_visits = mocker.patch( - "swh.web.common.origin_save.get_origin_visits" - ) # create a visit for the save request visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info = OriginVisitInfo( @@ -231,18 +228,18 @@ url="", visit=34, ) - mock_get_origin_visits.return_value = [visit_info] + mock_archive.origin_visit_status_find_by_date.return_value = visit_info # check visit date has been correctly found sors = get_save_origin_requests(_visit_type, _origin_url) assert len(sors) == 1 assert sors[0]["save_task_status"] == SAVE_TASK_SUCCEEDED assert sors[0]["visit_date"] == visit_date - mock_get_origin_visits.assert_called_once() + mock_archive.origin_visit_status_find_by_date.assert_called_once() # check visit is not searched again when it has been found get_save_origin_requests(_visit_type, _origin_url) - mock_get_origin_visits.assert_called_once() + mock_archive.origin_visit_status_find_by_date.assert_called_once() # check visit date are not searched for save requests older than # one month @@ -263,7 +260,7 @@ assert len(sors) == 2 assert sors[0]["save_task_status"] == SAVE_TASK_FAILED assert sors[0]["visit_date"] is None - mock_get_origin_visits.assert_called_once() + mock_archive.origin_visit_status_find_by_date.assert_called_once() def _get_save_origin_requests( @@ -292,9 +289,6 @@ ) mock_archive = mocker.patch("swh.web.common.origin_save.archive") mock_archive.lookup_origin.return_value = {"url": _origin_url} - mock_get_origin_visits = mocker.patch( - "swh.web.common.origin_save.get_origin_visits" - ) # create a visit for the save request with status created visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info = OriginVisitInfo( @@ -308,10 +302,10 @@ url="", visit=34, ) - mock_get_origin_visits.return_value = [visit_info] + mock_archive.origin_visit_status_find_by_date.return_value = visit_info sors = get_save_origin_requests(_visit_type, _origin_url) - mock_get_origin_visits.assert_called_once() + mock_archive.origin_visit_status_find_by_date.assert_called_once() return sors @@ -587,9 +581,6 @@ ) mock_archive = mocker.patch("swh.web.common.origin_save.archive") mock_archive.lookup_origin.return_value = {"url": _origin_url} - mock_get_origin_visits = mocker.patch( - "swh.web.common.origin_save.get_origin_visits" - ) # create a visit for the save request with status created visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info = OriginVisitInfo( @@ -603,7 +594,7 @@ url="", visit=34, ) - mock_get_origin_visits.return_value = [visit_info] + mock_archive.origin_visit_status_find_by_date.return_value = visit_info # make the scheduler return a running event _fill_scheduler_db( @@ -616,7 +607,11 @@ # The visit is detected but still running sors = refresh_save_origin_request_statuses() - assert mock_get_origin_visits.called and mock_get_origin_visits.call_count == 1 + assert ( + mock_archive.origin_visit_status_find_by_date.called + and mock_archive.origin_visit_status_find_by_date.call_count == 1 + ) + assert len(sors) == 1 for sor in sors: @@ -640,13 +635,16 @@ # (visit date and visit status in final state) visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info.update({"date": visit_date, "status": VISIT_STATUS_FULL}) - mock_get_origin_visits.return_value = [visit_info] + mock_archive.origin_visit_status_find_by_date.return_value = visit_info # Detected entry, this time it should be updated sors = refresh_save_origin_request_statuses() assert len(sors) == 1 - assert mock_get_origin_visits.called and mock_get_origin_visits.call_count == 1 + 1 + assert ( + mock_archive.origin_visit_status_find_by_date.called + and mock_archive.origin_visit_status_find_by_date.call_count == 1 + 1 + ) for sor in sors: assert iso8601.parse_date(sor["save_request_date"]) >= date_pivot @@ -687,9 +685,6 @@ ) mock_archive = mocker.patch("swh.web.common.origin_save.archive") mock_archive.lookup_origin.return_value = {"url": _origin_url} - mock_get_origin_visits = mocker.patch( - "swh.web.common.origin_save.get_origin_visits" - ) # create a visit for the save request with status created visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info = OriginVisitInfo( @@ -703,7 +698,7 @@ url="", visit=34, ) - mock_get_origin_visits.return_value = [visit_info] + mock_archive.origin_visit_status_find_by_date.return_value = visit_info # no changes so refresh does detect the entry but does nothing sors = refresh_save_origin_request_statuses() @@ -746,7 +741,7 @@ url="", visit=34, ) - mock_get_origin_visits.return_value = [visit_info] + mock_archive.origin_visit_status_find_by_date.return_value = visit_info # Detected entry, this time it should be updated sors = refresh_save_origin_request_statuses()