diff --git a/swh/web/common/archive.py b/swh/web/common/archive.py --- a/swh/web/common/archive.py +++ b/swh/web/common/archive.py @@ -2,8 +2,8 @@ # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information - from collections import defaultdict +import datetime import itertools import os import re @@ -1066,6 +1066,26 @@ return converters.from_origin_visit({**visit_status.to_dict(), "type": visit.type}) +def origin_visit_find_latest_status_by_date( + origin_url: str, visit_date: datetime.datetime +) -> Optional[OriginVisitInfo]: + """Retrieve the origin visit status whose date is closest to the provided timestamp. + + Args: + origin_url: origin concerned by the visit + visit_date: provided timestamp + + Returns: + The dict origin_visit_status concerned if any. + + """ + visit = storage.origin_visit_find_by_date(origin_url, visit_date) + if not visit: + return None + visit_status = storage.origin_visit_status_get_latest(origin_url, visit.visit) + return converters.from_origin_visit({**visit_status.to_dict(), "type": visit.type}) + + def lookup_snapshot_sizes( snapshot_id: str, branch_name_exclude_prefix: Optional[str] = "refs/pull/" ) -> Dict[str, int]: diff --git a/swh/web/common/origin_save.py b/swh/web/common/origin_save.py --- a/swh/web/common/origin_save.py +++ b/swh/web/common/origin_save.py @@ -1,9 +1,8 @@ -# Copyright (C) 2018-2021 The Software Heritage developers +# Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information -from bisect import bisect_right from datetime import datetime, timedelta, timezone from functools import lru_cache from itertools import product @@ -39,12 +38,7 @@ SaveOriginRequest, SaveUnauthorizedOrigin, ) -from swh.web.common.origin_visits import get_origin_visits -from swh.web.common.typing import ( - OriginExistenceCheckInfo, - OriginInfo, - SaveOriginRequestInfo, -) +from swh.web.common.typing import OriginExistenceCheckInfo, SaveOriginRequestInfo from swh.web.common.utils import SWH_WEB_METRICS_REGISTRY, parse_iso8601_date_to_utc from swh.web.config import get_config, scheduler @@ -285,16 +279,11 @@ # as those requests to storage are expensive and associated loading task # surely ended up with errors if time_delta.days <= MAX_THRESHOLD_DAYS: - try: - origin_info = archive.lookup_origin(OriginInfo(url=save_request.origin_url)) - origin_visits = get_origin_visits(origin_info) - visit_dates = [parse_iso8601_date_to_utc(v["date"]) for v in origin_visits] - i = bisect_right(visit_dates, save_request.request_date) - if i != len(visit_dates): - visit_date = visit_dates[i] - visit_status = origin_visits[i]["status"] - except Exception as exc: - sentry_sdk.capture_exception(exc) + origin = save_request.origin_url + ovs = archive.origin_visit_find_latest_status_by_date(origin, time_now) + if ovs: + visit_date = parse_iso8601_date_to_utc(ovs["date"]) + visit_status = ovs["status"] return visit_date, visit_status diff --git a/swh/web/tests/common/test_archive.py b/swh/web/tests/common/test_archive.py --- a/swh/web/tests/common/test_archive.py +++ b/swh/web/tests/common/test_archive.py @@ -1,9 +1,9 @@ -# Copyright (C) 2015-2021 The Software Heritage developers +# Copyright (C) 2015-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information - from collections import defaultdict +import datetime import hashlib import itertools import random @@ -18,6 +18,7 @@ DirectoryEntry, Origin, OriginVisit, + OriginVisitStatus, Revision, Snapshot, SnapshotBranch, @@ -207,6 +208,94 @@ assert actual_origin_visit == expected_visit +@given(new_origin(), visit_dates()) +def test_origin_visit_find_latest_status_by_date_no_result( + archive_data, new_origin, visit_dates +): + """No visit registered in storage for an origin should return no visit""" + archive_data.origin_add([new_origin]) + + for visit_date in visit_dates: + # No visit yet, so nothing will get returned + actual_origin_visit_status = archive.origin_visit_find_latest_status_by_date( + new_origin.url, visit_date + ) + assert actual_origin_visit_status is None + + +@given(new_origin(), visit_dates()) +def test_origin_visit_find_latest_status_by_date_only_created( + archive_data, new_origin, visit_dates +): + """Only created visits in storage for an origin should return those most recent""" + archive_data.origin_add([new_origin]) + + # Adding some visits + archive_data.origin_visit_add( + [ + OriginVisit( + origin=new_origin.url, + date=ts - datetime.timedelta(days=offset), + type="git", + ) + for offset, ts in enumerate(visit_dates) + ] + ) + # We will only find reference to created status since we have nothing else for now + actual_origin_visit_status = archive.origin_visit_find_latest_status_by_date( + new_origin.url, max(visit_dates) + ) + assert actual_origin_visit_status is not None + assert actual_origin_visit_status["status"] == "created" + + +@given(new_origin(), visit_dates()) +def test_origin_visit_find_latest_status_by_date_with_status( + subtest, archive_data, new_origin, visit_dates +): + """More exhaustive checks with visits with statuses""" + archive_data.origin_add([new_origin]) + # Add some visits + visits = archive_data.origin_visit_add( + [ + OriginVisit( + origin=new_origin.url, + date=ts - datetime.timedelta(days=offset) if offset != 0 else ts, + type="git", + ) + for offset, ts in enumerate(visit_dates) + ] + ) + # Then finalize some visits + # ovss = archive.lookup_origin_visits(new_origin.url, visit_date) + all_statuses = ["not_found", "failed", "partial", "full"] + # Add an origin visit status + visit = visits[0] + visit_statuses = [ + OriginVisitStatus( + origin=new_origin.url, + visit=visit.visit, + date=visit.date + datetime.timedelta(days=offset_day), + type=visit.type, + status=status, + snapshot=None, + ) + for (offset_day, status) in enumerate(all_statuses) + ] + archive_data.origin_visit_status_add(visit_statuses) + + # visit_date = max(ovs.date for ovs in visit_statuses) + expected_visit_status = archive.lookup_origin_visit(new_origin.url, visit.visit) + + for offset_day, status in enumerate(all_statuses): + visit_date = visit.date + datetime.timedelta(days=offset_day) + actual_origin_visit_status = archive.origin_visit_find_latest_status_by_date( + new_origin.url, visit_date + ) + assert actual_origin_visit_status == expected_visit_status + assert expected_visit_status["status"] == "full" + + @given(new_origin()) def test_lookup_origin(archive_data, new_origin): archive_data.origin_add([new_origin]) diff --git a/swh/web/tests/common/test_origin_save.py b/swh/web/tests/common/test_origin_save.py --- a/swh/web/tests/common/test_origin_save.py +++ b/swh/web/tests/common/test_origin_save.py @@ -215,9 +215,6 @@ _fill_scheduler_db(swh_scheduler) mock_archive = mocker.patch("swh.web.common.origin_save.archive") mock_archive.lookup_origin.return_value = {"url": _origin_url} - mock_get_origin_visits = mocker.patch( - "swh.web.common.origin_save.get_origin_visits" - ) # create a visit for the save request visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info = OriginVisitInfo( @@ -231,18 +228,18 @@ url="", visit=34, ) - mock_get_origin_visits.return_value = [visit_info] + mock_archive.origin_visit_find_latest_status_by_date.return_value = visit_info # check visit date has been correctly found sors = get_save_origin_requests(_visit_type, _origin_url) assert len(sors) == 1 assert sors[0]["save_task_status"] == SAVE_TASK_SUCCEEDED assert sors[0]["visit_date"] == visit_date - mock_get_origin_visits.assert_called_once() + mock_archive.origin_visit_find_latest_status_by_date.assert_called_once() # check visit is not searched again when it has been found get_save_origin_requests(_visit_type, _origin_url) - mock_get_origin_visits.assert_called_once() + mock_archive.origin_visit_find_latest_status_by_date.assert_called_once() # check visit date are not searched for save requests older than # one month @@ -263,7 +260,7 @@ assert len(sors) == 2 assert sors[0]["save_task_status"] == SAVE_TASK_FAILED assert sors[0]["visit_date"] is None - mock_get_origin_visits.assert_called_once() + mock_archive.origin_visit_find_latest_status_by_date.assert_called_once() def _get_save_origin_requests( @@ -292,9 +289,6 @@ ) mock_archive = mocker.patch("swh.web.common.origin_save.archive") mock_archive.lookup_origin.return_value = {"url": _origin_url} - mock_get_origin_visits = mocker.patch( - "swh.web.common.origin_save.get_origin_visits" - ) # create a visit for the save request with status created visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info = OriginVisitInfo( @@ -308,10 +302,10 @@ url="", visit=34, ) - mock_get_origin_visits.return_value = [visit_info] + mock_archive.origin_visit_find_latest_status_by_date.return_value = visit_info sors = get_save_origin_requests(_visit_type, _origin_url) - mock_get_origin_visits.assert_called_once() + mock_archive.origin_visit_find_latest_status_by_date.assert_called_once() return sors @@ -587,9 +581,6 @@ ) mock_archive = mocker.patch("swh.web.common.origin_save.archive") mock_archive.lookup_origin.return_value = {"url": _origin_url} - mock_get_origin_visits = mocker.patch( - "swh.web.common.origin_save.get_origin_visits" - ) # create a visit for the save request with status created visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info = OriginVisitInfo( @@ -603,7 +594,7 @@ url="", visit=34, ) - mock_get_origin_visits.return_value = [visit_info] + mock_archive.origin_visit_find_latest_status_by_date.return_value = visit_info # make the scheduler return a running event _fill_scheduler_db( @@ -616,7 +607,11 @@ # The visit is detected but still running sors = refresh_save_origin_request_statuses() - assert mock_get_origin_visits.called and mock_get_origin_visits.call_count == 1 + assert ( + mock_archive.origin_visit_find_latest_status_by_date.called + and mock_archive.origin_visit_find_latest_status_by_date.call_count == 1 + ) + assert len(sors) == 1 for sor in sors: @@ -640,13 +635,16 @@ # (visit date and visit status in final state) visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info.update({"date": visit_date, "status": VISIT_STATUS_FULL}) - mock_get_origin_visits.return_value = [visit_info] + mock_archive.origin_visit_find_latest_status_by_date.return_value = visit_info # Detected entry, this time it should be updated sors = refresh_save_origin_request_statuses() assert len(sors) == 1 - assert mock_get_origin_visits.called and mock_get_origin_visits.call_count == 1 + 1 + assert ( + mock_archive.origin_visit_find_latest_status_by_date.called + and mock_archive.origin_visit_find_latest_status_by_date.call_count == 1 + 1 + ) for sor in sors: assert iso8601.parse_date(sor["save_request_date"]) >= date_pivot @@ -687,9 +685,6 @@ ) mock_archive = mocker.patch("swh.web.common.origin_save.archive") mock_archive.lookup_origin.return_value = {"url": _origin_url} - mock_get_origin_visits = mocker.patch( - "swh.web.common.origin_save.get_origin_visits" - ) # create a visit for the save request with status created visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info = OriginVisitInfo( @@ -703,7 +698,7 @@ url="", visit=34, ) - mock_get_origin_visits.return_value = [visit_info] + mock_archive.origin_visit_find_latest_status_by_date.return_value = visit_info # no changes so refresh does detect the entry but does nothing sors = refresh_save_origin_request_statuses() @@ -746,7 +741,7 @@ url="", visit=34, ) - mock_get_origin_visits.return_value = [visit_info] + mock_archive.origin_visit_find_latest_status_by_date.return_value = visit_info # Detected entry, this time it should be updated sors = refresh_save_origin_request_statuses()