diff --git a/swh/web/common/archive.py b/swh/web/common/archive.py --- a/swh/web/common/archive.py +++ b/swh/web/common/archive.py @@ -2,7 +2,6 @@ # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information - from collections import defaultdict import itertools import os @@ -16,6 +15,7 @@ from swh.storage.algos import diff, revisions_walker from swh.storage.algos.origin import origin_get_latest_visit_status from swh.storage.algos.snapshot import snapshot_get_latest, snapshot_resolve_alias +from swh.storage.interface import ListOrder from swh.vault.exc import NotFoundExc as VaultNotFoundExc from swh.web import config from swh.web.common import converters, query @@ -1066,6 +1066,25 @@ return converters.from_origin_visit({**visit_status.to_dict(), "type": visit.type}) +def origin_visit_find_latest_status(origin_url: str,) -> Optional[OriginVisitInfo]: +    """Retrieve the most recent visit status for the given origin. + +    Args: +        origin_url: origin concerned by the visit, +            whose latest visit status is looked up + +    Returns: +        The most recent origin_visit_status dict if the origin has any visit,
+ + """ + visit_page = storage.origin_visit_get(origin_url, limit=1, order=ListOrder.DESC) + if not visit_page.results: + return None + visit = visit_page.results[0] + visit_status = storage.origin_visit_status_get_latest(origin_url, visit.visit) + return converters.from_origin_visit({**visit_status.to_dict(), "type": visit.type}) + + def lookup_snapshot_sizes( snapshot_id: str, branch_name_exclude_prefix: Optional[str] = "refs/pull/" ) -> Dict[str, int]: diff --git a/swh/web/common/origin_save.py b/swh/web/common/origin_save.py --- a/swh/web/common/origin_save.py +++ b/swh/web/common/origin_save.py @@ -1,9 +1,8 @@ -# Copyright (C) 2018-2021 The Software Heritage developers +# Copyright (C) 2018-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information -from bisect import bisect_right from datetime import datetime, timedelta, timezone from functools import lru_cache from itertools import product @@ -39,12 +38,7 @@ SaveOriginRequest, SaveUnauthorizedOrigin, ) -from swh.web.common.origin_visits import get_origin_visits -from swh.web.common.typing import ( - OriginExistenceCheckInfo, - OriginInfo, - SaveOriginRequestInfo, -) +from swh.web.common.typing import OriginExistenceCheckInfo, SaveOriginRequestInfo from swh.web.common.utils import SWH_WEB_METRICS_REGISTRY, parse_iso8601_date_to_utc from swh.web.config import get_config, scheduler @@ -285,16 +279,11 @@ # as those requests to storage are expensive and associated loading task # surely ended up with errors if time_delta.days <= MAX_THRESHOLD_DAYS: - try: - origin_info = archive.lookup_origin(OriginInfo(url=save_request.origin_url)) - origin_visits = get_origin_visits(origin_info) - visit_dates = [parse_iso8601_date_to_utc(v["date"]) for v in origin_visits] - i = bisect_right(visit_dates, save_request.request_date) - if i != 
len(visit_dates): - visit_date = visit_dates[i] - visit_status = origin_visits[i]["status"] - except Exception as exc: - sentry_sdk.capture_exception(exc) + origin = save_request.origin_url + ovs = archive.origin_visit_find_latest_status(origin) + if ovs: + visit_date = parse_iso8601_date_to_utc(ovs["date"]) + visit_status = ovs["status"] return visit_date, visit_status diff --git a/swh/web/tests/common/test_archive.py b/swh/web/tests/common/test_archive.py --- a/swh/web/tests/common/test_archive.py +++ b/swh/web/tests/common/test_archive.py @@ -1,9 +1,10 @@ -# Copyright (C) 2015-2021 The Software Heritage developers +# Copyright (C) 2015-2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict +import datetime import hashlib import itertools import random @@ -18,6 +19,7 @@ DirectoryEntry, Origin, OriginVisit, + OriginVisitStatus, Revision, Snapshot, SnapshotBranch, @@ -207,6 +209,84 @@ assert actual_origin_visit == expected_visit +@given(new_origin(), visit_dates()) +def test_origin_visit_find_latest_status_no_result( + archive_data, new_origin, visit_dates +): + """No visit registered in storage for an origin should return no visit""" + archive_data.origin_add([new_origin]) + + for visit_date in visit_dates: + # No visit yet, so nothing will get returned + actual_origin_visit_status = archive.origin_visit_find_latest_status( + new_origin.url + ) + assert actual_origin_visit_status is None + + +@given(new_origin(), visit_dates()) +def test_origin_visit_find_latest_status_only_created( + archive_data, new_origin, visit_dates +): + """Only created visits in storage for an origin should return those most recent""" + archive_data.origin_add([new_origin]) + + # Adding some visits + archive_data.origin_visit_add( + [ + OriginVisit( + origin=new_origin.url, + 
date=ts - datetime.timedelta(days=offset), + type="git", + ) + for offset, ts in enumerate(visit_dates) + ] + ) + # We will only find reference to created status since we have nothing else for now + actual_origin_visit_status = archive.origin_visit_find_latest_status(new_origin.url) + assert actual_origin_visit_status is not None + assert actual_origin_visit_status["status"] == "created" + + +@given(new_origin(), visit_dates()) +def test_origin_visit_find_latest_status_with_status( + subtest, archive_data, new_origin, visit_dates +): + """More exhaustive checks with visits with statuses""" + archive_data.origin_add([new_origin]) + # Add some visits + visits = archive_data.origin_visit_add( + [ + OriginVisit(origin=new_origin.url, date=ts, type="git",) + for ts in sorted(visit_dates) + ] + ) + # Then finalize some visits + # ovss = archive.lookup_origin_visits(new_origin.url, visit_date) + all_statuses = ["not_found", "failed", "partial", "full"] + # Add visit status on most recent visit + visit = visits[-1] + visit_statuses = [ + OriginVisitStatus( + origin=new_origin.url, + visit=visit.visit, + date=visit.date + datetime.timedelta(days=offset_day), + type=visit.type, + status=status, + snapshot=None, + ) + for (offset_day, status) in enumerate(all_statuses) + ] + archive_data.origin_visit_status_add(visit_statuses) + + # visit_date = max(ovs.date for ovs in visit_statuses) + expected_visit_status = archive.lookup_origin_visit(new_origin.url, visit.visit) + + actual_origin_visit_status = archive.origin_visit_find_latest_status(new_origin.url) + assert actual_origin_visit_status == expected_visit_status + assert actual_origin_visit_status["status"] == "full" + + @given(new_origin()) def test_lookup_origin(archive_data, new_origin): archive_data.origin_add([new_origin]) diff --git a/swh/web/tests/common/test_origin_save.py b/swh/web/tests/common/test_origin_save.py --- a/swh/web/tests/common/test_origin_save.py +++ b/swh/web/tests/common/test_origin_save.py @@ -215,9 
+215,6 @@ _fill_scheduler_db(swh_scheduler) mock_archive = mocker.patch("swh.web.common.origin_save.archive") mock_archive.lookup_origin.return_value = {"url": _origin_url} - mock_get_origin_visits = mocker.patch( - "swh.web.common.origin_save.get_origin_visits" - ) # create a visit for the save request visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info = OriginVisitInfo( @@ -231,18 +228,18 @@ url="", visit=34, ) - mock_get_origin_visits.return_value = [visit_info] + mock_archive.origin_visit_find_latest_status.return_value = visit_info # check visit date has been correctly found sors = get_save_origin_requests(_visit_type, _origin_url) assert len(sors) == 1 assert sors[0]["save_task_status"] == SAVE_TASK_SUCCEEDED assert sors[0]["visit_date"] == visit_date - mock_get_origin_visits.assert_called_once() + mock_archive.origin_visit_find_latest_status.assert_called_once() # check visit is not searched again when it has been found get_save_origin_requests(_visit_type, _origin_url) - mock_get_origin_visits.assert_called_once() + mock_archive.origin_visit_find_latest_status.assert_called_once() # check visit date are not searched for save requests older than # one month @@ -263,7 +260,7 @@ assert len(sors) == 2 assert sors[0]["save_task_status"] == SAVE_TASK_FAILED assert sors[0]["visit_date"] is None - mock_get_origin_visits.assert_called_once() + mock_archive.origin_visit_find_latest_status.assert_called_once() def _get_save_origin_requests( @@ -292,9 +289,6 @@ ) mock_archive = mocker.patch("swh.web.common.origin_save.archive") mock_archive.lookup_origin.return_value = {"url": _origin_url} - mock_get_origin_visits = mocker.patch( - "swh.web.common.origin_save.get_origin_visits" - ) # create a visit for the save request with status created visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info = OriginVisitInfo( @@ -308,10 +302,10 @@ url="", visit=34, ) - mock_get_origin_visits.return_value = [visit_info] + 
mock_archive.origin_visit_find_latest_status.return_value = visit_info sors = get_save_origin_requests(_visit_type, _origin_url) - mock_get_origin_visits.assert_called_once() + mock_archive.origin_visit_find_latest_status.assert_called_once() return sors @@ -587,9 +581,6 @@ ) mock_archive = mocker.patch("swh.web.common.origin_save.archive") mock_archive.lookup_origin.return_value = {"url": _origin_url} - mock_get_origin_visits = mocker.patch( - "swh.web.common.origin_save.get_origin_visits" - ) # create a visit for the save request with status created visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info = OriginVisitInfo( @@ -603,7 +594,7 @@ url="", visit=34, ) - mock_get_origin_visits.return_value = [visit_info] + mock_archive.origin_visit_find_latest_status.return_value = visit_info # make the scheduler return a running event _fill_scheduler_db( @@ -616,7 +607,11 @@ # The visit is detected but still running sors = refresh_save_origin_request_statuses() - assert mock_get_origin_visits.called and mock_get_origin_visits.call_count == 1 + assert ( + mock_archive.origin_visit_find_latest_status.called + and mock_archive.origin_visit_find_latest_status.call_count == 1 + ) + assert len(sors) == 1 for sor in sors: @@ -640,13 +635,16 @@ # (visit date and visit status in final state) visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info.update({"date": visit_date, "status": VISIT_STATUS_FULL}) - mock_get_origin_visits.return_value = [visit_info] + mock_archive.origin_visit_find_latest_status.return_value = visit_info # Detected entry, this time it should be updated sors = refresh_save_origin_request_statuses() assert len(sors) == 1 - assert mock_get_origin_visits.called and mock_get_origin_visits.call_count == 1 + 1 + assert ( + mock_archive.origin_visit_find_latest_status.called + and mock_archive.origin_visit_find_latest_status.call_count == 1 + 1 + ) for sor in sors: assert iso8601.parse_date(sor["save_request_date"]) >= date_pivot @@ -687,9 
+685,6 @@ ) mock_archive = mocker.patch("swh.web.common.origin_save.archive") mock_archive.lookup_origin.return_value = {"url": _origin_url} - mock_get_origin_visits = mocker.patch( - "swh.web.common.origin_save.get_origin_visits" - ) # create a visit for the save request with status created visit_date = datetime.now(tz=timezone.utc).isoformat() visit_info = OriginVisitInfo( @@ -703,7 +698,7 @@ url="", visit=34, ) - mock_get_origin_visits.return_value = [visit_info] + mock_archive.origin_visit_find_latest_status.return_value = visit_info # no changes so refresh does detect the entry but does nothing sors = refresh_save_origin_request_statuses() @@ -746,7 +741,7 @@ url="", visit=34, ) - mock_get_origin_visits.return_value = [visit_info] + mock_archive.origin_visit_find_latest_status.return_value = visit_info # Detected entry, this time it should be updated sors = refresh_save_origin_request_statuses()