Page MenuHomeSoftware Heritage

D7078.id.diff
No OneTemporary

D7078.id.diff

diff --git a/swh/web/common/archive.py b/swh/web/common/archive.py
--- a/swh/web/common/archive.py
+++ b/swh/web/common/archive.py
@@ -2,8 +2,8 @@
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
-
from collections import defaultdict
+import datetime
import itertools
import os
import re
@@ -1066,6 +1066,33 @@
return converters.from_origin_visit({**visit_status.to_dict(), "type": visit.type})
+def origin_visit_find_by_date(
+ origin_url: str, visit_date: datetime.datetime
+) -> Optional[OriginVisitInfo]:
+ """Retrieve the origin visit status whose date is more recent than the provided visit_date.
+
+ Args:
+ origin_url: origin concerned by the visit
+ visit_date: provided visit date
+
+ Returns:
+ The origin visit status dict matching the criteria, if any.
+
+ """
+ visit = storage.origin_visit_find_by_date(origin_url, visit_date)
+ if visit and visit.date < visit_date:
+ # when the visit is anterior to the provided date, try to find a more
+ # recent one
+ visits = storage.origin_visit_get(
+ origin_url, page_token=str(visit.visit), limit=1,
+ ).results
+ visit = visits[0] if visits else None
+ if not visit:
+ return None
+ visit_status = storage.origin_visit_status_get_latest(origin_url, visit.visit)
+ return converters.from_origin_visit({**visit_status.to_dict(), "type": visit.type})
+
+
def lookup_snapshot_sizes(
snapshot_id: str, branch_name_exclude_prefix: Optional[str] = "refs/pull/"
) -> Dict[str, int]:
diff --git a/swh/web/common/origin_save.py b/swh/web/common/origin_save.py
--- a/swh/web/common/origin_save.py
+++ b/swh/web/common/origin_save.py
@@ -1,9 +1,8 @@
-# Copyright (C) 2018-2021 The Software Heritage developers
+# Copyright (C) 2018-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from bisect import bisect_right
from datetime import datetime, timedelta, timezone
from functools import lru_cache
from itertools import product
@@ -39,12 +38,7 @@
SaveOriginRequest,
SaveUnauthorizedOrigin,
)
-from swh.web.common.origin_visits import get_origin_visits
-from swh.web.common.typing import (
- OriginExistenceCheckInfo,
- OriginInfo,
- SaveOriginRequestInfo,
-)
+from swh.web.common.typing import OriginExistenceCheckInfo, SaveOriginRequestInfo
from swh.web.common.utils import SWH_WEB_METRICS_REGISTRY, parse_iso8601_date_to_utc
from swh.web.config import get_config, scheduler
@@ -285,16 +279,11 @@
# as those requests to storage are expensive and associated loading task
# surely ended up with errors
if time_delta.days <= MAX_THRESHOLD_DAYS:
- try:
- origin_info = archive.lookup_origin(OriginInfo(url=save_request.origin_url))
- origin_visits = get_origin_visits(origin_info)
- visit_dates = [parse_iso8601_date_to_utc(v["date"]) for v in origin_visits]
- i = bisect_right(visit_dates, save_request.request_date)
- if i != len(visit_dates):
- visit_date = visit_dates[i]
- visit_status = origin_visits[i]["status"]
- except Exception as exc:
- sentry_sdk.capture_exception(exc)
+ origin = save_request.origin_url
+ ovs = archive.origin_visit_find_by_date(origin, save_request.request_date)
+ if ovs:
+ visit_date = parse_iso8601_date_to_utc(ovs["date"])
+ visit_status = ovs["status"]
return visit_date, visit_status
diff --git a/swh/web/tests/common/test_archive.py b/swh/web/tests/common/test_archive.py
--- a/swh/web/tests/common/test_archive.py
+++ b/swh/web/tests/common/test_archive.py
@@ -1,14 +1,15 @@
-# Copyright (C) 2015-2021 The Software Heritage developers
+# Copyright (C) 2015-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
from collections import defaultdict
+import datetime
import hashlib
import itertools
import random
-from hypothesis import given
+from hypothesis import given, settings
import pytest
from swh.model.from_disk import DentryPerms
@@ -18,6 +19,7 @@
DirectoryEntry,
Origin,
OriginVisit,
+ OriginVisitStatus,
Revision,
Snapshot,
SnapshotBranch,
@@ -207,6 +209,74 @@
assert actual_origin_visit == expected_visit
+@given(new_origin(), visit_dates())
+@settings(max_examples=1)
+def test_origin_visit_find_by_date_no_result(archive_data, new_origin, visit_dates):
+ """An origin with no visit registered in storage should return no visit"""
+ archive_data.origin_add([new_origin])
+
+ for visit_date in visit_dates:
+ # No visit yet, so nothing will get returned
+ actual_origin_visit_status = archive.origin_visit_find_by_date(
+ new_origin.url, visit_date
+ )
+ assert actual_origin_visit_status is None
+
+
+@settings(max_examples=1)
+@given(new_origin(), visit_dates())
+def test_origin_visit_find_by_date_with_status(
+ subtest, archive_data, new_origin, visit_dates
+):
+ """More exhaustive checks for visits with statuses"""
+ archive_data.origin_add([new_origin])
+ # Add some visits
+ visits = archive_data.origin_visit_add(
+ [
+ OriginVisit(origin=new_origin.url, date=ts, type="git",)
+ for ts in sorted(visit_dates)
+ ]
+ )
+ # Then finalize some visits
+ # ovss = archive.lookup_origin_visits(new_origin.url, visit_date)
+ all_statuses = ["not_found", "failed", "partial", "full"]
+ # Add visit status on most recent visit
+ visit = visits[-1]
+ visit_statuses = [
+ OriginVisitStatus(
+ origin=new_origin.url,
+ visit=visit.visit,
+ date=visit.date + datetime.timedelta(days=offset_day),
+ type=visit.type,
+ status=status,
+ snapshot=None,
+ )
+ for (offset_day, status) in enumerate(all_statuses)
+ ]
+ archive_data.origin_visit_status_add(visit_statuses)
+
+ last_visit = max(visits, key=lambda v: v.date)
+
+ # visit_date = max(ovs.date for ovs in visit_statuses)
+ expected_visit_status = archive.lookup_origin_visit(new_origin.url, visit.visit)
+
+ for offset_min in [-1, 0, 1]:
+ visit_date = last_visit.date + datetime.timedelta(minutes=offset_min)
+
+ if offset_min == -1:
+ assert last_visit.date > visit_date
+ if offset_min == 0:
+ assert last_visit.date == visit_date
+ if offset_min == 1:
+ assert last_visit.date < visit_date
+
+ actual_origin_visit_status = archive.origin_visit_find_by_date(
+ new_origin.url, visit_date
+ )
+ assert actual_origin_visit_status == expected_visit_status
+ assert actual_origin_visit_status["status"] == "full"
+
+
@given(new_origin())
def test_lookup_origin(archive_data, new_origin):
archive_data.origin_add([new_origin])
diff --git a/swh/web/tests/common/test_origin_save.py b/swh/web/tests/common/test_origin_save.py
--- a/swh/web/tests/common/test_origin_save.py
+++ b/swh/web/tests/common/test_origin_save.py
@@ -215,9 +215,6 @@
_fill_scheduler_db(swh_scheduler)
mock_archive = mocker.patch("swh.web.common.origin_save.archive")
mock_archive.lookup_origin.return_value = {"url": _origin_url}
- mock_get_origin_visits = mocker.patch(
- "swh.web.common.origin_save.get_origin_visits"
- )
# create a visit for the save request
visit_date = datetime.now(tz=timezone.utc).isoformat()
visit_info = OriginVisitInfo(
@@ -231,18 +228,18 @@
url="",
visit=34,
)
- mock_get_origin_visits.return_value = [visit_info]
+ mock_archive.origin_visit_find_by_date.return_value = visit_info
# check visit date has been correctly found
sors = get_save_origin_requests(_visit_type, _origin_url)
assert len(sors) == 1
assert sors[0]["save_task_status"] == SAVE_TASK_SUCCEEDED
assert sors[0]["visit_date"] == visit_date
- mock_get_origin_visits.assert_called_once()
+ mock_archive.origin_visit_find_by_date.assert_called_once()
# check visit is not searched again when it has been found
get_save_origin_requests(_visit_type, _origin_url)
- mock_get_origin_visits.assert_called_once()
+ mock_archive.origin_visit_find_by_date.assert_called_once()
# check visit date are not searched for save requests older than
# one month
@@ -263,7 +260,7 @@
assert len(sors) == 2
assert sors[0]["save_task_status"] == SAVE_TASK_FAILED
assert sors[0]["visit_date"] is None
- mock_get_origin_visits.assert_called_once()
+ mock_archive.origin_visit_find_by_date.assert_called_once()
def _get_save_origin_requests(
@@ -292,9 +289,6 @@
)
mock_archive = mocker.patch("swh.web.common.origin_save.archive")
mock_archive.lookup_origin.return_value = {"url": _origin_url}
- mock_get_origin_visits = mocker.patch(
- "swh.web.common.origin_save.get_origin_visits"
- )
# create a visit for the save request with status created
visit_date = datetime.now(tz=timezone.utc).isoformat()
visit_info = OriginVisitInfo(
@@ -308,10 +302,10 @@
url="",
visit=34,
)
- mock_get_origin_visits.return_value = [visit_info]
+ mock_archive.origin_visit_find_by_date.return_value = visit_info
sors = get_save_origin_requests(_visit_type, _origin_url)
- mock_get_origin_visits.assert_called_once()
+ mock_archive.origin_visit_find_by_date.assert_called_once()
return sors
@@ -587,9 +581,6 @@
)
mock_archive = mocker.patch("swh.web.common.origin_save.archive")
mock_archive.lookup_origin.return_value = {"url": _origin_url}
- mock_get_origin_visits = mocker.patch(
- "swh.web.common.origin_save.get_origin_visits"
- )
# create a visit for the save request with status created
visit_date = datetime.now(tz=timezone.utc).isoformat()
visit_info = OriginVisitInfo(
@@ -603,7 +594,7 @@
url="",
visit=34,
)
- mock_get_origin_visits.return_value = [visit_info]
+ mock_archive.origin_visit_find_by_date.return_value = visit_info
# make the scheduler return a running event
_fill_scheduler_db(
@@ -616,7 +607,11 @@
# The visit is detected but still running
sors = refresh_save_origin_request_statuses()
- assert mock_get_origin_visits.called and mock_get_origin_visits.call_count == 1
+ assert (
+ mock_archive.origin_visit_find_by_date.called
+ and mock_archive.origin_visit_find_by_date.call_count == 1
+ )
+
assert len(sors) == 1
for sor in sors:
@@ -640,13 +635,16 @@
# (visit date and visit status in final state)
visit_date = datetime.now(tz=timezone.utc).isoformat()
visit_info.update({"date": visit_date, "status": VISIT_STATUS_FULL})
- mock_get_origin_visits.return_value = [visit_info]
+ mock_archive.origin_visit_find_by_date.return_value = visit_info
# Detected entry, this time it should be updated
sors = refresh_save_origin_request_statuses()
assert len(sors) == 1
- assert mock_get_origin_visits.called and mock_get_origin_visits.call_count == 1 + 1
+ assert (
+ mock_archive.origin_visit_find_by_date.called
+ and mock_archive.origin_visit_find_by_date.call_count == 1 + 1
+ )
for sor in sors:
assert iso8601.parse_date(sor["save_request_date"]) >= date_pivot
@@ -687,9 +685,6 @@
)
mock_archive = mocker.patch("swh.web.common.origin_save.archive")
mock_archive.lookup_origin.return_value = {"url": _origin_url}
- mock_get_origin_visits = mocker.patch(
- "swh.web.common.origin_save.get_origin_visits"
- )
# create a visit for the save request with status created
visit_date = datetime.now(tz=timezone.utc).isoformat()
visit_info = OriginVisitInfo(
@@ -703,7 +698,7 @@
url="",
visit=34,
)
- mock_get_origin_visits.return_value = [visit_info]
+ mock_archive.origin_visit_find_by_date.return_value = visit_info
# no changes so refresh does detect the entry but does nothing
sors = refresh_save_origin_request_statuses()
@@ -746,7 +741,7 @@
url="",
visit=34,
)
- mock_get_origin_visits.return_value = [visit_info]
+ mock_archive.origin_visit_find_by_date.return_value = visit_info
# Detected entry, this time it should be updated
sors = refresh_save_origin_request_statuses()

File Metadata

Mime Type
text/plain
Expires
Jan 30 2025, 11:06 AM (6 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3220380

Event Timeline