Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7163597
D7078.id.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
12 KB
Subscribers
None
D7078.id.diff
View Options
diff --git a/swh/web/common/archive.py b/swh/web/common/archive.py
--- a/swh/web/common/archive.py
+++ b/swh/web/common/archive.py
@@ -2,8 +2,8 @@
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
-
from collections import defaultdict
+import datetime
import itertools
import os
import re
@@ -1066,6 +1066,33 @@
return converters.from_origin_visit({**visit_status.to_dict(), "type": visit.type})
+def origin_visit_find_by_date(
+ origin_url: str, visit_date: datetime.datetime
+) -> Optional[OriginVisitInfo]:
+    """Retrieve the origin visit status whose visit date is the same as or more recent than the provided visit_date.
+
+ Args:
+ origin_url: origin concerned by the visit
+ visit_date: provided visit date
+
+ Returns:
+ The dict origin_visit_status matching the criteria if any.
+
+ """
+ visit = storage.origin_visit_find_by_date(origin_url, visit_date)
+ if visit and visit.date < visit_date:
+        # The returned visit is anterior to the provided date, so try to find
+        # the next, more recent, visit instead
+ visits = storage.origin_visit_get(
+ origin_url, page_token=str(visit.visit), limit=1,
+ ).results
+ visit = visits[0] if visits else None
+ if not visit:
+ return None
+ visit_status = storage.origin_visit_status_get_latest(origin_url, visit.visit)
+ return converters.from_origin_visit({**visit_status.to_dict(), "type": visit.type})
+
+
def lookup_snapshot_sizes(
snapshot_id: str, branch_name_exclude_prefix: Optional[str] = "refs/pull/"
) -> Dict[str, int]:
diff --git a/swh/web/common/origin_save.py b/swh/web/common/origin_save.py
--- a/swh/web/common/origin_save.py
+++ b/swh/web/common/origin_save.py
@@ -1,9 +1,8 @@
-# Copyright (C) 2018-2021 The Software Heritage developers
+# Copyright (C) 2018-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from bisect import bisect_right
from datetime import datetime, timedelta, timezone
from functools import lru_cache
from itertools import product
@@ -39,12 +38,7 @@
SaveOriginRequest,
SaveUnauthorizedOrigin,
)
-from swh.web.common.origin_visits import get_origin_visits
-from swh.web.common.typing import (
- OriginExistenceCheckInfo,
- OriginInfo,
- SaveOriginRequestInfo,
-)
+from swh.web.common.typing import OriginExistenceCheckInfo, SaveOriginRequestInfo
from swh.web.common.utils import SWH_WEB_METRICS_REGISTRY, parse_iso8601_date_to_utc
from swh.web.config import get_config, scheduler
@@ -285,16 +279,11 @@
# as those requests to storage are expensive and associated loading task
# surely ended up with errors
if time_delta.days <= MAX_THRESHOLD_DAYS:
- try:
- origin_info = archive.lookup_origin(OriginInfo(url=save_request.origin_url))
- origin_visits = get_origin_visits(origin_info)
- visit_dates = [parse_iso8601_date_to_utc(v["date"]) for v in origin_visits]
- i = bisect_right(visit_dates, save_request.request_date)
- if i != len(visit_dates):
- visit_date = visit_dates[i]
- visit_status = origin_visits[i]["status"]
- except Exception as exc:
- sentry_sdk.capture_exception(exc)
+ origin = save_request.origin_url
+ ovs = archive.origin_visit_find_by_date(origin, save_request.request_date)
+ if ovs:
+ visit_date = parse_iso8601_date_to_utc(ovs["date"])
+ visit_status = ovs["status"]
return visit_date, visit_status
diff --git a/swh/web/tests/common/test_archive.py b/swh/web/tests/common/test_archive.py
--- a/swh/web/tests/common/test_archive.py
+++ b/swh/web/tests/common/test_archive.py
@@ -1,14 +1,15 @@
-# Copyright (C) 2015-2021 The Software Heritage developers
+# Copyright (C) 2015-2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU Affero General Public License version 3, or any later version
# See top-level LICENSE file for more information
from collections import defaultdict
+import datetime
import hashlib
import itertools
import random
-from hypothesis import given
+from hypothesis import given, settings
import pytest
from swh.model.from_disk import DentryPerms
@@ -18,6 +19,7 @@
DirectoryEntry,
Origin,
OriginVisit,
+ OriginVisitStatus,
Revision,
Snapshot,
SnapshotBranch,
@@ -207,6 +209,74 @@
assert actual_origin_visit == expected_visit
+@given(new_origin(), visit_dates())
+@settings(max_examples=1)
+def test_origin_visit_find_by_date_no_result(archive_data, new_origin, visit_dates):
+    """An origin with no visit registered in storage should yield no visit status"""
+ archive_data.origin_add([new_origin])
+
+ for visit_date in visit_dates:
+ # No visit yet, so nothing will get returned
+ actual_origin_visit_status = archive.origin_visit_find_by_date(
+ new_origin.url, visit_date
+ )
+ assert actual_origin_visit_status is None
+
+
+@settings(max_examples=1)
+@given(new_origin(), visit_dates())
+def test_origin_visit_find_by_date_with_status(
+ subtest, archive_data, new_origin, visit_dates
+):
+    """Exhaustive checks on an origin whose visits have registered statuses"""
+ archive_data.origin_add([new_origin])
+ # Add some visits
+ visits = archive_data.origin_visit_add(
+ [
+ OriginVisit(origin=new_origin.url, date=ts, type="git",)
+ for ts in sorted(visit_dates)
+ ]
+ )
+    # Then finalize the most recent visit by attaching statuses to it
+ all_statuses = ["not_found", "failed", "partial", "full"]
+ # Add visit status on most recent visit
+ visit = visits[-1]
+ visit_statuses = [
+ OriginVisitStatus(
+ origin=new_origin.url,
+ visit=visit.visit,
+ date=visit.date + datetime.timedelta(days=offset_day),
+ type=visit.type,
+ status=status,
+ snapshot=None,
+ )
+ for (offset_day, status) in enumerate(all_statuses)
+ ]
+ archive_data.origin_visit_status_add(visit_statuses)
+
+ last_visit = max(visits, key=lambda v: v.date)
+
+    # Expected result: the latest status ("full") of the most recent visit
+ expected_visit_status = archive.lookup_origin_visit(new_origin.url, visit.visit)
+
+ for offset_min in [-1, 0, 1]:
+ visit_date = last_visit.date + datetime.timedelta(minutes=offset_min)
+
+ if offset_min == -1:
+ assert last_visit.date > visit_date
+ if offset_min == 0:
+ assert last_visit.date == visit_date
+ if offset_min == 1:
+ assert last_visit.date < visit_date
+
+ actual_origin_visit_status = archive.origin_visit_find_by_date(
+ new_origin.url, visit_date
+ )
+ assert actual_origin_visit_status == expected_visit_status
+ assert actual_origin_visit_status["status"] == "full"
+
+
@given(new_origin())
def test_lookup_origin(archive_data, new_origin):
archive_data.origin_add([new_origin])
diff --git a/swh/web/tests/common/test_origin_save.py b/swh/web/tests/common/test_origin_save.py
--- a/swh/web/tests/common/test_origin_save.py
+++ b/swh/web/tests/common/test_origin_save.py
@@ -215,9 +215,6 @@
_fill_scheduler_db(swh_scheduler)
mock_archive = mocker.patch("swh.web.common.origin_save.archive")
mock_archive.lookup_origin.return_value = {"url": _origin_url}
- mock_get_origin_visits = mocker.patch(
- "swh.web.common.origin_save.get_origin_visits"
- )
# create a visit for the save request
visit_date = datetime.now(tz=timezone.utc).isoformat()
visit_info = OriginVisitInfo(
@@ -231,18 +228,18 @@
url="",
visit=34,
)
- mock_get_origin_visits.return_value = [visit_info]
+ mock_archive.origin_visit_find_by_date.return_value = visit_info
# check visit date has been correctly found
sors = get_save_origin_requests(_visit_type, _origin_url)
assert len(sors) == 1
assert sors[0]["save_task_status"] == SAVE_TASK_SUCCEEDED
assert sors[0]["visit_date"] == visit_date
- mock_get_origin_visits.assert_called_once()
+ mock_archive.origin_visit_find_by_date.assert_called_once()
# check visit is not searched again when it has been found
get_save_origin_requests(_visit_type, _origin_url)
- mock_get_origin_visits.assert_called_once()
+ mock_archive.origin_visit_find_by_date.assert_called_once()
# check visit date are not searched for save requests older than
# one month
@@ -263,7 +260,7 @@
assert len(sors) == 2
assert sors[0]["save_task_status"] == SAVE_TASK_FAILED
assert sors[0]["visit_date"] is None
- mock_get_origin_visits.assert_called_once()
+ mock_archive.origin_visit_find_by_date.assert_called_once()
def _get_save_origin_requests(
@@ -292,9 +289,6 @@
)
mock_archive = mocker.patch("swh.web.common.origin_save.archive")
mock_archive.lookup_origin.return_value = {"url": _origin_url}
- mock_get_origin_visits = mocker.patch(
- "swh.web.common.origin_save.get_origin_visits"
- )
# create a visit for the save request with status created
visit_date = datetime.now(tz=timezone.utc).isoformat()
visit_info = OriginVisitInfo(
@@ -308,10 +302,10 @@
url="",
visit=34,
)
- mock_get_origin_visits.return_value = [visit_info]
+ mock_archive.origin_visit_find_by_date.return_value = visit_info
sors = get_save_origin_requests(_visit_type, _origin_url)
- mock_get_origin_visits.assert_called_once()
+ mock_archive.origin_visit_find_by_date.assert_called_once()
return sors
@@ -587,9 +581,6 @@
)
mock_archive = mocker.patch("swh.web.common.origin_save.archive")
mock_archive.lookup_origin.return_value = {"url": _origin_url}
- mock_get_origin_visits = mocker.patch(
- "swh.web.common.origin_save.get_origin_visits"
- )
# create a visit for the save request with status created
visit_date = datetime.now(tz=timezone.utc).isoformat()
visit_info = OriginVisitInfo(
@@ -603,7 +594,7 @@
url="",
visit=34,
)
- mock_get_origin_visits.return_value = [visit_info]
+ mock_archive.origin_visit_find_by_date.return_value = visit_info
# make the scheduler return a running event
_fill_scheduler_db(
@@ -616,7 +607,11 @@
# The visit is detected but still running
sors = refresh_save_origin_request_statuses()
- assert mock_get_origin_visits.called and mock_get_origin_visits.call_count == 1
+ assert (
+ mock_archive.origin_visit_find_by_date.called
+ and mock_archive.origin_visit_find_by_date.call_count == 1
+ )
+
assert len(sors) == 1
for sor in sors:
@@ -640,13 +635,16 @@
# (visit date and visit status in final state)
visit_date = datetime.now(tz=timezone.utc).isoformat()
visit_info.update({"date": visit_date, "status": VISIT_STATUS_FULL})
- mock_get_origin_visits.return_value = [visit_info]
+ mock_archive.origin_visit_find_by_date.return_value = visit_info
# Detected entry, this time it should be updated
sors = refresh_save_origin_request_statuses()
assert len(sors) == 1
- assert mock_get_origin_visits.called and mock_get_origin_visits.call_count == 1 + 1
+ assert (
+ mock_archive.origin_visit_find_by_date.called
+ and mock_archive.origin_visit_find_by_date.call_count == 1 + 1
+ )
for sor in sors:
assert iso8601.parse_date(sor["save_request_date"]) >= date_pivot
@@ -687,9 +685,6 @@
)
mock_archive = mocker.patch("swh.web.common.origin_save.archive")
mock_archive.lookup_origin.return_value = {"url": _origin_url}
- mock_get_origin_visits = mocker.patch(
- "swh.web.common.origin_save.get_origin_visits"
- )
# create a visit for the save request with status created
visit_date = datetime.now(tz=timezone.utc).isoformat()
visit_info = OriginVisitInfo(
@@ -703,7 +698,7 @@
url="",
visit=34,
)
- mock_get_origin_visits.return_value = [visit_info]
+ mock_archive.origin_visit_find_by_date.return_value = visit_info
# no changes so refresh does detect the entry but does nothing
sors = refresh_save_origin_request_statuses()
@@ -746,7 +741,7 @@
url="",
visit=34,
)
- mock_get_origin_visits.return_value = [visit_info]
+ mock_archive.origin_visit_find_by_date.return_value = visit_info
# Detected entry, this time it should be updated
sors = refresh_save_origin_request_statuses()
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Jan 30 2025, 11:06 AM (6 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3220380
Attached To
D7078: Find latest origin visit status by date instead of visits filtering
Event Timeline
Log In to Comment