Page MenuHomeSoftware Heritage

D4888.id17440.diff
No OneTemporary

D4888.id17440.diff

diff --git a/swh/scheduler/backend.py b/swh/scheduler/backend.py
--- a/swh/scheduler/backend.py
+++ b/swh/scheduler/backend.py
@@ -864,19 +864,21 @@
@db_transaction()
def origin_visit_stats_get(
- self, url: str, visit_type: str, db=None, cur=None
- ) -> Optional[OriginVisitStats]:
+ self, ids: Iterable[Tuple[str, str]], db=None, cur=None
+ ) -> List[OriginVisitStats]:
+ if not ids:
+ return []
+ primary_keys = tuple((origin, visit_type) for (origin, visit_type) in ids)
query = format_query(
- "SELECT {keys} FROM origin_visit_stats WHERE url=%s AND visit_type=%s",
+ """
+ SELECT {keys}
+ FROM (VALUES %s) as stats(url, visit_type)
+ INNER JOIN origin_visit_stats USING (url, visit_type)
+ """,
OriginVisitStats.select_columns(),
)
- cur.execute(query, (url, visit_type))
- row = cur.fetchone()
-
- if row:
- return OriginVisitStats(**row)
- else:
- return None
+ psycopg2.extras.execute_values(cur=cur, sql=query, argslist=primary_keys)
+ return [OriginVisitStats(**row) for row in cur.fetchall()]
@db_transaction()
def update_metrics(
diff --git a/swh/scheduler/interface.py b/swh/scheduler/interface.py
--- a/swh/scheduler/interface.py
+++ b/swh/scheduler/interface.py
@@ -3,8 +3,9 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+
import datetime
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Tuple
from uuid import UUID
from typing_extensions import Protocol, runtime_checkable
@@ -347,9 +348,14 @@
@remote_api_endpoint("visit_stats/get")
def origin_visit_stats_get(
- self, url: str, visit_type: str
- ) -> Optional[OriginVisitStats]:
- """Retrieve the stats for an origin with a given visit type"""
+ self, ids: Iterable[Tuple[str, str]]
+ ) -> List[OriginVisitStats]:
+ """Retrieve the stats for an origin with a given visit type
+
+ If some visit_stats are not found, they are filtered out of the result. So the
+ output list may be of length inferior to the length of the input list.
+
+ """
...
@remote_api_endpoint("scheduler_metrics/update")
diff --git a/swh/scheduler/journal_client.py b/swh/scheduler/journal_client.py
--- a/swh/scheduler/journal_client.py
+++ b/swh/scheduler/journal_client.py
@@ -65,9 +65,9 @@
}
pk = origin, visit_type
if pk not in origin_visit_stats:
- visit_stats = scheduler.origin_visit_stats_get(origin, visit_type)
+ visit_stats = scheduler.origin_visit_stats_get([pk])
origin_visit_stats[pk] = (
- attr.asdict(visit_stats) if visit_stats else empty_object
+ attr.asdict(visit_stats[0]) if visit_stats else empty_object
)
visit_stats_d = origin_visit_stats[pk]
diff --git a/swh/scheduler/simulator/origins.py b/swh/scheduler/simulator/origins.py
--- a/swh/scheduler/simulator/origins.py
+++ b/swh/scheduler/simulator/origins.py
@@ -94,7 +94,9 @@
"""
# This is cheating; actual tasks access the state from the storage, not the
# scheduler
- stats = env.scheduler.origin_visit_stats_get(task.origin, task.visit_type)
+ pk = task.origin, task.visit_type
+ visit_stats = env.scheduler.origin_visit_stats_get([pk])
+ stats: Optional[OriginVisitStats] = visit_stats[0] if len(visit_stats) > 0 else None
last_snapshot = stats.last_snapshot if stats else None
status = OriginVisitStatus(
diff --git a/swh/scheduler/tests/test_cli_journal.py b/swh/scheduler/tests/test_cli_journal.py
--- a/swh/scheduler/tests/test_cli_journal.py
+++ b/swh/scheduler/tests/test_cli_journal.py
@@ -109,7 +109,8 @@
assert result.output == expected_output
actual_visit_stats = swh_scheduler.origin_visit_stats_get(
- visit_status["origin"], visit_status["type"]
+ [(visit_status["origin"], visit_status["type"])]
)
- assert actual_visit_stats is not None
+ assert actual_visit_stats
+ assert len(actual_visit_stats) == 1
diff --git a/swh/scheduler/tests/test_journal_client.py b/swh/scheduler/tests/test_journal_client.py
--- a/swh/scheduler/tests/test_journal_client.py
+++ b/swh/scheduler/tests/test_journal_client.py
@@ -90,11 +90,10 @@
)
# Ensure those visit status are ignored
- for visit_status in visit_statuses:
- actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get(
- visit_status["origin"], visit_status["type"]
- )
- assert actual_origin_visit_stats is None
+ actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get(
+ [(vs["origin"], vs["type"]) for vs in visit_statuses]
+ )
+ assert actual_origin_visit_stats == []
def test_journal_client_origin_visit_status_from_journal_last_notfound(swh_scheduler):
@@ -112,17 +111,19 @@
)
actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get(
- visit_status["origin"], visit_status["type"]
- )
- assert actual_origin_visit_stats == OriginVisitStats(
- url=visit_status["origin"],
- visit_type=visit_status["type"],
- last_eventful=None,
- last_uneventful=None,
- last_failed=None,
- last_notfound=visit_status["date"],
- last_snapshot=None,
+ [(visit_status["origin"], visit_status["type"])]
)
+ assert actual_origin_visit_stats == [
+ OriginVisitStats(
+ url=visit_status["origin"],
+ visit_type=visit_status["type"],
+ last_eventful=None,
+ last_uneventful=None,
+ last_failed=None,
+ last_notfound=visit_status["date"],
+ last_snapshot=None,
+ )
+ ]
visit_statuses = [
{
@@ -149,18 +150,19 @@
for visit_status in visit_statuses:
actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get(
- visit_status["origin"], visit_status["type"]
- )
- assert actual_origin_visit_stats is not None
- assert actual_origin_visit_stats == OriginVisitStats(
- url=visit_status["origin"],
- visit_type=visit_status["type"],
- last_eventful=None,
- last_uneventful=None,
- last_failed=None,
- last_notfound=DATE3,
- last_snapshot=None,
+ [(visit_status["origin"], visit_status["type"])]
)
+ assert actual_origin_visit_stats == [
+ OriginVisitStats(
+ url=visit_status["origin"],
+ visit_type=visit_status["type"],
+ last_eventful=None,
+ last_uneventful=None,
+ last_failed=None,
+ last_notfound=DATE3,
+ last_snapshot=None,
+ )
+ ]
def test_journal_client_origin_visit_status_from_journal_last_failed(swh_scheduler):
@@ -190,18 +192,19 @@
# Ensure those visit status are ignored
for visit_status in visit_statuses:
actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get(
- visit_status["origin"], visit_status["type"]
- )
- assert actual_origin_visit_stats is not None
- assert actual_origin_visit_stats == OriginVisitStats(
- url=visit_status["origin"],
- visit_type=visit_status["type"],
- last_eventful=None,
- last_uneventful=None,
- last_failed=visit_status["date"],
- last_notfound=None,
- last_snapshot=None,
+ [(visit_status["origin"], visit_status["type"])]
)
+ assert actual_origin_visit_stats == [
+ OriginVisitStats(
+ url=visit_status["origin"],
+ visit_type=visit_status["type"],
+ last_eventful=None,
+ last_uneventful=None,
+ last_failed=visit_status["date"],
+ last_notfound=None,
+ last_snapshot=None,
+ )
+ ]
visit_statuses = [
{
@@ -228,18 +231,19 @@
for visit_status in visit_statuses:
actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get(
- visit_status["origin"], visit_status["type"]
- )
- assert actual_origin_visit_stats is not None
- assert actual_origin_visit_stats == OriginVisitStats(
- url=visit_status["origin"],
- visit_type=visit_status["type"],
- last_eventful=None,
- last_uneventful=None,
- last_failed=DATE3,
- last_notfound=None,
- last_snapshot=None,
+ [(visit_status["origin"], visit_status["type"])]
)
+ assert actual_origin_visit_stats == [
+ OriginVisitStats(
+ url=visit_status["origin"],
+ visit_type=visit_status["type"],
+ last_eventful=None,
+ last_uneventful=None,
+ last_failed=DATE3,
+ last_notfound=None,
+ last_snapshot=None,
+ )
+ ]
def test_journal_client_origin_visit_status_from_journal_last_eventful(swh_scheduler):
@@ -268,18 +272,19 @@
for visit_status in visit_statuses:
actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get(
- visit_status["origin"], visit_status["type"]
- )
- assert actual_origin_visit_stats is not None
- assert actual_origin_visit_stats == OriginVisitStats(
- url=visit_status["origin"],
- visit_type=visit_status["type"],
- last_eventful=visit_status["date"],
- last_uneventful=None,
- last_failed=None,
- last_notfound=None,
- last_snapshot=visit_status["snapshot"],
+ [(visit_status["origin"], visit_status["type"])]
)
+ assert actual_origin_visit_stats == [
+ OriginVisitStats(
+ url=visit_status["origin"],
+ visit_type=visit_status["type"],
+ last_eventful=visit_status["date"],
+ last_uneventful=None,
+ last_failed=None,
+ last_notfound=None,
+ last_snapshot=visit_status["snapshot"],
+ )
+ ]
visit_statuses = [
{
@@ -306,18 +311,19 @@
for visit_status in visit_statuses:
actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get(
- visit_status["origin"], visit_status["type"]
- )
- assert actual_origin_visit_stats is not None
- assert actual_origin_visit_stats == OriginVisitStats(
- url=visit_status["origin"],
- visit_type=visit_status["type"],
- last_eventful=DATE3,
- last_uneventful=None,
- last_failed=None,
- last_notfound=None,
- last_snapshot=hash_to_bytes("dddcc0710eb6cf9efd5b920a8453e1e07157bddd"),
+ [(visit_status["origin"], visit_status["type"])]
)
+ assert actual_origin_visit_stats == [
+ OriginVisitStats(
+ url=visit_status["origin"],
+ visit_type=visit_status["type"],
+ last_eventful=DATE3,
+ last_uneventful=None,
+ last_failed=None,
+ last_notfound=None,
+ last_snapshot=hash_to_bytes("dddcc0710eb6cf9efd5b920a8453e1e07157bddd"),
+ )
+ ]
def test_journal_client_origin_visit_status_from_journal_last_uneventful(swh_scheduler):
@@ -350,18 +356,19 @@
)
actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get(
- visit_status["origin"], visit_status["type"]
- )
- assert actual_origin_visit_stats is not None
- assert actual_origin_visit_stats == OriginVisitStats(
- url=visit_status["origin"],
- visit_type=visit_status["type"],
- last_eventful=DATE1,
- last_uneventful=visit_status["date"], # most recent date but uneventful
- last_failed=DATE2,
- last_notfound=DATE1,
- last_snapshot=visit_status["snapshot"],
+ [(visit_status["origin"], visit_status["type"])]
)
+ assert actual_origin_visit_stats == [
+ OriginVisitStats(
+ url=visit_status["origin"],
+ visit_type=visit_status["type"],
+ last_eventful=DATE1,
+ last_uneventful=visit_status["date"], # most recent date but uneventful
+ last_failed=DATE2,
+ last_notfound=DATE1,
+ last_snapshot=visit_status["snapshot"],
+ )
+ ]
VISIT_STATUSES = [
@@ -422,7 +429,9 @@
last_snapshot=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
)
- assert swh_scheduler.origin_visit_stats_get("foo", "git") == expected_visit_stats
+ assert swh_scheduler.origin_visit_stats_get([("foo", "git")]) == [
+ expected_visit_stats
+ ]
VISIT_STATUSES1 = [
@@ -483,9 +492,9 @@
last_snapshot=hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"),
)
- assert (
- swh_scheduler.origin_visit_stats_get("cavabarder", "hg") == expected_visit_stats
- )
+ assert swh_scheduler.origin_visit_stats_get([("cavabarder", "hg")]) == [
+ expected_visit_stats
+ ]
VISIT_STATUSES_2 = [
@@ -572,21 +581,21 @@
)
after = utcnow()
- assert swh_scheduler.origin_visit_stats_get("cavabarder", "hg") is None
- assert swh_scheduler.origin_visit_stats_get("cavabarder", "git") is not None
+ assert swh_scheduler.origin_visit_stats_get([("cavabarder", "hg")]) == []
+ assert swh_scheduler.origin_visit_stats_get([("cavabarder", "git")])[0] is not None
process_journal_objects(
{"origin_visit_status": VISIT_STATUSES_2}, scheduler=swh_scheduler
)
for url in ("cavabarder", "iciaussi"):
- ovs = swh_scheduler.origin_visit_stats_get(url, "git")
+ ovs = swh_scheduler.origin_visit_stats_get([(url, "git")])[0]
assert before <= ovs.last_scheduled <= after
- ovs = swh_scheduler.origin_visit_stats_get(url, "hg")
+ ovs = swh_scheduler.origin_visit_stats_get([(url, "hg")])[0]
assert ovs.last_scheduled is None
- ovs = swh_scheduler.origin_visit_stats_get("cavabarder", "git")
+ ovs = swh_scheduler.origin_visit_stats_get([("cavabarder", "git")])[0]
assert ovs.last_eventful == DATE1 + 5 * ONE_DAY
assert ovs.last_uneventful is None
assert ovs.last_failed is None
diff --git a/swh/scheduler/tests/test_scheduler.py b/swh/scheduler/tests/test_scheduler.py
--- a/swh/scheduler/tests/test_scheduler.py
+++ b/swh/scheduler/tests/test_scheduler.py
@@ -772,9 +772,8 @@
assert len(ret) == NUM_RESULTS
for origin in ret:
- visit_stats = swh_scheduler.origin_visit_stats_get(
- origin.url, origin.visit_type
- )
+ pk = (origin.url, origin.visit_type)
+ visit_stats = swh_scheduler.origin_visit_stats_get([pk])[0]
assert visit_stats is not None
assert before <= visit_stats.last_scheduled <= after
@@ -805,6 +804,9 @@
for tt in TASK_TYPES.values():
scheduler.create_task_type(tt)
+ def test_origin_visit_stats_get_empty(self, swh_scheduler) -> None:
+ assert swh_scheduler.origin_visit_stats_get([]) == []
+
def test_origin_visit_stats_upsert(self, swh_scheduler) -> None:
eventful_date = utcnow()
url = "https://github.com/test"
@@ -820,8 +822,8 @@
swh_scheduler.origin_visit_stats_upsert([visit_stats])
swh_scheduler.origin_visit_stats_upsert([visit_stats])
- assert swh_scheduler.origin_visit_stats_get(url, "git") == visit_stats
- assert swh_scheduler.origin_visit_stats_get(url, "svn") is None
+ assert swh_scheduler.origin_visit_stats_get([(url, "git")]) == [visit_stats]
+ assert swh_scheduler.origin_visit_stats_get([(url, "svn")]) == []
uneventful_date = utcnow()
visit_stats = OriginVisitStats(
@@ -834,7 +836,7 @@
)
swh_scheduler.origin_visit_stats_upsert([visit_stats])
- uneventful_visit = swh_scheduler.origin_visit_stats_get(url, "git")
+ uneventful_visits = swh_scheduler.origin_visit_stats_get([(url, "git")])
expected_visit_stats = OriginVisitStats(
url=url,
@@ -845,7 +847,7 @@
last_notfound=None,
)
- assert uneventful_visit == expected_visit_stats
+ assert uneventful_visits == [expected_visit_stats]
failed_date = utcnow()
visit_stats = OriginVisitStats(
@@ -858,7 +860,7 @@
)
swh_scheduler.origin_visit_stats_upsert([visit_stats])
- failed_visit = swh_scheduler.origin_visit_stats_get(url, "git")
+ failed_visits = swh_scheduler.origin_visit_stats_get([(url, "git")])
expected_visit_stats = OriginVisitStats(
url=url,
@@ -869,7 +871,7 @@
last_notfound=None,
)
- assert failed_visit == expected_visit_stats
+ assert failed_visits == [expected_visit_stats]
def test_origin_visit_stats_upsert_with_snapshot(self, swh_scheduler) -> None:
eventful_date = utcnow()
@@ -886,8 +888,8 @@
)
swh_scheduler.origin_visit_stats_upsert([visit_stats])
- assert swh_scheduler.origin_visit_stats_get(url, "git") == visit_stats
- assert swh_scheduler.origin_visit_stats_get(url, "svn") is None
+ assert swh_scheduler.origin_visit_stats_get([(url, "git")]) == [visit_stats]
+ assert swh_scheduler.origin_visit_stats_get([(url, "svn")]) == []
def test_origin_visit_stats_upsert_messing_with_time(self, swh_scheduler) -> None:
url = "interesting-origin"
@@ -911,7 +913,7 @@
)
swh_scheduler.origin_visit_stats_upsert([visit_stats0])
- actual_visit_stats0 = swh_scheduler.origin_visit_stats_get(url, "git")
+ actual_visit_stats0 = swh_scheduler.origin_visit_stats_get([(url, "git")])[0]
assert actual_visit_stats0 == visit_stats0
visit_stats2 = OriginVisitStats(
@@ -924,7 +926,7 @@
)
swh_scheduler.origin_visit_stats_upsert([visit_stats2])
- actual_visit_stats2 = swh_scheduler.origin_visit_stats_get(url, "git")
+ actual_visit_stats2 = swh_scheduler.origin_visit_stats_get([(url, "git")])[0]
assert actual_visit_stats2 == attr.evolve(
actual_visit_stats0, last_uneventful=date1
)
@@ -943,7 +945,7 @@
)
swh_scheduler.origin_visit_stats_upsert([visit_stats1])
- actual_visit_stats1 = swh_scheduler.origin_visit_stats_get(url, "git")
+ actual_visit_stats1 = swh_scheduler.origin_visit_stats_get([(url, "git")])[0]
assert actual_visit_stats1 == attr.evolve(
actual_visit_stats2, last_eventful=date2
@@ -974,13 +976,10 @@
swh_scheduler.origin_visit_stats_upsert(visit_stats)
- for visit_stat in visit_stats:
- assert (
- swh_scheduler.origin_visit_stats_get(
- visit_stat.url, visit_stat.visit_type
- )
- is not None
- )
+ for visit_stat in swh_scheduler.origin_visit_stats_get(
+ [(vs.url, vs.visit_type) for vs in visit_stats]
+ ):
+ assert visit_stat is not None
def test_origin_visit_stats_upsert_cardinality_failing(self, swh_scheduler) -> None:
"""Batch upsert does not support altering multiple times the same origin-visit-status

File Metadata

Mime Type
text/plain
Expires
Thu, Jul 3, 3:20 PM (5 d, 19 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3226412

Event Timeline