Page MenuHomeSoftware Heritage

D4888.id17387.diff
No OneTemporary

D4888.id17387.diff

diff --git a/swh/scheduler/backend.py b/swh/scheduler/backend.py
--- a/swh/scheduler/backend.py
+++ b/swh/scheduler/backend.py
@@ -837,16 +837,26 @@
@db_transaction()
def origin_visit_stats_get(
- self, url: str, visit_type: str, db=None, cur=None
- ) -> Optional[OriginVisitStats]:
+ self, ids: Iterable[Tuple[str, str]], db=None, cur=None
+ ) -> Dict[Tuple[str, str], OriginVisitStats]:
+ if not ids:
+ return {}
+
+ primary_keys = tuple((origin, visit_type) for (origin, visit_type) in ids)
query = format_query(
- "SELECT {keys} FROM origin_visit_stats WHERE url=%s AND visit_type=%s",
+ """
+ SELECT {keys}
+ FROM (VALUES %s) as stats(url, visit_type)
+ INNER JOIN origin_visit_stats USING (url, visit_type)
+ """,
OriginVisitStats.select_columns(),
)
- cur.execute(query, (url, visit_type))
- row = cur.fetchone()
- if row:
- return OriginVisitStats(**row)
- else:
- return None
+ psycopg2.extras.execute_values(
+ cur=cur, sql=query, argslist=primary_keys, fetch=False,
+ )
+
+ return {
+ (row["url"], row["visit_type"]): OriginVisitStats(**row)
+ for row in cur.fetchall()
+ }
diff --git a/swh/scheduler/interface.py b/swh/scheduler/interface.py
--- a/swh/scheduler/interface.py
+++ b/swh/scheduler/interface.py
@@ -3,7 +3,7 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Tuple
from uuid import UUID
from typing_extensions import Protocol, runtime_checkable
@@ -336,7 +336,7 @@
@remote_api_endpoint("visit_stats/get")
def origin_visit_stats_get(
- self, url: str, visit_type: str
- ) -> Optional[OriginVisitStats]:
+ self, ids: Iterable[Tuple[str, str]]
+ ) -> Dict[Tuple[str, str], OriginVisitStats]:
"""Retrieve the stats for an origin with a given visit type"""
...
diff --git a/swh/scheduler/journal_client.py b/swh/scheduler/journal_client.py
--- a/swh/scheduler/journal_client.py
+++ b/swh/scheduler/journal_client.py
@@ -60,7 +60,7 @@
}
pk = origin, visit_type
if pk not in origin_visit_stats:
- visit_stats = scheduler.origin_visit_stats_get(origin, visit_type)
+ visit_stats = scheduler.origin_visit_stats_get([pk]).get(pk)
origin_visit_stats[pk] = (
attr.asdict(visit_stats) if visit_stats else empty_object
)
diff --git a/swh/scheduler/tests/test_cli_journal.py b/swh/scheduler/tests/test_cli_journal.py
--- a/swh/scheduler/tests/test_cli_journal.py
+++ b/swh/scheduler/tests/test_cli_journal.py
@@ -108,7 +108,8 @@
assert result.output == expected_output
actual_visit_stats = swh_scheduler.origin_visit_stats_get(
- visit_status["origin"], visit_status["type"]
+ [(visit_status["origin"], visit_status["type"])]
)
- assert actual_visit_stats is not None
+ assert actual_visit_stats
+ assert len(actual_visit_stats) == 1
diff --git a/swh/scheduler/tests/test_journal_client.py b/swh/scheduler/tests/test_journal_client.py
--- a/swh/scheduler/tests/test_journal_client.py
+++ b/swh/scheduler/tests/test_journal_client.py
@@ -81,11 +81,10 @@
)
# Ensure those visit status are ignored
- for visit_status in visit_statuses:
- actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get(
- visit_status["origin"], visit_status["type"]
- )
- assert actual_origin_visit_stats is None
+ actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get(
+ [(vs["origin"], vs["type"]) for vs in visit_statuses]
+ )
+ assert actual_origin_visit_stats == {}
def test_journal_client_origin_visit_status_from_journal_last_notfound(swh_scheduler):
@@ -102,10 +101,9 @@
{"origin_visit_status": [visit_status]}, scheduler=swh_scheduler
)
- actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get(
- visit_status["origin"], visit_status["type"]
- )
- assert actual_origin_visit_stats == OriginVisitStats(
+ pk = (visit_status["origin"], visit_status["type"])
+ actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([pk])
+ assert actual_origin_visit_stats[pk] == OriginVisitStats(
url=visit_status["origin"],
visit_type=visit_status["type"],
last_eventful=None,
@@ -139,11 +137,9 @@
)
for visit_status in visit_statuses:
- actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get(
- visit_status["origin"], visit_status["type"]
- )
- assert actual_origin_visit_stats is not None
- assert actual_origin_visit_stats == OriginVisitStats(
+ pk = (visit_status["origin"], visit_status["type"])
+ actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([pk])
+ assert actual_origin_visit_stats[pk] == OriginVisitStats(
url=visit_status["origin"],
visit_type=visit_status["type"],
last_eventful=None,
@@ -180,11 +176,9 @@
# Ensure those visit status are ignored
for visit_status in visit_statuses:
- actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get(
- visit_status["origin"], visit_status["type"]
- )
- assert actual_origin_visit_stats is not None
- assert actual_origin_visit_stats == OriginVisitStats(
+ pk = (visit_status["origin"], visit_status["type"])
+ actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([pk])
+ assert actual_origin_visit_stats[pk] == OriginVisitStats(
url=visit_status["origin"],
visit_type=visit_status["type"],
last_eventful=None,
@@ -218,11 +212,9 @@
)
for visit_status in visit_statuses:
- actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get(
- visit_status["origin"], visit_status["type"]
- )
- assert actual_origin_visit_stats is not None
- assert actual_origin_visit_stats == OriginVisitStats(
+ pk = (visit_status["origin"], visit_status["type"])
+ actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([pk])
+ assert actual_origin_visit_stats[pk] == OriginVisitStats(
url=visit_status["origin"],
visit_type=visit_status["type"],
last_eventful=None,
@@ -258,11 +250,9 @@
)
for visit_status in visit_statuses:
- actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get(
- visit_status["origin"], visit_status["type"]
- )
- assert actual_origin_visit_stats is not None
- assert actual_origin_visit_stats == OriginVisitStats(
+ pk = (visit_status["origin"], visit_status["type"])
+ actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([pk])
+ assert actual_origin_visit_stats[pk] == OriginVisitStats(
url=visit_status["origin"],
visit_type=visit_status["type"],
last_eventful=visit_status["date"],
@@ -296,11 +286,9 @@
)
for visit_status in visit_statuses:
- actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get(
- visit_status["origin"], visit_status["type"]
- )
- assert actual_origin_visit_stats is not None
- assert actual_origin_visit_stats == OriginVisitStats(
+ pk = (visit_status["origin"], visit_status["type"])
+ actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([pk])
+ assert actual_origin_visit_stats[pk] == OriginVisitStats(
url=visit_status["origin"],
visit_type=visit_status["type"],
last_eventful=DATE3,
@@ -340,11 +328,9 @@
{"origin_visit_status": [visit_status]}, scheduler=swh_scheduler
)
- actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get(
- visit_status["origin"], visit_status["type"]
- )
- assert actual_origin_visit_stats is not None
- assert actual_origin_visit_stats == OriginVisitStats(
+ pk = (visit_status["origin"], visit_status["type"])
+ actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([pk])
+ assert actual_origin_visit_stats[pk] == OriginVisitStats(
url=visit_status["origin"],
visit_type=visit_status["type"],
last_eventful=DATE1,
@@ -413,7 +399,8 @@
last_snapshot=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
)
- assert swh_scheduler.origin_visit_stats_get("foo", "git") == expected_visit_stats
+ pk = ("foo", "git")
+ assert swh_scheduler.origin_visit_stats_get([pk]) == {pk: expected_visit_stats}
VISIT_STATUSES1 = [
@@ -474,9 +461,8 @@
last_snapshot=hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"),
)
- assert (
- swh_scheduler.origin_visit_stats_get("cavabarder", "hg") == expected_visit_stats
- )
+ pk = ("cavabarder", "hg")
+ assert swh_scheduler.origin_visit_stats_get([pk]) == {pk: expected_visit_stats}
VISIT_STATUSES_2 = [
@@ -563,21 +549,26 @@
)
after = utcnow()
- assert swh_scheduler.origin_visit_stats_get("cavabarder", "hg") is None
- assert swh_scheduler.origin_visit_stats_get("cavabarder", "git") is not None
+ assert swh_scheduler.origin_visit_stats_get([("cavabarder", "hg")]) == {}
+
+ pk = ("cavabarder", "git")
+ assert swh_scheduler.origin_visit_stats_get([pk])[pk] is not None
process_journal_objects(
{"origin_visit_status": VISIT_STATUSES_2}, scheduler=swh_scheduler
)
for url in ("cavabarder", "iciaussi"):
- ovs = swh_scheduler.origin_visit_stats_get(url, "git")
+ pk = (url, "git")
+ ovs = swh_scheduler.origin_visit_stats_get([pk])[pk]
assert before <= ovs.last_scheduled <= after
- ovs = swh_scheduler.origin_visit_stats_get(url, "hg")
+ pk = (url, "hg")
+ ovs = swh_scheduler.origin_visit_stats_get([pk])[pk]
assert ovs.last_scheduled is None
- ovs = swh_scheduler.origin_visit_stats_get("cavabarder", "git")
+ pk = ("cavabarder", "git")
+ ovs = swh_scheduler.origin_visit_stats_get([pk])[pk]
assert ovs.last_eventful == DATE1 + 5 * ONE_DAY
assert ovs.last_uneventful is None
assert ovs.last_failed is None
diff --git a/swh/scheduler/tests/test_scheduler.py b/swh/scheduler/tests/test_scheduler.py
--- a/swh/scheduler/tests/test_scheduler.py
+++ b/swh/scheduler/tests/test_scheduler.py
@@ -744,9 +744,8 @@
assert len(ret) == NUM_RESULTS
for origin in ret:
- visit_stats = swh_scheduler.origin_visit_stats_get(
- origin.url, origin.visit_type
- )
+ pk = (origin.url, origin.visit_type)
+ visit_stats = swh_scheduler.origin_visit_stats_get([pk])[pk]
assert visit_stats is not None
assert before <= visit_stats.last_scheduled <= after
@@ -792,8 +791,10 @@
swh_scheduler.origin_visit_stats_upsert([visit_stats])
swh_scheduler.origin_visit_stats_upsert([visit_stats])
- assert swh_scheduler.origin_visit_stats_get(url, "git") == visit_stats
- assert swh_scheduler.origin_visit_stats_get(url, "svn") is None
+ assert swh_scheduler.origin_visit_stats_get([(url, "git")]) == {
+ (url, "git"): visit_stats
+ }
+ assert swh_scheduler.origin_visit_stats_get([(url, "svn")]) == {}
uneventful_date = utcnow()
visit_stats = OriginVisitStats(
@@ -806,7 +807,7 @@
)
swh_scheduler.origin_visit_stats_upsert([visit_stats])
- uneventful_visit = swh_scheduler.origin_visit_stats_get(url, "git")
+ uneventful_visits = swh_scheduler.origin_visit_stats_get([(url, "git")])
expected_visit_stats = OriginVisitStats(
url=url,
@@ -817,7 +818,7 @@
last_notfound=None,
)
- assert uneventful_visit == expected_visit_stats
+ assert uneventful_visits == {(url, "git"): expected_visit_stats}
failed_date = utcnow()
visit_stats = OriginVisitStats(
@@ -830,7 +831,7 @@
)
swh_scheduler.origin_visit_stats_upsert([visit_stats])
- failed_visit = swh_scheduler.origin_visit_stats_get(url, "git")
+ failed_visits = swh_scheduler.origin_visit_stats_get([(url, "git")])
expected_visit_stats = OriginVisitStats(
url=url,
@@ -841,7 +842,7 @@
last_notfound=None,
)
- assert failed_visit == expected_visit_stats
+ assert failed_visits == {(url, "git"): expected_visit_stats}
def test_origin_visit_stats_upsert_with_snapshot(self, swh_scheduler) -> None:
eventful_date = utcnow()
@@ -858,8 +859,10 @@
)
swh_scheduler.origin_visit_stats_upsert([visit_stats])
- assert swh_scheduler.origin_visit_stats_get(url, "git") == visit_stats
- assert swh_scheduler.origin_visit_stats_get(url, "svn") is None
+ assert swh_scheduler.origin_visit_stats_get([(url, "git")]) == {
+ (url, "git"): visit_stats
+ }
+ assert swh_scheduler.origin_visit_stats_get([(url, "svn")]) == {}
def test_origin_visit_stats_upsert_messing_with_time(self, swh_scheduler) -> None:
url = "interesting-origin"
@@ -883,7 +886,8 @@
)
swh_scheduler.origin_visit_stats_upsert([visit_stats0])
- actual_visit_stats0 = swh_scheduler.origin_visit_stats_get(url, "git")
+ pk = (url, "git")
+ actual_visit_stats0 = swh_scheduler.origin_visit_stats_get([pk])[pk]
assert actual_visit_stats0 == visit_stats0
visit_stats2 = OriginVisitStats(
@@ -896,7 +900,7 @@
)
swh_scheduler.origin_visit_stats_upsert([visit_stats2])
- actual_visit_stats2 = swh_scheduler.origin_visit_stats_get(url, "git")
+ actual_visit_stats2 = swh_scheduler.origin_visit_stats_get([pk])[pk]
assert actual_visit_stats2 == attr.evolve(
actual_visit_stats0, last_uneventful=date1
)
@@ -915,7 +919,7 @@
)
swh_scheduler.origin_visit_stats_upsert([visit_stats1])
- actual_visit_stats1 = swh_scheduler.origin_visit_stats_get(url, "git")
+ actual_visit_stats1 = swh_scheduler.origin_visit_stats_get([pk])[pk]
assert actual_visit_stats1 == attr.evolve(
actual_visit_stats2, last_eventful=date2
@@ -946,13 +950,13 @@
swh_scheduler.origin_visit_stats_upsert(visit_stats)
+ visit_stats_dict = swh_scheduler.origin_visit_stats_get(
+ [(vs.url, vs.visit_type) for vs in visit_stats]
+ )
for visit_stat in visit_stats:
- assert (
- swh_scheduler.origin_visit_stats_get(
- visit_stat.url, visit_stat.visit_type
- )
- is not None
- )
+ pk = (visit_stat.url, visit_stat.visit_type)
+ assert pk in visit_stats_dict
+ assert visit_stats_dict[pk] is not None
def test_origin_visit_stats_upsert_cardinality_failing(self, swh_scheduler) -> None:
"""Batch upsert does not support altering multiple times the same origin-visit-status

File Metadata

Mime Type
text/plain
Expires
Thu, Jul 3, 3:42 PM (2 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3230924

Event Timeline