diff --git a/swh/scheduler/backend.py b/swh/scheduler/backend.py --- a/swh/scheduler/backend.py +++ b/swh/scheduler/backend.py @@ -837,16 +837,26 @@ @db_transaction() def origin_visit_stats_get( - self, url: str, visit_type: str, db=None, cur=None - ) -> Optional[OriginVisitStats]: + self, ids: Iterable[Tuple[str, str]], db=None, cur=None + ) -> Dict[Tuple[str, str], OriginVisitStats]: + if not ids: + return {} + + primary_keys = tuple((origin, visit_type) for (origin, visit_type) in ids) query = format_query( - "SELECT {keys} FROM origin_visit_stats WHERE url=%s AND visit_type=%s", + """ + SELECT {keys} + FROM (VALUES %s) as stats(url, visit_type) + INNER JOIN origin_visit_stats USING (url, visit_type) + """, OriginVisitStats.select_columns(), ) - cur.execute(query, (url, visit_type)) - row = cur.fetchone() - if row: - return OriginVisitStats(**row) - else: - return None + psycopg2.extras.execute_values( + cur=cur, sql=query, argslist=primary_keys, fetch=False, + ) + + return { + (row["url"], row["visit_type"]): OriginVisitStats(**row) + for row in cur.fetchall() + } diff --git a/swh/scheduler/interface.py b/swh/scheduler/interface.py --- a/swh/scheduler/interface.py +++ b/swh/scheduler/interface.py @@ -3,7 +3,7 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from typing import Any, Dict, Iterable, List, Optional +from typing import Any, Dict, Iterable, List, Optional, Tuple from uuid import UUID from typing_extensions import Protocol, runtime_checkable @@ -336,7 +336,7 @@ @remote_api_endpoint("visit_stats/get") def origin_visit_stats_get( - self, url: str, visit_type: str - ) -> Optional[OriginVisitStats]: + self, ids: Iterable[Tuple[str, str]] + ) -> Dict[Tuple[str, str], OriginVisitStats]: """Retrieve the stats for an origin with a given visit type""" ... diff --git a/swh/scheduler/journal_client.py b/swh/scheduler/journal_client.py --- a/swh/scheduler/journal_client.py +++ b/swh/scheduler/journal_client.py @@ -60,7 +60,7 @@ } pk = origin, visit_type if pk not in origin_visit_stats: - visit_stats = scheduler.origin_visit_stats_get(origin, visit_type) + visit_stats = scheduler.origin_visit_stats_get([pk]).get(pk) origin_visit_stats[pk] = ( attr.asdict(visit_stats) if visit_stats else empty_object ) diff --git a/swh/scheduler/tests/test_cli_journal.py b/swh/scheduler/tests/test_cli_journal.py --- a/swh/scheduler/tests/test_cli_journal.py +++ b/swh/scheduler/tests/test_cli_journal.py @@ -108,7 +108,8 @@ assert result.output == expected_output actual_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] + [(visit_status["origin"], visit_status["type"])] ) - assert actual_visit_stats is not None + assert actual_visit_stats + assert len(actual_visit_stats) == 1 diff --git a/swh/scheduler/tests/test_journal_client.py b/swh/scheduler/tests/test_journal_client.py --- a/swh/scheduler/tests/test_journal_client.py +++ b/swh/scheduler/tests/test_journal_client.py @@ -81,11 +81,10 @@ ) # Ensure those visit status are ignored - for visit_status in visit_statuses: - actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] - ) - assert actual_origin_visit_stats is None + actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( + [(vs["origin"], vs["type"]) for vs in visit_statuses] + ) + assert actual_origin_visit_stats == {} def test_journal_client_origin_visit_status_from_journal_last_notfound(swh_scheduler): @@ -102,10 +101,9 @@ {"origin_visit_status": [visit_status]}, scheduler=swh_scheduler ) - actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] - ) - assert actual_origin_visit_stats == OriginVisitStats( + pk = (visit_status["origin"], visit_status["type"]) + actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([pk]) + assert actual_origin_visit_stats[pk] == OriginVisitStats( url=visit_status["origin"], visit_type=visit_status["type"], last_eventful=None, @@ -139,11 +137,9 @@ ) for visit_status in visit_statuses: - actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] - ) - assert actual_origin_visit_stats is not None - assert actual_origin_visit_stats == OriginVisitStats( + pk = (visit_status["origin"], visit_status["type"]) + actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([pk]) + assert actual_origin_visit_stats[pk] == OriginVisitStats( url=visit_status["origin"], visit_type=visit_status["type"], last_eventful=None, @@ -180,11 +176,9 @@ # Ensure those visit status are ignored for visit_status in visit_statuses: - actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] - ) - assert actual_origin_visit_stats is not None - assert actual_origin_visit_stats == OriginVisitStats( + pk = (visit_status["origin"], visit_status["type"]) + actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([pk]) + assert actual_origin_visit_stats[pk] == OriginVisitStats( url=visit_status["origin"], visit_type=visit_status["type"], last_eventful=None, @@ -218,11 +212,9 @@ ) for visit_status in visit_statuses: - actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] - ) - assert actual_origin_visit_stats is not None - assert actual_origin_visit_stats == OriginVisitStats( + pk = (visit_status["origin"], visit_status["type"]) + actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([pk]) + assert actual_origin_visit_stats[pk] == OriginVisitStats( url=visit_status["origin"], visit_type=visit_status["type"], last_eventful=None, @@ -258,11 +250,9 @@ ) for visit_status in visit_statuses: - actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] - ) - assert actual_origin_visit_stats is not None - assert actual_origin_visit_stats == OriginVisitStats( + pk = (visit_status["origin"], visit_status["type"]) + actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([pk]) + assert actual_origin_visit_stats[pk] == OriginVisitStats( url=visit_status["origin"], visit_type=visit_status["type"], last_eventful=visit_status["date"], @@ -296,11 +286,9 @@ ) for visit_status in visit_statuses: - actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] - ) - assert actual_origin_visit_stats is not None - assert actual_origin_visit_stats == OriginVisitStats( + pk = (visit_status["origin"], visit_status["type"]) + actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([pk]) + assert actual_origin_visit_stats[pk] == OriginVisitStats( url=visit_status["origin"], visit_type=visit_status["type"], last_eventful=DATE3, @@ -340,11 +328,9 @@ {"origin_visit_status": [visit_status]}, scheduler=swh_scheduler ) - actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] - ) - assert actual_origin_visit_stats is not None - assert actual_origin_visit_stats == OriginVisitStats( + pk = (visit_status["origin"], visit_status["type"]) + actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([pk]) + assert actual_origin_visit_stats[pk] == OriginVisitStats( url=visit_status["origin"], visit_type=visit_status["type"], last_eventful=DATE1, @@ -413,7 +399,8 @@ last_snapshot=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"), ) - assert swh_scheduler.origin_visit_stats_get("foo", "git") == expected_visit_stats + pk = ("foo", "git") + assert swh_scheduler.origin_visit_stats_get([pk]) == {pk: expected_visit_stats} VISIT_STATUSES1 = [ @@ -474,9 +461,8 @@ last_snapshot=hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"), ) - assert ( - swh_scheduler.origin_visit_stats_get("cavabarder", "hg") == expected_visit_stats - ) + pk = ("cavabarder", "hg") + assert swh_scheduler.origin_visit_stats_get([pk]) == {pk: expected_visit_stats} VISIT_STATUSES_2 = [ @@ -563,21 +549,26 @@ ) after = utcnow() - assert swh_scheduler.origin_visit_stats_get("cavabarder", "hg") is None - assert swh_scheduler.origin_visit_stats_get("cavabarder", "git") is not None + assert swh_scheduler.origin_visit_stats_get([("cavabarder", "hg")]) == {} + + pk = ("cavabarder", "git") + assert swh_scheduler.origin_visit_stats_get([pk])[pk] is not None process_journal_objects( {"origin_visit_status": VISIT_STATUSES_2}, scheduler=swh_scheduler ) for url in ("cavabarder", "iciaussi"): - ovs = swh_scheduler.origin_visit_stats_get(url, "git") + pk = (url, "git") + ovs = swh_scheduler.origin_visit_stats_get([pk])[pk] assert before <= ovs.last_scheduled <= after - ovs = swh_scheduler.origin_visit_stats_get(url, "hg") + pk = (url, "hg") + ovs = swh_scheduler.origin_visit_stats_get([pk])[pk] assert ovs.last_scheduled is None - ovs = swh_scheduler.origin_visit_stats_get("cavabarder", "git") + pk = ("cavabarder", "git") + ovs = swh_scheduler.origin_visit_stats_get([pk])[pk] assert ovs.last_eventful == DATE1 + 5 * ONE_DAY assert ovs.last_uneventful is None assert ovs.last_failed is None diff --git a/swh/scheduler/tests/test_scheduler.py b/swh/scheduler/tests/test_scheduler.py --- a/swh/scheduler/tests/test_scheduler.py +++ b/swh/scheduler/tests/test_scheduler.py @@ -744,9 +744,8 @@ assert len(ret) == NUM_RESULTS for origin in ret: - visit_stats = swh_scheduler.origin_visit_stats_get( - origin.url, origin.visit_type - ) + pk = (origin.url, origin.visit_type) + visit_stats = swh_scheduler.origin_visit_stats_get([pk])[pk] assert visit_stats is not None assert before <= visit_stats.last_scheduled <= after @@ -792,8 +791,10 @@ swh_scheduler.origin_visit_stats_upsert([visit_stats]) swh_scheduler.origin_visit_stats_upsert([visit_stats]) - assert swh_scheduler.origin_visit_stats_get(url, "git") == visit_stats - assert swh_scheduler.origin_visit_stats_get(url, "svn") is None + assert swh_scheduler.origin_visit_stats_get([(url, "git")]) == { + (url, "git"): visit_stats + } + assert swh_scheduler.origin_visit_stats_get([(url, "svn")]) == {} uneventful_date = utcnow() visit_stats = OriginVisitStats( @@ -806,7 +807,7 @@ ) swh_scheduler.origin_visit_stats_upsert([visit_stats]) - uneventful_visit = swh_scheduler.origin_visit_stats_get(url, "git") + uneventful_visits = swh_scheduler.origin_visit_stats_get([(url, "git")]) expected_visit_stats = OriginVisitStats( url=url, @@ -817,7 +818,7 @@ last_notfound=None, ) - assert uneventful_visit == expected_visit_stats + assert uneventful_visits == {(url, "git"): expected_visit_stats} failed_date = utcnow() visit_stats = OriginVisitStats( @@ -830,7 +831,7 @@ ) swh_scheduler.origin_visit_stats_upsert([visit_stats]) - failed_visit = swh_scheduler.origin_visit_stats_get(url, "git") + failed_visits = swh_scheduler.origin_visit_stats_get([(url, "git")]) expected_visit_stats = OriginVisitStats( url=url, @@ -841,7 +842,7 @@ last_notfound=None, ) - assert failed_visit == expected_visit_stats + assert failed_visits == {(url, "git"): expected_visit_stats} def test_origin_visit_stats_upsert_with_snapshot(self, swh_scheduler) -> None: eventful_date = utcnow() @@ -858,8 +859,10 @@ ) swh_scheduler.origin_visit_stats_upsert([visit_stats]) - assert swh_scheduler.origin_visit_stats_get(url, "git") == visit_stats - assert swh_scheduler.origin_visit_stats_get(url, "svn") is None + assert swh_scheduler.origin_visit_stats_get([(url, "git")]) == { + (url, "git"): visit_stats + } + assert swh_scheduler.origin_visit_stats_get([(url, "svn")]) == {} def test_origin_visit_stats_upsert_messing_with_time(self, swh_scheduler) -> None: url = "interesting-origin" @@ -883,7 +886,8 @@ ) swh_scheduler.origin_visit_stats_upsert([visit_stats0]) - actual_visit_stats0 = swh_scheduler.origin_visit_stats_get(url, "git") + pk = (url, "git") + actual_visit_stats0 = swh_scheduler.origin_visit_stats_get([pk])[pk] assert actual_visit_stats0 == visit_stats0 visit_stats2 = OriginVisitStats( @@ -896,7 +900,7 @@ ) swh_scheduler.origin_visit_stats_upsert([visit_stats2]) - actual_visit_stats2 = swh_scheduler.origin_visit_stats_get(url, "git") + actual_visit_stats2 = swh_scheduler.origin_visit_stats_get([pk])[pk] assert actual_visit_stats2 == attr.evolve( actual_visit_stats0, last_uneventful=date1 ) @@ -915,7 +919,7 @@ ) swh_scheduler.origin_visit_stats_upsert([visit_stats1]) - actual_visit_stats1 = swh_scheduler.origin_visit_stats_get(url, "git") + actual_visit_stats1 = swh_scheduler.origin_visit_stats_get([pk])[pk] assert actual_visit_stats1 == attr.evolve( actual_visit_stats2, last_eventful=date2 @@ -946,13 +950,13 @@ swh_scheduler.origin_visit_stats_upsert(visit_stats) + visit_stats_dict = swh_scheduler.origin_visit_stats_get( + [(vs.url, vs.visit_type) for vs in visit_stats] + ) for visit_stat in visit_stats: - assert ( - swh_scheduler.origin_visit_stats_get( - visit_stat.url, visit_stat.visit_type - ) - is not None - ) + pk = (visit_stat.url, visit_stat.visit_type) + assert pk in visit_stats_dict + assert visit_stats_dict[pk] is not None def test_origin_visit_stats_upsert_cardinality_failing(self, swh_scheduler) -> None: """Batch upsert does not support altering multiple times the same origin-visit-status