diff --git a/swh/scheduler/backend.py b/swh/scheduler/backend.py --- a/swh/scheduler/backend.py +++ b/swh/scheduler/backend.py @@ -864,19 +864,21 @@ @db_transaction() def origin_visit_stats_get( - self, url: str, visit_type: str, db=None, cur=None - ) -> Optional[OriginVisitStats]: + self, ids: Iterable[Tuple[str, str]], db=None, cur=None + ) -> List[OriginVisitStats]: + if not ids: + return [] + primary_keys = tuple((origin, visit_type) for (origin, visit_type) in ids) query = format_query( - "SELECT {keys} FROM origin_visit_stats WHERE url=%s AND visit_type=%s", + """ + SELECT {keys} + FROM (VALUES %s) as stats(url, visit_type) + INNER JOIN origin_visit_stats USING (url, visit_type) + """, OriginVisitStats.select_columns(), ) - cur.execute(query, (url, visit_type)) - row = cur.fetchone() - - if row: - return OriginVisitStats(**row) - else: - return None + psycopg2.extras.execute_values(cur=cur, sql=query, argslist=primary_keys) + return [OriginVisitStats(**row) for row in cur.fetchall()] @db_transaction() def update_metrics( diff --git a/swh/scheduler/interface.py b/swh/scheduler/interface.py --- a/swh/scheduler/interface.py +++ b/swh/scheduler/interface.py @@ -3,8 +3,9 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information + import datetime -from typing import Any, Dict, Iterable, List, Optional +from typing import Any, Dict, Iterable, List, Optional, Tuple from uuid import UUID from typing_extensions import Protocol, runtime_checkable @@ -347,9 +348,14 @@ @remote_api_endpoint("visit_stats/get") def origin_visit_stats_get( - self, url: str, visit_type: str - ) -> Optional[OriginVisitStats]: - """Retrieve the stats for an origin with a given visit type""" + self, ids: Iterable[Tuple[str, str]] + ) -> List[OriginVisitStats]: + """Retrieve the stats for an origin with a given visit type + + If some visit_stats are not found, they are filtered out of the result. So the + output list may be of length inferior to the length of the input list. + + """ ... @remote_api_endpoint("scheduler_metrics/update") diff --git a/swh/scheduler/journal_client.py b/swh/scheduler/journal_client.py --- a/swh/scheduler/journal_client.py +++ b/swh/scheduler/journal_client.py @@ -65,9 +65,9 @@ } pk = origin, visit_type if pk not in origin_visit_stats: - visit_stats = scheduler.origin_visit_stats_get(origin, visit_type) + visit_stats = scheduler.origin_visit_stats_get([pk]) origin_visit_stats[pk] = ( - attr.asdict(visit_stats) if visit_stats else empty_object + attr.asdict(visit_stats[0]) if visit_stats else empty_object ) visit_stats_d = origin_visit_stats[pk] diff --git a/swh/scheduler/simulator/origins.py b/swh/scheduler/simulator/origins.py --- a/swh/scheduler/simulator/origins.py +++ b/swh/scheduler/simulator/origins.py @@ -94,7 +94,9 @@ """ # This is cheating; actual tasks access the state from the storage, not the # scheduler - stats = env.scheduler.origin_visit_stats_get(task.origin, task.visit_type) + pk = task.origin, task.visit_type + visit_stats = env.scheduler.origin_visit_stats_get([pk]) + stats: Optional[OriginVisitStats] = visit_stats[0] if len(visit_stats) > 0 else None last_snapshot = stats.last_snapshot if stats else None status = OriginVisitStatus( diff --git a/swh/scheduler/tests/test_cli_journal.py b/swh/scheduler/tests/test_cli_journal.py --- a/swh/scheduler/tests/test_cli_journal.py +++ b/swh/scheduler/tests/test_cli_journal.py @@ -108,7 +108,8 @@ assert result.output == expected_output actual_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] + [(visit_status["origin"], visit_status["type"])] ) - assert actual_visit_stats is not None + assert actual_visit_stats + assert len(actual_visit_stats) == 1 diff --git a/swh/scheduler/tests/test_journal_client.py b/swh/scheduler/tests/test_journal_client.py --- a/swh/scheduler/tests/test_journal_client.py +++ b/swh/scheduler/tests/test_journal_client.py @@ -90,11 +90,10 @@ ) # Ensure those visit status are ignored - for visit_status in visit_statuses: - actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] - ) - assert actual_origin_visit_stats is None + actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( + [(vs["origin"], vs["type"]) for vs in visit_statuses] + ) + assert actual_origin_visit_stats == [] def test_journal_client_origin_visit_status_from_journal_last_notfound(swh_scheduler): @@ -111,16 +110,18 @@ {"origin_visit_status": [visit_status]}, scheduler=swh_scheduler ) - actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get("foo", "git") - assert actual_origin_visit_stats == OriginVisitStats( - url="foo", - visit_type="git", - last_eventful=None, - last_uneventful=None, - last_failed=None, - last_notfound=visit_status["date"], - last_snapshot=None, - ) + actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([("foo", "git")]) + assert actual_origin_visit_stats == [ + OriginVisitStats( + url="foo", + visit_type="git", + last_eventful=None, + last_uneventful=None, + last_failed=None, + last_notfound=visit_status["date"], + last_snapshot=None, + ) + ] visit_statuses = [ { @@ -145,9 +146,8 @@ {"origin_visit_status": visit_statuses}, scheduler=swh_scheduler ) - actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get("foo", "git") - assert actual_origin_visit_stats is not None - assert actual_origin_visit_stats == OriginVisitStats( + actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([("foo", "git")]) + assert actual_origin_visit_stats == [OriginVisitStats( url="foo", visit_type="git", last_eventful=None, @@ -155,7 +155,7 @@ last_failed=None, last_notfound=DATE3, last_snapshot=None, - ) + )] def test_journal_client_origin_visit_status_from_journal_last_failed(swh_scheduler): @@ -198,9 +198,8 @@ {"origin_visit_status": visit_statuses}, scheduler=swh_scheduler ) - actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get("bar", "git") - assert actual_origin_visit_stats is not None - assert actual_origin_visit_stats == OriginVisitStats( + actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([("bar", "git")]) + assert actual_origin_visit_stats == [OriginVisitStats( url="bar", visit_type="git", last_eventful=None, @@ -208,7 +207,7 @@ last_failed=DATE3, last_notfound=None, last_snapshot=None, - ) + )] def test_journal_client_origin_visit_status_from_journal_last_eventful(swh_scheduler): @@ -251,9 +250,8 @@ {"origin_visit_status": visit_statuses}, scheduler=swh_scheduler ) - actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get("foo", "git") - assert actual_origin_visit_stats is not None - assert actual_origin_visit_stats == OriginVisitStats( + actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([("foo", "git")]) + assert actual_origin_visit_stats == [OriginVisitStats( url="foo", visit_type="git", last_eventful=DATE3, @@ -261,7 +259,7 @@ last_failed=None, last_notfound=None, last_snapshot=hash_to_bytes("dddcc0710eb6cf9efd5b920a8453e1e07157bddd"), - ) + )] def test_journal_client_origin_visit_status_from_journal_last_uneventful(swh_scheduler): @@ -294,18 +292,19 @@ ) actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] - ) - assert actual_origin_visit_stats is not None - assert actual_origin_visit_stats == OriginVisitStats( - url=visit_status["origin"], - visit_type=visit_status["type"], - last_eventful=DATE1, - last_uneventful=visit_status["date"], # most recent date but uneventful - last_failed=DATE2, - last_notfound=DATE1, - last_snapshot=visit_status["snapshot"], + [(visit_status["origin"], visit_status["type"])] ) + assert actual_origin_visit_stats == [ + OriginVisitStats( + url=visit_status["origin"], + visit_type=visit_status["type"], + last_eventful=DATE1, + last_uneventful=visit_status["date"], # most recent date but uneventful + last_failed=DATE2, + last_notfound=DATE1, + last_snapshot=visit_status["snapshot"], + ) + ] VISIT_STATUSES = [ @@ -366,7 +365,9 @@ last_snapshot=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"), ) - assert swh_scheduler.origin_visit_stats_get("foo", "git") == expected_visit_stats + assert swh_scheduler.origin_visit_stats_get([("foo", "git")]) == [ + expected_visit_stats + ] VISIT_STATUSES_1 = [ @@ -427,9 +428,9 @@ last_snapshot=hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"), ) - assert ( - swh_scheduler.origin_visit_stats_get("cavabarder", "hg") == expected_visit_stats - ) + assert swh_scheduler.origin_visit_stats_get([("cavabarder", "hg")]) == [ + expected_visit_stats + ] VISIT_STATUSES_2 = [ @@ -516,21 +517,21 @@ ) after = utcnow() - assert swh_scheduler.origin_visit_stats_get("cavabarder", "hg") is None - assert swh_scheduler.origin_visit_stats_get("cavabarder", "git") is not None + assert swh_scheduler.origin_visit_stats_get([("cavabarder", "hg")]) == [] + assert swh_scheduler.origin_visit_stats_get([("cavabarder", "git")])[0] is not None process_journal_objects( {"origin_visit_status": VISIT_STATUSES_2}, scheduler=swh_scheduler ) for url in ("cavabarder", "iciaussi"): - ovs = swh_scheduler.origin_visit_stats_get(url, "git") + ovs = swh_scheduler.origin_visit_stats_get([(url, "git")])[0] assert before <= ovs.last_scheduled <= after - ovs = swh_scheduler.origin_visit_stats_get(url, "hg") + ovs = swh_scheduler.origin_visit_stats_get([(url, "hg")])[0] assert ovs.last_scheduled is None - ovs = swh_scheduler.origin_visit_stats_get("cavabarder", "git") + ovs = swh_scheduler.origin_visit_stats_get([("cavabarder", "git")])[0] assert ovs.last_eventful == DATE1 + 5 * ONE_DAY assert ovs.last_uneventful is None assert ovs.last_failed is None diff --git a/swh/scheduler/tests/test_scheduler.py b/swh/scheduler/tests/test_scheduler.py --- a/swh/scheduler/tests/test_scheduler.py +++ b/swh/scheduler/tests/test_scheduler.py @@ -772,9 +772,8 @@ assert len(ret) == NUM_RESULTS for origin in ret: - visit_stats = swh_scheduler.origin_visit_stats_get( - origin.url, origin.visit_type - ) + pk = (origin.url, origin.visit_type) + visit_stats = swh_scheduler.origin_visit_stats_get([pk])[0] assert visit_stats is not None assert before <= visit_stats.last_scheduled <= after @@ -805,6 +804,9 @@ for tt in TASK_TYPES.values(): scheduler.create_task_type(tt) + def test_origin_visit_stats_get_empty(self, swh_scheduler) -> None: + assert swh_scheduler.origin_visit_stats_get([]) == [] + def test_origin_visit_stats_upsert(self, swh_scheduler) -> None: eventful_date = utcnow() url = "https://github.com/test" @@ -820,8 +822,8 @@ swh_scheduler.origin_visit_stats_upsert([visit_stats]) swh_scheduler.origin_visit_stats_upsert([visit_stats]) - assert swh_scheduler.origin_visit_stats_get(url, "git") == visit_stats - assert swh_scheduler.origin_visit_stats_get(url, "svn") is None + assert swh_scheduler.origin_visit_stats_get([(url, "git")]) == [visit_stats] + assert swh_scheduler.origin_visit_stats_get([(url, "svn")]) == [] uneventful_date = utcnow() visit_stats = OriginVisitStats( @@ -834,7 +836,7 @@ ) swh_scheduler.origin_visit_stats_upsert([visit_stats]) - uneventful_visit = swh_scheduler.origin_visit_stats_get(url, "git") + uneventful_visits = swh_scheduler.origin_visit_stats_get([(url, "git")]) expected_visit_stats = OriginVisitStats( url=url, @@ -845,7 +847,7 @@ last_notfound=None, ) - assert uneventful_visit == expected_visit_stats + assert uneventful_visits == [expected_visit_stats] failed_date = utcnow() visit_stats = OriginVisitStats( @@ -858,7 +860,7 @@ ) swh_scheduler.origin_visit_stats_upsert([visit_stats]) - failed_visit = swh_scheduler.origin_visit_stats_get(url, "git") + failed_visits = swh_scheduler.origin_visit_stats_get([(url, "git")]) expected_visit_stats = OriginVisitStats( url=url, @@ -869,7 +871,7 @@ last_notfound=None, ) - assert failed_visit == expected_visit_stats + assert failed_visits == [expected_visit_stats] def test_origin_visit_stats_upsert_with_snapshot(self, swh_scheduler) -> None: eventful_date = utcnow() @@ -886,8 +888,8 @@ ) swh_scheduler.origin_visit_stats_upsert([visit_stats]) - assert swh_scheduler.origin_visit_stats_get(url, "git") == visit_stats - assert swh_scheduler.origin_visit_stats_get(url, "svn") is None + assert swh_scheduler.origin_visit_stats_get([(url, "git")]) == [visit_stats] + assert swh_scheduler.origin_visit_stats_get([(url, "svn")]) == [] def test_origin_visit_stats_upsert_messing_with_time(self, swh_scheduler) -> None: url = "interesting-origin" @@ -911,7 +913,7 @@ ) swh_scheduler.origin_visit_stats_upsert([visit_stats0]) - actual_visit_stats0 = swh_scheduler.origin_visit_stats_get(url, "git") + actual_visit_stats0 = swh_scheduler.origin_visit_stats_get([(url, "git")])[0] assert actual_visit_stats0 == visit_stats0 visit_stats2 = OriginVisitStats( @@ -924,7 +926,7 @@ ) swh_scheduler.origin_visit_stats_upsert([visit_stats2]) - actual_visit_stats2 = swh_scheduler.origin_visit_stats_get(url, "git") + actual_visit_stats2 = swh_scheduler.origin_visit_stats_get([(url, "git")])[0] assert actual_visit_stats2 == attr.evolve( actual_visit_stats0, last_uneventful=date1 ) @@ -943,7 +945,7 @@ ) swh_scheduler.origin_visit_stats_upsert([visit_stats1]) - actual_visit_stats1 = swh_scheduler.origin_visit_stats_get(url, "git") + actual_visit_stats1 = swh_scheduler.origin_visit_stats_get([(url, "git")])[0] assert actual_visit_stats1 == attr.evolve( actual_visit_stats2, last_eventful=date2 @@ -974,13 +976,10 @@ swh_scheduler.origin_visit_stats_upsert(visit_stats) - for visit_stat in visit_stats: - assert ( - swh_scheduler.origin_visit_stats_get( - visit_stat.url, visit_stat.visit_type - ) - is not None - ) + for visit_stat in swh_scheduler.origin_visit_stats_get( + [(vs.url, vs.visit_type) for vs in visit_stats] + ): + assert visit_stat is not None def test_origin_visit_stats_upsert_cardinality_failing(self, swh_scheduler) -> None: """Batch upsert does not support altering multiple times the same origin-visit-status