diff --git a/swh/scheduler/backend.py b/swh/scheduler/backend.py --- a/swh/scheduler/backend.py +++ b/swh/scheduler/backend.py @@ -864,19 +864,21 @@ @db_transaction() def origin_visit_stats_get( - self, url: str, visit_type: str, db=None, cur=None - ) -> Optional[OriginVisitStats]: + self, ids: Iterable[Tuple[str, str]], db=None, cur=None + ) -> List[OriginVisitStats]: + if not ids: + return [] + primary_keys = tuple((origin, visit_type) for (origin, visit_type) in ids) query = format_query( - "SELECT {keys} FROM origin_visit_stats WHERE url=%s AND visit_type=%s", + """ + SELECT {keys} + FROM (VALUES %s) as stats(url, visit_type) + INNER JOIN origin_visit_stats USING (url, visit_type) + """, OriginVisitStats.select_columns(), ) - cur.execute(query, (url, visit_type)) - row = cur.fetchone() - - if row: - return OriginVisitStats(**row) - else: - return None + psycopg2.extras.execute_values(cur=cur, sql=query, argslist=primary_keys) + return [OriginVisitStats(**row) for row in cur.fetchall()] @db_transaction() def update_metrics( diff --git a/swh/scheduler/interface.py b/swh/scheduler/interface.py --- a/swh/scheduler/interface.py +++ b/swh/scheduler/interface.py @@ -3,8 +3,9 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information + import datetime -from typing import Any, Dict, Iterable, List, Optional +from typing import Any, Dict, Iterable, List, Optional, Tuple from uuid import UUID from typing_extensions import Protocol, runtime_checkable @@ -347,9 +348,14 @@ @remote_api_endpoint("visit_stats/get") def origin_visit_stats_get( - self, url: str, visit_type: str - ) -> Optional[OriginVisitStats]: - """Retrieve the stats for an origin with a given visit type""" + self, ids: Iterable[Tuple[str, str]] + ) -> List[OriginVisitStats]: + """Retrieve the stats for an origin with a given visit type + + If some visit_stats are not found, they are filtered out of the result. So the + output list may be of length inferior to the length of the input list. + + """ ... @remote_api_endpoint("scheduler_metrics/update") diff --git a/swh/scheduler/journal_client.py b/swh/scheduler/journal_client.py --- a/swh/scheduler/journal_client.py +++ b/swh/scheduler/journal_client.py @@ -65,9 +65,9 @@ } pk = origin, visit_type if pk not in origin_visit_stats: - visit_stats = scheduler.origin_visit_stats_get(origin, visit_type) + visit_stats = scheduler.origin_visit_stats_get([pk]) origin_visit_stats[pk] = ( - attr.asdict(visit_stats) if visit_stats else empty_object + attr.asdict(visit_stats[0]) if visit_stats else empty_object ) visit_stats_d = origin_visit_stats[pk] diff --git a/swh/scheduler/simulator/origins.py b/swh/scheduler/simulator/origins.py --- a/swh/scheduler/simulator/origins.py +++ b/swh/scheduler/simulator/origins.py @@ -94,7 +94,9 @@ """ # This is cheating; actual tasks access the state from the storage, not the # scheduler - stats = env.scheduler.origin_visit_stats_get(task.origin, task.visit_type) + pk = task.origin, task.visit_type + visit_stats = env.scheduler.origin_visit_stats_get([pk]) + stats: Optional[OriginVisitStats] = visit_stats[0] if len(visit_stats) > 0 else None last_snapshot = stats.last_snapshot if stats else None status = OriginVisitStatus( diff --git a/swh/scheduler/tests/test_cli_journal.py b/swh/scheduler/tests/test_cli_journal.py --- a/swh/scheduler/tests/test_cli_journal.py +++ b/swh/scheduler/tests/test_cli_journal.py @@ -109,7 +109,8 @@ assert result.output == expected_output actual_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] + [(visit_status["origin"], visit_status["type"])] ) - assert actual_visit_stats is not None + assert actual_visit_stats + assert len(actual_visit_stats) == 1 diff --git a/swh/scheduler/tests/test_journal_client.py b/swh/scheduler/tests/test_journal_client.py --- a/swh/scheduler/tests/test_journal_client.py +++ b/swh/scheduler/tests/test_journal_client.py @@ -90,11 +90,10 @@ ) # Ensure those visit status are ignored - for visit_status in visit_statuses: - actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] - ) - assert actual_origin_visit_stats is None + actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( + [(vs["origin"], vs["type"]) for vs in visit_statuses] + ) + assert actual_origin_visit_stats == [] def test_journal_client_origin_visit_status_from_journal_last_notfound(swh_scheduler): @@ -112,17 +111,19 @@ ) actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] - ) - assert actual_origin_visit_stats == OriginVisitStats( - url=visit_status["origin"], - visit_type=visit_status["type"], - last_eventful=None, - last_uneventful=None, - last_failed=None, - last_notfound=visit_status["date"], - last_snapshot=None, + [(visit_status["origin"], visit_status["type"])] ) + assert actual_origin_visit_stats == [ + OriginVisitStats( + url=visit_status["origin"], + visit_type=visit_status["type"], + last_eventful=None, + last_uneventful=None, + last_failed=None, + last_notfound=visit_status["date"], + last_snapshot=None, + ) + ] visit_statuses = [ { @@ -149,18 +150,19 @@ for visit_status in visit_statuses: actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] - ) - assert actual_origin_visit_stats is not None - assert actual_origin_visit_stats == OriginVisitStats( - url=visit_status["origin"], - visit_type=visit_status["type"], - last_eventful=None, - last_uneventful=None, - last_failed=None, - last_notfound=DATE3, - last_snapshot=None, + [(visit_status["origin"], visit_status["type"])] ) + assert actual_origin_visit_stats == [ + OriginVisitStats( + url=visit_status["origin"], + visit_type=visit_status["type"], + last_eventful=None, + last_uneventful=None, + last_failed=None, + last_notfound=DATE3, + last_snapshot=None, + ) + ] def test_journal_client_origin_visit_status_from_journal_last_failed(swh_scheduler): @@ -190,18 +192,19 @@ # Ensure those visit status are ignored for visit_status in visit_statuses: actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] - ) - assert actual_origin_visit_stats is not None - assert actual_origin_visit_stats == OriginVisitStats( - url=visit_status["origin"], - visit_type=visit_status["type"], - last_eventful=None, - last_uneventful=None, - last_failed=visit_status["date"], - last_notfound=None, - last_snapshot=None, + [(visit_status["origin"], visit_status["type"])] ) + assert actual_origin_visit_stats == [ + OriginVisitStats( + url=visit_status["origin"], + visit_type=visit_status["type"], + last_eventful=None, + last_uneventful=None, + last_failed=visit_status["date"], + last_notfound=None, + last_snapshot=None, + ) + ] visit_statuses = [ { @@ -228,18 +231,19 @@ for visit_status in visit_statuses: actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] - ) - assert actual_origin_visit_stats is not None - assert actual_origin_visit_stats == OriginVisitStats( - url=visit_status["origin"], - visit_type=visit_status["type"], - last_eventful=None, - last_uneventful=None, - last_failed=DATE3, - last_notfound=None, - last_snapshot=None, + [(visit_status["origin"], visit_status["type"])] ) + assert actual_origin_visit_stats == [ + OriginVisitStats( + url=visit_status["origin"], + visit_type=visit_status["type"], + last_eventful=None, + last_uneventful=None, + last_failed=DATE3, + last_notfound=None, + last_snapshot=None, + ) + ] def test_journal_client_origin_visit_status_from_journal_last_eventful(swh_scheduler): @@ -268,18 +272,19 @@ for visit_status in visit_statuses: actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] - ) - assert actual_origin_visit_stats is not None - assert actual_origin_visit_stats == OriginVisitStats( - url=visit_status["origin"], - visit_type=visit_status["type"], - last_eventful=visit_status["date"], - last_uneventful=None, - last_failed=None, - last_notfound=None, - last_snapshot=visit_status["snapshot"], + [(visit_status["origin"], visit_status["type"])] ) + assert actual_origin_visit_stats == [ + OriginVisitStats( + url=visit_status["origin"], + visit_type=visit_status["type"], + last_eventful=visit_status["date"], + last_uneventful=None, + last_failed=None, + last_notfound=None, + last_snapshot=visit_status["snapshot"], + ) + ] visit_statuses = [ { @@ -306,18 +311,19 @@ for visit_status in visit_statuses: actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] - ) - assert actual_origin_visit_stats is not None - assert actual_origin_visit_stats == OriginVisitStats( - url=visit_status["origin"], - visit_type=visit_status["type"], - last_eventful=DATE3, - last_uneventful=None, - last_failed=None, - last_notfound=None, - last_snapshot=hash_to_bytes("dddcc0710eb6cf9efd5b920a8453e1e07157bddd"), + [(visit_status["origin"], visit_status["type"])] ) + assert actual_origin_visit_stats == [ + OriginVisitStats( + url=visit_status["origin"], + visit_type=visit_status["type"], + last_eventful=DATE3, + last_uneventful=None, + last_failed=None, + last_notfound=None, + last_snapshot=hash_to_bytes("dddcc0710eb6cf9efd5b920a8453e1e07157bddd"), + ) + ] def test_journal_client_origin_visit_status_from_journal_last_uneventful(swh_scheduler): @@ -350,18 +356,19 @@ ) actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] - ) - assert actual_origin_visit_stats is not None - assert actual_origin_visit_stats == OriginVisitStats( - url=visit_status["origin"], - visit_type=visit_status["type"], - last_eventful=DATE1, - last_uneventful=visit_status["date"], # most recent date but uneventful - last_failed=DATE2, - last_notfound=DATE1, - last_snapshot=visit_status["snapshot"], + [(visit_status["origin"], visit_status["type"])] ) + assert actual_origin_visit_stats == [ + OriginVisitStats( + url=visit_status["origin"], + visit_type=visit_status["type"], + last_eventful=DATE1, + last_uneventful=visit_status["date"], # most recent date but uneventful + last_failed=DATE2, + last_notfound=DATE1, + last_snapshot=visit_status["snapshot"], + ) + ] VISIT_STATUSES = [ @@ -422,7 +429,9 @@ last_snapshot=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"), ) - assert swh_scheduler.origin_visit_stats_get("foo", "git") == expected_visit_stats + assert swh_scheduler.origin_visit_stats_get([("foo", "git")]) == [ + expected_visit_stats + ] VISIT_STATUSES1 = [ @@ -483,9 +492,9 @@ last_snapshot=hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"), ) - assert ( - swh_scheduler.origin_visit_stats_get("cavabarder", "hg") == expected_visit_stats - ) + assert swh_scheduler.origin_visit_stats_get([("cavabarder", "hg")]) == [ + expected_visit_stats + ] VISIT_STATUSES_2 = [ @@ -572,21 +581,21 @@ ) after = utcnow() - assert swh_scheduler.origin_visit_stats_get("cavabarder", "hg") is None - assert swh_scheduler.origin_visit_stats_get("cavabarder", "git") is not None + assert swh_scheduler.origin_visit_stats_get([("cavabarder", "hg")]) == [] + assert swh_scheduler.origin_visit_stats_get([("cavabarder", "git")])[0] is not None process_journal_objects( {"origin_visit_status": VISIT_STATUSES_2}, scheduler=swh_scheduler ) for url in ("cavabarder", "iciaussi"): - ovs = swh_scheduler.origin_visit_stats_get(url, "git") + ovs = swh_scheduler.origin_visit_stats_get([(url, "git")])[0] assert before <= ovs.last_scheduled <= after - ovs = swh_scheduler.origin_visit_stats_get(url, "hg") + ovs = swh_scheduler.origin_visit_stats_get([(url, "hg")])[0] assert ovs.last_scheduled is None - ovs = swh_scheduler.origin_visit_stats_get("cavabarder", "git") + ovs = swh_scheduler.origin_visit_stats_get([("cavabarder", "git")])[0] assert ovs.last_eventful == DATE1 + 5 * ONE_DAY assert ovs.last_uneventful is None assert ovs.last_failed is None diff --git a/swh/scheduler/tests/test_scheduler.py b/swh/scheduler/tests/test_scheduler.py --- a/swh/scheduler/tests/test_scheduler.py +++ b/swh/scheduler/tests/test_scheduler.py @@ -772,9 +772,8 @@ assert len(ret) == NUM_RESULTS for origin in ret: - visit_stats = swh_scheduler.origin_visit_stats_get( - origin.url, origin.visit_type - ) + pk = (origin.url, origin.visit_type) + visit_stats = swh_scheduler.origin_visit_stats_get([pk])[0] assert visit_stats is not None assert before <= visit_stats.last_scheduled <= after @@ -805,6 +804,9 @@ for tt in TASK_TYPES.values(): scheduler.create_task_type(tt) + def test_origin_visit_stats_get_empty(self, swh_scheduler) -> None: + assert swh_scheduler.origin_visit_stats_get([]) == [] + def test_origin_visit_stats_upsert(self, swh_scheduler) -> None: eventful_date = utcnow() url = "https://github.com/test" @@ -820,8 +822,8 @@ swh_scheduler.origin_visit_stats_upsert([visit_stats]) swh_scheduler.origin_visit_stats_upsert([visit_stats]) - assert swh_scheduler.origin_visit_stats_get(url, "git") == visit_stats - assert swh_scheduler.origin_visit_stats_get(url, "svn") is None + assert swh_scheduler.origin_visit_stats_get([(url, "git")]) == [visit_stats] + assert swh_scheduler.origin_visit_stats_get([(url, "svn")]) == [] uneventful_date = utcnow() visit_stats = OriginVisitStats( @@ -834,7 +836,7 @@ ) swh_scheduler.origin_visit_stats_upsert([visit_stats]) - uneventful_visit = swh_scheduler.origin_visit_stats_get(url, "git") + uneventful_visits = swh_scheduler.origin_visit_stats_get([(url, "git")]) expected_visit_stats = OriginVisitStats( url=url, @@ -845,7 +847,7 @@ last_notfound=None, ) - assert uneventful_visit == expected_visit_stats + assert uneventful_visits == [expected_visit_stats] failed_date = utcnow() visit_stats = OriginVisitStats( @@ -858,7 +860,7 @@ ) swh_scheduler.origin_visit_stats_upsert([visit_stats]) - failed_visit = swh_scheduler.origin_visit_stats_get(url, "git") + failed_visits = swh_scheduler.origin_visit_stats_get([(url, "git")]) expected_visit_stats = OriginVisitStats( url=url, @@ -869,7 +871,7 @@ last_notfound=None, ) - assert failed_visit == expected_visit_stats + assert failed_visits == [expected_visit_stats] def test_origin_visit_stats_upsert_with_snapshot(self, swh_scheduler) -> None: eventful_date = utcnow() @@ -886,8 +888,8 @@ ) swh_scheduler.origin_visit_stats_upsert([visit_stats]) - assert swh_scheduler.origin_visit_stats_get(url, "git") == visit_stats - assert swh_scheduler.origin_visit_stats_get(url, "svn") is None + assert swh_scheduler.origin_visit_stats_get([(url, "git")]) == [visit_stats] + assert swh_scheduler.origin_visit_stats_get([(url, "svn")]) == [] def test_origin_visit_stats_upsert_messing_with_time(self, swh_scheduler) -> None: url = "interesting-origin" @@ -911,7 +913,7 @@ ) swh_scheduler.origin_visit_stats_upsert([visit_stats0]) - actual_visit_stats0 = swh_scheduler.origin_visit_stats_get(url, "git") + actual_visit_stats0 = swh_scheduler.origin_visit_stats_get([(url, "git")])[0] assert actual_visit_stats0 == visit_stats0 visit_stats2 = OriginVisitStats( @@ -924,7 +926,7 @@ ) swh_scheduler.origin_visit_stats_upsert([visit_stats2]) - actual_visit_stats2 = swh_scheduler.origin_visit_stats_get(url, "git") + actual_visit_stats2 = swh_scheduler.origin_visit_stats_get([(url, "git")])[0] assert actual_visit_stats2 == attr.evolve( actual_visit_stats0, last_uneventful=date1 ) @@ -943,7 +945,7 @@ ) swh_scheduler.origin_visit_stats_upsert([visit_stats1]) - actual_visit_stats1 = swh_scheduler.origin_visit_stats_get(url, "git") + actual_visit_stats1 = swh_scheduler.origin_visit_stats_get([(url, "git")])[0] assert actual_visit_stats1 == attr.evolve( actual_visit_stats2, last_eventful=date2 @@ -974,13 +976,10 @@ swh_scheduler.origin_visit_stats_upsert(visit_stats) - for visit_stat in visit_stats: - assert ( - swh_scheduler.origin_visit_stats_get( - visit_stat.url, visit_stat.visit_type - ) - is not None - ) + for visit_stat in swh_scheduler.origin_visit_stats_get( + [(vs.url, vs.visit_type) for vs in visit_stats] + ): + assert visit_stat is not None def test_origin_visit_stats_upsert_cardinality_failing(self, swh_scheduler) -> None: """Batch upsert does not support altering multiple times the same origin-visit-status