diff --git a/swh/scheduler/backend.py b/swh/scheduler/backend.py --- a/swh/scheduler/backend.py +++ b/swh/scheduler/backend.py @@ -837,16 +837,19 @@ @db_transaction() def origin_visit_stats_get( - self, url: str, visit_type: str, db=None, cur=None - ) -> Optional[OriginVisitStats]: + self, ids: Iterable[Tuple[str, str]], db=None, cur=None + ) -> List[OriginVisitStats]: + if not ids: + return [] + primary_keys = tuple((origin, visit_type) for (origin, visit_type) in ids) query = format_query( - "SELECT {keys} FROM origin_visit_stats WHERE url=%s AND visit_type=%s", + """ + SELECT {keys} + FROM (VALUES %s) as stats(url, visit_type) + INNER JOIN origin_visit_stats USING (url, visit_type) + """, OriginVisitStats.select_columns(), ) - cur.execute(query, (url, visit_type)) - row = cur.fetchone() - if row: - return OriginVisitStats(**row) - else: - return None + psycopg2.extras.execute_values(cur=cur, sql=query, argslist=primary_keys) + return [OriginVisitStats(**row) for row in cur.fetchall()] diff --git a/swh/scheduler/interface.py b/swh/scheduler/interface.py --- a/swh/scheduler/interface.py +++ b/swh/scheduler/interface.py @@ -3,7 +3,7 @@ # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from typing import Any, Dict, Iterable, List, Optional +from typing import Any, Dict, Iterable, List, Optional, Tuple from uuid import UUID from typing_extensions import Protocol, runtime_checkable @@ -336,7 +336,12 @@ @remote_api_endpoint("visit_stats/get") def origin_visit_stats_get( - self, url: str, visit_type: str - ) -> Optional[OriginVisitStats]: - """Retrieve the stats for an origin with a given visit type""" + self, ids: Iterable[Tuple[str, str]] + ) -> List[OriginVisitStats]: + """Retrieve the stats for an origin with a given visit type + + If some visit_stats are not found, they are filtered out of the result. So the + output list may be of length inferior to the length of the input list. + + """ ... diff --git a/swh/scheduler/journal_client.py b/swh/scheduler/journal_client.py --- a/swh/scheduler/journal_client.py +++ b/swh/scheduler/journal_client.py @@ -60,9 +60,9 @@ } pk = origin, visit_type if pk not in origin_visit_stats: - visit_stats = scheduler.origin_visit_stats_get(origin, visit_type) + visit_stats = scheduler.origin_visit_stats_get([pk]) origin_visit_stats[pk] = ( - attr.asdict(visit_stats) if visit_stats else empty_object + attr.asdict(visit_stats[0]) if visit_stats else empty_object ) visit_stats_d = origin_visit_stats[pk] diff --git a/swh/scheduler/tests/test_cli_journal.py b/swh/scheduler/tests/test_cli_journal.py --- a/swh/scheduler/tests/test_cli_journal.py +++ b/swh/scheduler/tests/test_cli_journal.py @@ -108,7 +108,8 @@ assert result.output == expected_output actual_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] + [(visit_status["origin"], visit_status["type"])] ) - assert actual_visit_stats is not None + assert actual_visit_stats + assert len(actual_visit_stats) == 1 diff --git a/swh/scheduler/tests/test_journal_client.py b/swh/scheduler/tests/test_journal_client.py --- a/swh/scheduler/tests/test_journal_client.py +++ b/swh/scheduler/tests/test_journal_client.py @@ -81,11 +81,10 @@ ) # Ensure those visit status are ignored - for visit_status in visit_statuses: - actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] - ) - assert actual_origin_visit_stats is None + actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( + [(vs["origin"], vs["type"]) for vs in visit_statuses] + ) + assert actual_origin_visit_stats == [] def test_journal_client_origin_visit_status_from_journal_last_notfound(swh_scheduler): @@ -103,17 +102,19 @@ ) actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] - ) - assert actual_origin_visit_stats == OriginVisitStats( - url=visit_status["origin"], - visit_type=visit_status["type"], - last_eventful=None, - last_uneventful=None, - last_failed=None, - last_notfound=visit_status["date"], - last_snapshot=None, + [(visit_status["origin"], visit_status["type"])] ) + assert actual_origin_visit_stats == [ + OriginVisitStats( + url=visit_status["origin"], + visit_type=visit_status["type"], + last_eventful=None, + last_uneventful=None, + last_failed=None, + last_notfound=visit_status["date"], + last_snapshot=None, + ) + ] visit_statuses = [ { @@ -140,18 +141,19 @@ for visit_status in visit_statuses: actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] - ) - assert actual_origin_visit_stats is not None - assert actual_origin_visit_stats == OriginVisitStats( - url=visit_status["origin"], - visit_type=visit_status["type"], - last_eventful=None, - last_uneventful=None, - last_failed=None, - last_notfound=DATE3, - last_snapshot=None, + [(visit_status["origin"], visit_status["type"])] ) + assert actual_origin_visit_stats == [ + OriginVisitStats( + url=visit_status["origin"], + visit_type=visit_status["type"], + last_eventful=None, + last_uneventful=None, + last_failed=None, + last_notfound=DATE3, + last_snapshot=None, + ) + ] def test_journal_client_origin_visit_status_from_journal_last_failed(swh_scheduler): @@ -181,18 +183,19 @@ # Ensure those visit status are ignored for visit_status in visit_statuses: actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] - ) - assert actual_origin_visit_stats is not None - assert actual_origin_visit_stats == OriginVisitStats( - url=visit_status["origin"], - visit_type=visit_status["type"], - last_eventful=None, - last_uneventful=None, - last_failed=visit_status["date"], - last_notfound=None, - last_snapshot=None, + [(visit_status["origin"], visit_status["type"])] ) + assert actual_origin_visit_stats == [ + OriginVisitStats( + url=visit_status["origin"], + visit_type=visit_status["type"], + last_eventful=None, + last_uneventful=None, + last_failed=visit_status["date"], + last_notfound=None, + last_snapshot=None, + ) + ] visit_statuses = [ { @@ -219,18 +222,19 @@ for visit_status in visit_statuses: actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] - ) - assert actual_origin_visit_stats is not None - assert actual_origin_visit_stats == OriginVisitStats( - url=visit_status["origin"], - visit_type=visit_status["type"], - last_eventful=None, - last_uneventful=None, - last_failed=DATE3, - last_notfound=None, - last_snapshot=None, + [(visit_status["origin"], visit_status["type"])] ) + assert actual_origin_visit_stats == [ + OriginVisitStats( + url=visit_status["origin"], + visit_type=visit_status["type"], + last_eventful=None, + last_uneventful=None, + last_failed=DATE3, + last_notfound=None, + last_snapshot=None, + ) + ] def test_journal_client_origin_visit_status_from_journal_last_eventful(swh_scheduler): @@ -259,18 +263,19 @@ for visit_status in visit_statuses: actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] - ) - assert actual_origin_visit_stats is not None - assert actual_origin_visit_stats == OriginVisitStats( - url=visit_status["origin"], - visit_type=visit_status["type"], - last_eventful=visit_status["date"], - last_uneventful=None, - last_failed=None, - last_notfound=None, - last_snapshot=visit_status["snapshot"], + [(visit_status["origin"], visit_status["type"])] ) + assert actual_origin_visit_stats == [ + OriginVisitStats( + url=visit_status["origin"], + visit_type=visit_status["type"], + last_eventful=visit_status["date"], + last_uneventful=None, + last_failed=None, + last_notfound=None, + last_snapshot=visit_status["snapshot"], + ) + ] visit_statuses = [ { @@ -297,18 +302,19 @@ for visit_status in visit_statuses: actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] - ) - assert actual_origin_visit_stats is not None - assert actual_origin_visit_stats == OriginVisitStats( - url=visit_status["origin"], - visit_type=visit_status["type"], - last_eventful=DATE3, - last_uneventful=None, - last_failed=None, - last_notfound=None, - last_snapshot=hash_to_bytes("dddcc0710eb6cf9efd5b920a8453e1e07157bddd"), + [(visit_status["origin"], visit_status["type"])] ) + assert actual_origin_visit_stats == [ + OriginVisitStats( + url=visit_status["origin"], + visit_type=visit_status["type"], + last_eventful=DATE3, + last_uneventful=None, + last_failed=None, + last_notfound=None, + last_snapshot=hash_to_bytes("dddcc0710eb6cf9efd5b920a8453e1e07157bddd"), + ) + ] def test_journal_client_origin_visit_status_from_journal_last_uneventful(swh_scheduler): @@ -341,18 +347,19 @@ ) actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get( - visit_status["origin"], visit_status["type"] - ) - assert actual_origin_visit_stats is not None - assert actual_origin_visit_stats == OriginVisitStats( - url=visit_status["origin"], - visit_type=visit_status["type"], - last_eventful=DATE1, - last_uneventful=visit_status["date"], # most recent date but uneventful - last_failed=DATE2, - last_notfound=DATE1, - last_snapshot=visit_status["snapshot"], + [(visit_status["origin"], visit_status["type"])] ) + assert actual_origin_visit_stats == [ + OriginVisitStats( + url=visit_status["origin"], + visit_type=visit_status["type"], + last_eventful=DATE1, + last_uneventful=visit_status["date"], # most recent date but uneventful + last_failed=DATE2, + last_notfound=DATE1, + last_snapshot=visit_status["snapshot"], + ) + ] VISIT_STATUSES = [ @@ -413,7 +420,9 @@ last_snapshot=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"), ) - assert swh_scheduler.origin_visit_stats_get("foo", "git") == expected_visit_stats + assert swh_scheduler.origin_visit_stats_get([("foo", "git")]) == [ + expected_visit_stats + ] VISIT_STATUSES1 = [ @@ -474,9 +483,9 @@ last_snapshot=hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"), ) - assert ( - swh_scheduler.origin_visit_stats_get("cavabarder", "hg") == expected_visit_stats - ) + assert swh_scheduler.origin_visit_stats_get([("cavabarder", "hg")]) == [ + expected_visit_stats + ] VISIT_STATUSES_2 = [ @@ -563,21 +572,21 @@ ) after = utcnow() - assert swh_scheduler.origin_visit_stats_get("cavabarder", "hg") is None - assert swh_scheduler.origin_visit_stats_get("cavabarder", "git") is not None + assert swh_scheduler.origin_visit_stats_get([("cavabarder", "hg")]) == [] + assert swh_scheduler.origin_visit_stats_get([("cavabarder", "git")])[0] is not None process_journal_objects( {"origin_visit_status": VISIT_STATUSES_2}, scheduler=swh_scheduler ) for url in ("cavabarder", "iciaussi"): - ovs = swh_scheduler.origin_visit_stats_get(url, "git") + ovs = swh_scheduler.origin_visit_stats_get([(url, "git")])[0] assert before <= ovs.last_scheduled <= after - ovs = swh_scheduler.origin_visit_stats_get(url, "hg") + ovs = swh_scheduler.origin_visit_stats_get([(url, "hg")])[0] assert ovs.last_scheduled is None - ovs = swh_scheduler.origin_visit_stats_get("cavabarder", "git") + ovs = swh_scheduler.origin_visit_stats_get([("cavabarder", "git")])[0] assert ovs.last_eventful == DATE1 + 5 * ONE_DAY assert ovs.last_uneventful is None assert ovs.last_failed is None diff --git a/swh/scheduler/tests/test_scheduler.py b/swh/scheduler/tests/test_scheduler.py --- a/swh/scheduler/tests/test_scheduler.py +++ b/swh/scheduler/tests/test_scheduler.py @@ -744,9 +744,8 @@ assert len(ret) == NUM_RESULTS for origin in ret: - visit_stats = swh_scheduler.origin_visit_stats_get( - origin.url, origin.visit_type - ) + pk = (origin.url, origin.visit_type) + visit_stats = swh_scheduler.origin_visit_stats_get([pk])[0] assert visit_stats is not None assert before <= visit_stats.last_scheduled <= after @@ -792,8 +791,8 @@ swh_scheduler.origin_visit_stats_upsert([visit_stats]) swh_scheduler.origin_visit_stats_upsert([visit_stats]) - assert swh_scheduler.origin_visit_stats_get(url, "git") == visit_stats - assert swh_scheduler.origin_visit_stats_get(url, "svn") is None + assert swh_scheduler.origin_visit_stats_get([(url, "git")]) == [visit_stats] + assert swh_scheduler.origin_visit_stats_get([(url, "svn")]) == [] uneventful_date = utcnow() visit_stats = OriginVisitStats( @@ -806,7 +805,7 @@ ) swh_scheduler.origin_visit_stats_upsert([visit_stats]) - uneventful_visit = swh_scheduler.origin_visit_stats_get(url, "git") + uneventful_visits = swh_scheduler.origin_visit_stats_get([(url, "git")]) expected_visit_stats = OriginVisitStats( url=url, @@ -817,7 +816,7 @@ last_notfound=None, ) - assert uneventful_visit == expected_visit_stats + assert uneventful_visits == [expected_visit_stats] failed_date = utcnow() visit_stats = OriginVisitStats( @@ -830,7 +829,7 @@ ) swh_scheduler.origin_visit_stats_upsert([visit_stats]) - failed_visit = swh_scheduler.origin_visit_stats_get(url, "git") + failed_visits = swh_scheduler.origin_visit_stats_get([(url, "git")]) expected_visit_stats = OriginVisitStats( url=url, @@ -841,7 +840,7 @@ last_notfound=None, ) - assert failed_visit == expected_visit_stats + assert failed_visits == [expected_visit_stats] def test_origin_visit_stats_upsert_with_snapshot(self, swh_scheduler) -> None: eventful_date = utcnow() @@ -858,8 +857,8 @@ ) swh_scheduler.origin_visit_stats_upsert([visit_stats]) - assert swh_scheduler.origin_visit_stats_get(url, "git") == visit_stats - assert swh_scheduler.origin_visit_stats_get(url, "svn") is None + assert swh_scheduler.origin_visit_stats_get([(url, "git")]) == [visit_stats] + assert swh_scheduler.origin_visit_stats_get([(url, "svn")]) == [] def test_origin_visit_stats_upsert_messing_with_time(self, swh_scheduler) -> None: url = "interesting-origin" @@ -883,7 +882,7 @@ ) swh_scheduler.origin_visit_stats_upsert([visit_stats0]) - actual_visit_stats0 = swh_scheduler.origin_visit_stats_get(url, "git") + actual_visit_stats0 = swh_scheduler.origin_visit_stats_get([(url, "git")])[0] assert actual_visit_stats0 == visit_stats0 visit_stats2 = OriginVisitStats( @@ -896,7 +895,7 @@ ) swh_scheduler.origin_visit_stats_upsert([visit_stats2]) - actual_visit_stats2 = swh_scheduler.origin_visit_stats_get(url, "git") + actual_visit_stats2 = swh_scheduler.origin_visit_stats_get([(url, "git")])[0] assert actual_visit_stats2 == attr.evolve( actual_visit_stats0, last_uneventful=date1 ) @@ -915,7 +914,7 @@ ) swh_scheduler.origin_visit_stats_upsert([visit_stats1]) - actual_visit_stats1 = swh_scheduler.origin_visit_stats_get(url, "git") + actual_visit_stats1 = swh_scheduler.origin_visit_stats_get([(url, "git")])[0] assert actual_visit_stats1 == attr.evolve( actual_visit_stats2, last_eventful=date2 @@ -946,13 +945,10 @@ swh_scheduler.origin_visit_stats_upsert(visit_stats) - for visit_stat in visit_stats: - assert ( - swh_scheduler.origin_visit_stats_get( - visit_stat.url, visit_stat.visit_type - ) - is not None - ) + for visit_stat in swh_scheduler.origin_visit_stats_get( + [(vs.url, vs.visit_type) for vs in visit_stats] + ): + assert visit_stat is not None def test_origin_visit_stats_upsert_cardinality_failing(self, swh_scheduler) -> None: """Batch upsert does not support altering multiple times the same origin-visit-status @@ -981,3 +977,6 @@ ), ] ) + + def test_origin_visit_stats_get_empty(self, swh_scheduler) -> None: + assert swh_scheduler.origin_visit_stats_get([]) == []