diff --git a/sql/updates/19.sql b/sql/updates/19.sql
new file mode 100644
--- /dev/null
+++ b/sql/updates/19.sql
@@ -0,0 +1,23 @@
+-- SWH Scheduler Schema upgrade
+-- from_version: 18
+-- to_version: 19
+-- description: Add the origin_visit_stats table
+
+insert into dbversion (version, release, description)
+      values (19, now(), 'Work In Progress');
+
+create table origin_visit_stats (
+  url text not null,
+  visit_type text not null,
+  last_eventful timestamptz,
+  last_uneventful timestamptz,
+  last_failed timestamptz,
+
+  primary key (url, visit_type)
+);
+
+comment on column origin_visit_stats.url is 'Origin URL';
+comment on column origin_visit_stats.visit_type is 'Type of the visit for the given url';
+comment on column origin_visit_stats.last_eventful is 'Date of the last eventful event';
+comment on column origin_visit_stats.last_uneventful is 'Date of the last uneventful event';
+comment on column origin_visit_stats.last_failed is 'Date of the last failed event';
diff --git a/swh/scheduler/backend.py b/swh/scheduler/backend.py
--- a/swh/scheduler/backend.py
+++ b/swh/scheduler/backend.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2020 The Software Heritage developers
+# Copyright (C) 2015-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -21,6 +21,7 @@
     ListedOrigin,
     ListedOriginPageToken,
     Lister,
+    OriginVisitStats,
     PaginatedListedOriginList,
 )
 
@@ -757,3 +758,70 @@
     def get_priority_ratios(self, db=None, cur=None):
         cur.execute("select id, ratio from priority_ratio")
         return {row["id"]: row["ratio"] for row in cur.fetchall()}
+
+    @db_transaction()
+    def origin_visit_stats_upsert(
+        self, visit_stats: OriginVisitStats, db=None, cur=None
+    ) -> None:
+        """Insert the visit stats of an origin, or update the existing ones.
+
+        Rows are keyed on (url, visit_type). When a row already exists, only the
+        dates set in ``visit_stats`` are overwritten: a date left to None keeps
+        the value already stored.
+        """
+        # coalesce(excluded.x, ovi.x) picks the incoming value unless it is NULL,
+        # so an upsert never erases a previously recorded date.
+        query = """
+            INSERT into origin_visit_stats AS ovi (
+                url,
+                visit_type,
+                last_eventful,
+                last_uneventful,
+                last_failed
+            )
+            VALUES (%s, %s, %s, %s, %s) ON CONFLICT (url, visit_type) DO
+            UPDATE
+            SET last_eventful = coalesce(
+                    excluded.last_eventful,
+                    ovi.last_eventful
+                ),
+                last_uneventful = coalesce(
+                    excluded.last_uneventful,
+                    ovi.last_uneventful
+                ),
+                last_failed = coalesce(
+                    excluded.last_failed,
+                    ovi.last_failed
+                )
+        """
+
+        cur.execute(
+            query,
+            (
+                visit_stats.url,
+                visit_stats.visit_type,
+                visit_stats.last_eventful,
+                visit_stats.last_uneventful,
+                visit_stats.last_failed,
+            ),
+        )
+
+    @db_transaction()
+    def origin_visit_stats_get(
+        self, url: str, visit_type: str, db=None, cur=None
+    ) -> Optional[OriginVisitStats]:
+        """Return the visit stats recorded for (url, visit_type), if any.
+
+        Returns None when no row matches.
+        """
+        query = format_query(
+            "SELECT {keys} FROM origin_visit_stats WHERE url=%s AND visit_type=%s",
+            OriginVisitStats.select_columns(),
+        )
+        cur.execute(query, (url, visit_type))
+        row = cur.fetchone()
+
+        if row:
+            return OriginVisitStats(**row)
+        else:
+            return None
diff --git a/swh/scheduler/interface.py b/swh/scheduler/interface.py
--- a/swh/scheduler/interface.py
+++ b/swh/scheduler/interface.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2015-2020 The Software Heritage developers
+# Copyright (C) 2015-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -13,6 +13,7 @@
     ListedOrigin,
     ListedOriginPageToken,
     Lister,
+    OriginVisitStats,
     PaginatedListedOriginList,
 )
 
@@ -322,3 +323,22 @@
     @remote_api_endpoint("priority_ratios/get")
     def get_priority_ratios(self):
         ...
+
+    @remote_api_endpoint("visit_stats/upsert")
+    def origin_visit_stats_upsert(self, visit_stats: OriginVisitStats) -> None:
+        """Create the visit stats of an origin, or update the existing ones.
+
+        Stats are keyed on (url, visit_type). On update, the dates left to
+        None in ``visit_stats`` keep the value already recorded.
+        """
+        ...
+
+    @remote_api_endpoint("visit_stats/get")
+    def origin_visit_stats_get(
+        self, url: str, visit_type: str
+    ) -> Optional[OriginVisitStats]:
+        """Retrieve the stats for an origin with a given visit type.
+
+        Returns None if no stats are recorded for that (url, visit_type) pair.
+        """
+        ...
diff --git a/swh/scheduler/model.py b/swh/scheduler/model.py
--- a/swh/scheduler/model.py
+++ b/swh/scheduler/model.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 The Software Heritage developers
+# Copyright (C) 2020-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -12,6 +12,19 @@
 from attrs_strict import type_validator
 
 
+def check_timestamptz(value) -> None:
+    """Check that ``value``, when set, is a timezone-aware datetime.
+
+    Awareness is tested with ``utcoffset()``: a datetime whose ``tzinfo`` is set
+    but yields no UTC offset is still naive.
+
+    Raises:
+        ValueError: if ``value`` is a naive datetime.
+    """
+    if value is not None and value.utcoffset() is None:
+        raise ValueError("date must be a timezone-aware datetime.")
+
+
 @attr.s
 class BaseSchedulerModel:
     """Base class for database-backed objects.
@@ -195,3 +208,40 @@
         converter=convert_listed_origin_page_token,
         default=None,
     )
+
+
+@attr.s(frozen=True, slots=True)
+class OriginVisitStats(BaseSchedulerModel):
+    """Aggregated view of the visits of an origin, for a given visit type.
+
+    Each date is either None or a timezone-aware datetime.
+    """
+
+    # (url, visit_type) is the primary key of the origin_visit_stats table.
+    url = attr.ib(
+        type=str, validator=[type_validator()], metadata={"primary_key": True}
+    )
+    visit_type = attr.ib(
+        type=str, validator=[type_validator()], metadata={"primary_key": True}
+    )
+    # Dates of the last eventful, uneventful and failed events.
+    last_eventful = attr.ib(
+        type=Optional[datetime.datetime], validator=type_validator()
+    )
+    last_uneventful = attr.ib(
+        type=Optional[datetime.datetime], validator=type_validator()
+    )
+    last_failed = attr.ib(type=Optional[datetime.datetime], validator=type_validator())
+
+    # Run in addition to the type_validator() set on each attribute.
+    @last_eventful.validator
+    def check_last_eventful(self, attribute, value):
+        check_timestamptz(value)
+
+    @last_uneventful.validator
+    def check_last_uneventful(self, attribute, value):
+        check_timestamptz(value)
+
+    @last_failed.validator
+    def check_last_failed(self, attribute, value):
+        check_timestamptz(value)
diff --git a/swh/scheduler/sql/30-schema.sql b/swh/scheduler/sql/30-schema.sql
--- a/swh/scheduler/sql/30-schema.sql
+++ b/swh/scheduler/sql/30-schema.sql
@@ -11,7 +11,7 @@
 comment on column dbversion.description is 'Version description';
 
 insert into dbversion (version, release, description)
-       values (18, now(), 'Work In Progress');
+       values (19, now(), 'Work In Progress');
 
 create table task_type (
   type text primary key,
@@ -164,3 +164,19 @@
 comment on column listed_origins.last_update is 'Time of the last update to the origin recorded by the remote';
 
 comment on column listed_origins.last_scheduled is 'Time when this origin was scheduled to be visited last';
+
+create table origin_visit_stats (
+  url text not null,
+  visit_type text not null,
+  last_eventful timestamptz,
+  last_uneventful timestamptz,
+  last_failed timestamptz,
+
+  primary key (url, visit_type)
+);
+
+comment on column origin_visit_stats.url is 'Origin URL';
+comment on column origin_visit_stats.visit_type is 'Type of the visit for the given url';
+comment on column origin_visit_stats.last_eventful is 'Date of the last eventful event';
+comment on column origin_visit_stats.last_uneventful is 'Date of the last uneventful event';
+comment on column origin_visit_stats.last_failed is 'Date of the last failed event';
diff --git a/swh/scheduler/tests/test_api_client.py b/swh/scheduler/tests/test_api_client.py
--- a/swh/scheduler/tests/test_api_client.py
+++ b/swh/scheduler/tests/test_api_client.py
@@ -65,6 +65,8 @@
             "task_type/create",
             "task_type/get",
             "task_type/get_all",
+            "visit_stats/get",
+            "visit_stats/upsert",
         )
     )
     assert rules == expected_rules
diff --git a/swh/scheduler/tests/test_scheduler.py b/swh/scheduler/tests/test_scheduler.py
--- a/swh/scheduler/tests/test_scheduler.py
+++ b/swh/scheduler/tests/test_scheduler.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2019 The Software Heritage developers
+# Copyright (C) 2017-2021 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -16,7 +16,7 @@
 
 from swh.scheduler.exc import StaleData, UnknownPolicy
 from swh.scheduler.interface import SchedulerInterface
-from swh.scheduler.model import ListedOrigin, ListedOriginPageToken
+from swh.scheduler.model import ListedOrigin, ListedOriginPageToken, OriginVisitStats
 from swh.scheduler.utils import utcnow
 
 from .common import LISTERS, TASK_TYPES, TEMPLATES, tasks_from_template
@@ -762,3 +762,67 @@
     def _create_task_types(self, scheduler):
         for tt in TASK_TYPES.values():
             scheduler.create_task_type(tt)
+
+    def test_origin_visit_stats_upsert(self, swh_scheduler) -> None:
+        """Upserts merge the new dates into the stats already recorded."""
+        eventful_date = utcnow()
+        url = "https://github.com/test"
+
+        visit_stats = OriginVisitStats(
+            url=url,
+            visit_type="git",
+            last_eventful=eventful_date,
+            last_uneventful=None,
+            last_failed=None,
+        )
+        # Upserting the same stats twice must not change what is stored.
+        swh_scheduler.origin_visit_stats_upsert(visit_stats)
+        swh_scheduler.origin_visit_stats_upsert(visit_stats)
+
+        assert swh_scheduler.origin_visit_stats_get(url, "git") == visit_stats
+        assert swh_scheduler.origin_visit_stats_get(url, "svn") is None
+
+        # A date left to None must not erase the one already recorded.
+        uneventful_date = utcnow()
+        visit_stats = OriginVisitStats(
+            url=url,
+            visit_type="git",
+            last_eventful=None,
+            last_uneventful=uneventful_date,
+            last_failed=None,
+        )
+        swh_scheduler.origin_visit_stats_upsert(visit_stats)
+
+        uneventful_visit = swh_scheduler.origin_visit_stats_get(url, "git")
+
+        expected_visit_stats = OriginVisitStats(
+            url=url,
+            visit_type="git",
+            last_eventful=eventful_date,
+            last_uneventful=uneventful_date,
+            last_failed=None,
+        )
+
+        assert uneventful_visit == expected_visit_stats
+
+        failed_date = utcnow()
+        visit_stats = OriginVisitStats(
+            url=url,
+            visit_type="git",
+            last_eventful=None,
+            last_uneventful=None,
+            last_failed=failed_date,
+        )
+        swh_scheduler.origin_visit_stats_upsert(visit_stats)
+
+        failed_visit = swh_scheduler.origin_visit_stats_get(url, "git")
+
+        expected_visit_stats = OriginVisitStats(
+            url=url,
+            visit_type="git",
+            last_eventful=eventful_date,
+            last_uneventful=uneventful_date,
+            last_failed=failed_date,
+        )
+
+        assert failed_visit == expected_visit_stats