diff --git a/sql/upgrades/138.sql b/sql/upgrades/138.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/138.sql @@ -0,0 +1,29 @@ +-- SWH DB schema upgrade +-- from_version: 137 +-- to_version: 138 +-- description: Make swh_visit_find_by_date break ties using the largest visit id + +insert into dbversion(version, release, description) + values(138, now(), 'Work In Progress'); + +create or replace function swh_visit_find_by_date(origin bigint, visit_date timestamptz default NOW()) + returns origin_visit + language sql + stable +as $$ + with closest_two_visits as (( + select ov, (date - visit_date), visit as interval + from origin_visit ov + where ov.origin = origin + and ov.date >= visit_date + order by ov.date asc, ov.visit desc + limit 1 + ) union ( + select ov, (visit_date - date), visit as interval + from origin_visit ov + where ov.origin = origin + and ov.date < visit_date + order by ov.date desc, ov.visit desc + limit 1 + )) select (ov).* from closest_two_visits order by interval, visit limit 1 +$$; diff --git a/swh/storage/api/client.py b/swh/storage/api/client.py --- a/swh/storage/api/client.py +++ b/swh/storage/api/client.py @@ -207,6 +207,10 @@ return self.post('origin/visit/get', { 'origin': origin, 'last_visit': last_visit, 'limit': limit}) + def origin_visit_find_by_date(self, origin, visit_date, limit=None): + return self.post('origin/visit/find_by_date', { + 'origin': origin, 'visit_date': visit_date}) + def origin_visit_get_by(self, origin, visit): return self.post('origin/visit/getby', {'origin': origin, 'visit': visit}) diff --git a/swh/storage/api/server.py b/swh/storage/api/server.py --- a/swh/storage/api/server.py +++ b/swh/storage/api/server.py @@ -396,6 +396,13 @@ **decode_request(request))) +@app.route('/origin/visit/find_by_date', methods=['POST']) +@timed +def origin_visit_find_by_date(): + return encode_data(get_storage().origin_visit_find_by_date( + **decode_request(request))) + + @app.route('/origin/visit/getby', methods=['POST']) @timed def origin_visit_get_by(): diff --git a/swh/storage/db.py b/swh/storage/db.py --- a/swh/storage/db.py +++ b/swh/storage/db.py @@ -421,6 +421,15 @@ return None return r[0] + def origin_visit_find_by_date(self, origin, visit_date, cur=None): + cur = self._cursor(cur) + cur.execute( + 'SELECT * FROM swh_visit_find_by_date(%s, %s)', + (origin, visit_date)) + r = cur.fetchall() + if r: + return r[0] + def origin_visit_exists(self, origin_id, visit_id, cur=None): """Check whether an origin visit with the given ids exists""" cur = self._cursor(cur) diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py --- a/swh/storage/in_memory.py +++ b/swh/storage/in_memory.py @@ -1319,6 +1319,26 @@ visit_id = visit['visit'] yield copy.deepcopy(self._origin_visits[origin-1][visit_id-1]) + def origin_visit_find_by_date(self, origin, visit_date): + """Retrieves the origin visit whose date is closest to the provided + timestamp. + In case of a tie, the visit with largest id is selected. + + Args: + origin (str): The occurrence's origin (URL). + target (datetime): target timestamp + + Returns: + A visit. + + """ + origin = self.origin_get([{'url': origin}])[0]['id'] + if origin <= len(self._origin_visits): + visits = self._origin_visits[origin-1] + return min( + visits, + key=lambda v: (abs(v['date'] - visit_date), -v['visit'])) + def origin_visit_get_by(self, origin, visit): """Retrieve origin visit's information. diff --git a/swh/storage/sql/30-swh-schema.sql b/swh/storage/sql/30-swh-schema.sql --- a/swh/storage/sql/30-swh-schema.sql +++ b/swh/storage/sql/30-swh-schema.sql @@ -17,7 +17,7 @@ -- latest schema version insert into dbversion(version, release, description) - values(137, now(), 'Work In Progress'); + values(138, now(), 'Work In Progress'); -- a SHA1 checksum create domain sha1 as bytea check (length(value) = 20); diff --git a/swh/storage/sql/40-swh-func.sql b/swh/storage/sql/40-swh-func.sql --- a/swh/storage/sql/40-swh-func.sql +++ b/swh/storage/sql/40-swh-func.sql @@ -829,26 +829,27 @@ $$; -- Find the visit of origin id closest to date visit_date +-- Breaks ties by selecting the largest visit id create or replace function swh_visit_find_by_date(origin bigint, visit_date timestamptz default NOW()) returns origin_visit language sql stable as $$ with closest_two_visits as (( - select ov, (date - visit_date) as interval + select ov, (date - visit_date), visit as interval from origin_visit ov where ov.origin = origin and ov.date >= visit_date - order by ov.date asc + order by ov.date asc, ov.visit desc limit 1 ) union ( - select ov, (visit_date - date) as interval + select ov, (visit_date - date), visit as interval from origin_visit ov where ov.origin = origin and ov.date < visit_date - order by ov.date desc + order by ov.date desc, ov.visit desc limit 1 - )) select (ov).* from closest_two_visits order by interval limit 1 + )) select (ov).* from closest_two_visits order by interval, visit limit 1 $$; -- Find the visit of origin id closest to date visit_date diff --git a/swh/storage/storage.py b/swh/storage/storage.py --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -1322,6 +1322,25 @@ yield data @db_transaction(statement_timeout=500) + def origin_visit_find_by_date(self, origin, visit_date, db=None, cur=None): + """Retrieves the origin visit whose date is closest to the provided + timestamp. + In case of a tie, the visit with largest id is selected. + + Args: + origin (str): The occurrence's origin (URL). + target (datetime): target timestamp + + Returns: + A visit. + + """ + origin = self.origin_get([{'url': origin}], db=db, cur=cur)[0]['id'] + line = db.origin_visit_find_by_date(origin, visit_date, cur=cur) + if line: + return dict(zip(db.origin_visit_get_cols, line)) + + @db_transaction(statement_timeout=500) def origin_visit_get_by(self, origin, visit, db=None, cur=None): """Retrieve origin visit's information. diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py --- a/swh/storage/tests/test_storage.py +++ b/swh/storage/tests/test_storage.py @@ -1842,6 +1842,32 @@ ('origin_visit', data4), ('origin_visit', data5)]) + def test_origin_visit_find_by_date(self): + # given + self.storage.origin_add_one(self.origin) + + self.storage.origin_visit_add( + self.origin['url'], + date=self.date_visit2) + + origin_visit2 = self.storage.origin_visit_add( + self.origin['url'], + date=self.date_visit3) + + origin_visit3 = self.storage.origin_visit_add( + self.origin['url'], + date=self.date_visit2) + + # Simple case + visit = self.storage.origin_visit_find_by_date( + self.origin['url'], self.date_visit3) + self.assertEqual(visit['visit'], origin_visit2['visit']) + + # There are two visits at the same date, the latest must be returned + visit = self.storage.origin_visit_find_by_date( + self.origin['url'], self.date_visit2) + self.assertEqual(visit['visit'], origin_visit3['visit']) + def test_origin_visit_update_missing_snapshot(self): # given origin_id = self.storage.origin_add_one(self.origin)