diff --git a/swh/storage/cassandra/cql.py b/swh/storage/cassandra/cql.py --- a/swh/storage/cassandra/cql.py +++ b/swh/storage/cassandra/cql.py @@ -1033,10 +1033,12 @@ else: return None - @_prepared_select_statement(OriginVisitRow, "WHERE origin = ?") - def origin_visit_get_all( + @_prepared_select_statement(OriginVisitRow, "WHERE origin = ? ORDER BY visit DESC") + def origin_visit_iter_all( self, origin_url: str, *, statement ) -> Iterable[OriginVisitRow]: + """Returns an iterator on visits for a given origin, ordered by descending + visit id.""" return map( OriginVisitRow.from_dict, self._execute_with_retries(statement, [origin_url]), diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py --- a/swh/storage/cassandra/storage.py +++ b/swh/storage/cassandra/storage.py @@ -1281,7 +1281,7 @@ # Iterator over all the visits of the origin # This should be ok for now, as there aren't too many visits # per origin. - rows = list(self._cql_runner.origin_visit_get_all(origin)) + rows = list(self._cql_runner.origin_visit_iter_all(origin)) def key(visit): dt = visit.date.replace(tzinfo=datetime.timezone.utc) - visit_date @@ -1309,9 +1309,7 @@ f"Unknown allowed statuses {','.join(allowed_statuses)}, only " f"{','.join(VISIT_STATUSES)} authorized" ) - # TODO: Do not fetch all visits - rows = self._cql_runner.origin_visit_get_all(origin) - latest_visit = None + rows = self._cql_runner.origin_visit_iter_all(origin) for row in rows: visit = self._format_origin_visit_row(row) for status_row in self._cql_runner.origin_visit_status_get( @@ -1325,24 +1323,14 @@ if require_snapshot and updated_visit["snapshot"] is None: continue - # updated_visit is a candidate - if latest_visit is not None: - if updated_visit["date"] < latest_visit["date"]: - continue - assert ( - updated_visit["visit"] >= latest_visit["visit"] - ), "Cassandra returned visits not ordered by increasing visit id." - - latest_visit = updated_visit + return OriginVisit( + origin=visit["origin"], + visit=visit["visit"], + date=visit["date"], + type=visit["type"], + ) - if latest_visit is None: - return None - return OriginVisit( - origin=latest_visit["origin"], - visit=latest_visit["visit"], - date=latest_visit["date"], - type=latest_visit["type"], - ) + return None def origin_visit_status_get_latest( self, diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py --- a/swh/storage/in_memory.py +++ b/swh/storage/in_memory.py @@ -531,8 +531,8 @@ ) -> Optional[OriginVisitRow]: return self._origin_visits.get_from_primary_key((origin_url, visit_id)) - def origin_visit_get_all(self, origin_url: str) -> Iterable[OriginVisitRow]: - return self._origin_visits.get_from_partition_key((origin_url,)) + def origin_visit_iter_all(self, origin_url: str) -> Iterable[OriginVisitRow]: + return reversed(list(self._origin_visits.get_from_partition_key((origin_url,)))) def origin_visit_iter(self, start_token: int) -> Iterator[OriginVisitRow]: """Returns all origin visits in order from this token, diff --git a/swh/storage/postgresql/db.py b/swh/storage/postgresql/db.py --- a/swh/storage/postgresql/db.py +++ b/swh/storage/postgresql/db.py @@ -783,9 +783,7 @@ query_parts.append("AND ovs.status IN %s") query_params.append(tuple(allowed_statuses)) - query_parts.append( - "ORDER BY ov.date DESC, ov.visit DESC, ovs.date DESC LIMIT 1" - ) + query_parts.append("ORDER BY ov.visit DESC, ovs.date DESC LIMIT 1") query = "\n".join(query_parts) diff --git a/swh/storage/tests/storage_tests.py b/swh/storage/tests/storage_tests.py --- a/swh/storage/tests/storage_tests.py +++ b/swh/storage/tests/storage_tests.py @@ -3122,9 +3122,9 @@ ov1, ov2, ov3 = swh_storage.origin_visit_add([visit1, visit2, visit3]) - # no filters, latest visit is the last one (whose date is most recent) + # no filters actual_visit = swh_storage.origin_visit_get_latest(origin.url) - assert actual_visit == ov2 + assert actual_visit == ov3 def test_origin_visit_get_latest__not_last(self, swh_storage, sample_data): origin = sample_data.origin