diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py --- a/swh/storage/cassandra/storage.py +++ b/swh/storage/cassandra/storage.py @@ -60,6 +60,7 @@ MetadataFetcherRow, OriginRow, OriginVisitRow, + OriginVisitStatusRow, RawExtrinsicMetadataRow, RevisionParentRow, SkippedContentRow, @@ -895,16 +896,13 @@ for visit_status in visit_statuses: self._origin_visit_status_add(visit_status) - def _origin_visit_apply_last_status(self, visit: Dict[str, Any]) -> Dict[str, Any]: + def _origin_visit_apply_status( + self, visit: Dict[str, Any], visit_status: OriginVisitStatusRow + ) -> Dict[str, Any]: """Retrieve the latest visit status information for the origin visit. Then merge it with the visit and return it. """ - row = self._cql_runner.origin_visit_status_get_latest( - visit["origin"], visit["visit"] - ) - assert row is not None - visit_status = converters.row_to_visit_status(row) return { # default to the values in visit **visit, @@ -1028,22 +1026,25 @@ latest_visit = None for row in rows: visit = self._format_origin_visit_row(row) - updated_visit = self._origin_visit_apply_last_status(visit) - if type is not None and updated_visit["type"] != type: - continue - if allowed_statuses and updated_visit["status"] not in allowed_statuses: - continue - if require_snapshot and updated_visit["snapshot"] is None: - continue - - # updated_visit is a candidate - if latest_visit is not None: - if updated_visit["date"] < latest_visit["date"]: + for status_row in self._cql_runner.origin_visit_status_get( + origin, visit["visit"] + ): + updated_visit = self._origin_visit_apply_status(visit, status_row) + if type is not None and updated_visit["type"] != type: + continue + if allowed_statuses and updated_visit["status"] not in allowed_statuses: continue - if updated_visit["visit"] < latest_visit["visit"]: + if require_snapshot and updated_visit["snapshot"] is None: continue - latest_visit = updated_visit + # updated_visit is a candidate + if latest_visit is not None: + if updated_visit["date"] < latest_visit["date"]: + continue + if updated_visit["visit"] < latest_visit["visit"]: + continue + + latest_visit = updated_visit if latest_visit is None: return None diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py --- a/swh/storage/tests/test_storage.py +++ b/swh/storage/tests/test_storage.py @@ -2295,6 +2295,60 @@ actual_visit = swh_storage.origin_visit_get_latest(origin.url) assert actual_visit == ov2 + def test_origin_visit_get_latest__not_last(self, swh_storage, sample_data): + origin = sample_data.origin + swh_storage.origin_add([origin]) + + visit1, visit2 = sample_data.origin_visits[:2] + assert visit1.origin == origin.url + + swh_storage.origin_visit_add([visit1]) + ov1 = swh_storage.origin_visit_get_latest(origin.url) + + # Add snapshot to visit1, latest snapshot = visit 1 snapshot + complete_snapshot = sample_data.snapshots[2] + swh_storage.snapshot_add([complete_snapshot]) + + swh_storage.origin_visit_status_add( + [ + OriginVisitStatus( + origin=origin.url, + visit=ov1.visit, + date=visit2.date, + status="partial", + snapshot=None, + ) + ] + ) + assert visit1.date < visit2.date + + # no snapshot associated to the visit, so None + visit = swh_storage.origin_visit_get_latest( + origin.url, allowed_statuses=["partial"], require_snapshot=True, + ) + assert visit is None + + date_now = now() + assert visit2.date < date_now + swh_storage.origin_visit_status_add( + [ + OriginVisitStatus( + origin=origin.url, + visit=ov1.visit, + date=date_now, + status="full", + snapshot=complete_snapshot.id, + ) + ] + ) + + swh_storage.origin_visit_add( + [OriginVisit(origin=origin.url, date=now(), type=visit1.type,)] + ) + + visit = swh_storage.origin_visit_get_latest(origin.url, require_snapshot=True) + assert visit is not None + def test_origin_visit_status_get_latest__validation(self, swh_storage, sample_data): origin = sample_data.origin swh_storage.origin_add([origin])