diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py --- a/swh/storage/cassandra/storage.py +++ b/swh/storage/cassandra/storage.py @@ -885,7 +885,7 @@ type: Optional[str] = None, allowed_statuses: Optional[List[str]] = None, require_snapshot: bool = False, - ) -> Optional[Dict[str, Any]]: + ) -> Optional[OriginVisit]: # TODO: Do not fetch all visits rows = self._cql_runner.origin_visit_get_all(origin) latest_visit = None @@ -908,7 +908,14 @@ latest_visit = updated_visit - return latest_visit + if latest_visit is None: + return None + return OriginVisit( + origin=latest_visit["origin"], + visit=latest_visit["visit"], + date=latest_visit["date"], + type=latest_visit["type"], + ) def origin_visit_status_get_latest( self, diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py --- a/swh/storage/in_memory.py +++ b/swh/storage/in_memory.py @@ -916,29 +916,33 @@ type: Optional[str] = None, allowed_statuses: Optional[List[str]] = None, require_snapshot: bool = False, - ) -> Optional[Dict[str, Any]]: + ) -> Optional[OriginVisit]: ori = self._origins.get(origin) if not ori: return None - visits = self._origin_visits[ori.url] - visits = [ - self._origin_visit_get_updated(visit.origin, visit.visit) - for visit in visits - if visit is not None - ] + visits = sorted( + self._origin_visits[ori.url], key=lambda v: v.date, reverse=True, + ) + for visit in visits: + if type is not None and visit.type != type: + continue + visit_statuses = self._origin_visit_statuses[origin, visit.visit] - if type is not None: - visits = [visit for visit in visits if visit["type"] == type] - if allowed_statuses is not None: - visits = [visit for visit in visits if visit["status"] in allowed_statuses] - if require_snapshot: - visits = [visit for visit in visits if visit["snapshot"]] + if allowed_statuses is not None: + visit_statuses = [ + vs for vs in visit_statuses if vs.status in allowed_statuses + ] + if require_snapshot: + visit_statuses = [vs for vs in visit_statuses if vs.snapshot] - visit = max(visits, key=lambda v: (v["date"], v["visit"]), default=None) - if visit is None: - return None - return visit + if visit_statuses: # we found visit statuses matching criteria + visit_status = max(visit_statuses, key=lambda vs: (vs.date, vs.visit)) + assert visit.origin == visit_status.origin + assert visit.visit == visit_status.visit + return visit + + return None def origin_visit_status_get_latest( self, diff --git a/swh/storage/interface.py b/swh/storage/interface.py --- a/swh/storage/interface.py +++ b/swh/storage/interface.py @@ -853,7 +853,7 @@ type: Optional[str] = None, allowed_statuses: Optional[List[str]] = None, require_snapshot: bool = False, - ) -> Optional[Dict[str, Any]]: + ) -> Optional[OriginVisit]: """Get the latest origin visit for the given origin, optionally looking only for those with one of the given allowed_statuses or for those with a snapshot. @@ -870,16 +870,9 @@ will be returned. Returns: - dict: a dict with the following keys: - - - **origin**: the URL of the origin - - **visit**: origin visit id - - **type**: type of loader used for the visit - - **date**: timestamp of such visit - - **status**: Visit's new status - - **metadata**: Data associated to the visit - - **snapshot** (Optional[sha1_git]): identifier of the snapshot - associated to the visit + OriginVisit matching the criteria if found, None otherwise. Note that as + OriginVisit no longer held reference on the visit status or snapshot, you + may want to use origin_visit_status_get_latest for those information. """ ... diff --git a/swh/storage/storage.py b/swh/storage/storage.py --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -921,7 +921,7 @@ require_snapshot: bool = False, db=None, cur=None, - ) -> Optional[Dict[str, Any]]: + ) -> Optional[OriginVisit]: row = db.origin_visit_get_latest( origin, type=type, @@ -930,7 +930,14 @@ cur=cur, ) if row: - return dict(zip(db.origin_visit_get_cols, row)) + row_d = dict(zip(db.origin_visit_get_cols, row)) + visit = OriginVisit( + origin=row_d["origin"], + visit=row_d["visit"], + date=row_d["date"], + type=row_d["type"], + ) + return visit return None @timed diff --git a/swh/storage/tests/algos/test_snapshot.py b/swh/storage/tests/algos/test_snapshot.py --- a/swh/storage/tests/algos/test_snapshot.py +++ b/swh/storage/tests/algos/test_snapshot.py @@ -63,7 +63,6 @@ ov1 = swh_storage.origin_visit_get_latest(origin.url) assert ov1 is not None - visit_id = ov1["visit"] # visit references a snapshot but the snapshot does not exist in backend for some # reason @@ -72,7 +71,7 @@ [ OriginVisitStatus( origin=origin.url, - visit=visit_id, + visit=ov1.visit, date=origin_visit2.date, status="partial", snapshot=complete_snapshot.id, @@ -93,7 +92,6 @@ swh_storage.origin_visit_add([visit1]) ov1 = swh_storage.origin_visit_get_latest(origin.url) - visit_id = ov1["visit"] # Add snapshot to visit1, latest snapshot = visit 1 snapshot complete_snapshot = sample_data.snapshots[2] @@ -103,7 +101,7 @@ [ OriginVisitStatus( origin=origin.url, - visit=visit_id, + visit=ov1.visit, date=visit2.date, status="partial", snapshot=None, @@ -124,7 +122,7 @@ [ OriginVisitStatus( origin=origin.url, - visit=visit_id, + visit=ov1.visit, date=date_now, status="full", snapshot=complete_snapshot.id, diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py --- a/swh/storage/tests/test_storage.py +++ b/swh/storage/tests/test_storage.py @@ -1647,7 +1647,7 @@ snapshot=None, ) - date_visit_now = now() + date_visit_now = round_to_milliseconds(now()) visit_status1 = OriginVisitStatus( origin=ov1.origin, visit=ov1.visit, @@ -1656,7 +1656,7 @@ snapshot=snapshot.id, ) - date_visit_now = now() + date_visit_now = round_to_milliseconds(now()) visit_status2 = OriginVisitStatus( origin=ov2.origin, visit=ov2.visit, @@ -1667,21 +1667,18 @@ ) swh_storage.origin_visit_status_add([visit_status1, visit_status2]) - origin_visit1 = swh_storage.origin_visit_get_latest( - origin1.url, require_snapshot=True + visit = swh_storage.origin_visit_get_latest(origin1.url, require_snapshot=True) + visit_status = swh_storage.origin_visit_status_get_latest( + origin1.url, visit.visit, require_snapshot=True ) - assert origin_visit1 - assert origin_visit1["status"] == "full" - assert origin_visit1["snapshot"] == snapshot.id + assert visit_status == visit_status1 - origin_visit2 = swh_storage.origin_visit_get_latest( - origin2.url, require_snapshot=False + visit = swh_storage.origin_visit_get_latest(origin2.url, require_snapshot=False) + visit_status = swh_storage.origin_visit_status_get_latest( + origin2.url, visit.visit, require_snapshot=False ) assert origin2.url != origin1.url - assert origin_visit2 - assert origin_visit2["status"] == "ongoing" - assert origin_visit2["snapshot"] is None - assert origin_visit2["metadata"] == {"intrinsic": "something"} + assert visit_status == visit_status2 actual_objects = list(swh_storage.journal_writer.journal.objects) @@ -1924,51 +1921,27 @@ origin = sample_data.origin swh_storage.origin_add([origin]) visit1 = OriginVisit( - origin=origin.url, - date=sample_data.date_visit1, - type=sample_data.type_visit1, + origin=origin.url, date=sample_data.date_visit1, type="git", ) visit2 = OriginVisit( - origin=origin.url, - date=sample_data.date_visit2, - type=sample_data.type_visit2, - ) - # Add a visit with the same date as the previous one - visit3 = OriginVisit( - origin=origin.url, - date=sample_data.date_visit2, - type=sample_data.type_visit2, + origin=origin.url, date=sample_data.date_visit2, type="hg", ) - assert sample_data.type_visit1 != sample_data.type_visit2 + date_now = round_to_milliseconds(now()) + visit3 = OriginVisit(origin=origin.url, date=date_now, type="hg",) assert sample_data.date_visit1 < sample_data.date_visit2 + assert sample_data.date_visit2 < date_now ov1, ov2, ov3 = swh_storage.origin_visit_add([visit1, visit2, visit3]) - origin_visit1 = swh_storage.origin_visit_get_by(origin.url, ov1.visit) - origin_visit3 = swh_storage.origin_visit_get_by(origin.url, ov3.visit) - - assert sample_data.type_visit1 != sample_data.type_visit2 - # Check type filter is ok - actual_ov1 = swh_storage.origin_visit_get_latest( - origin.url, type=sample_data.type_visit1, - ) - assert actual_ov1 == origin_visit1 - - actual_ov3 = swh_storage.origin_visit_get_latest( - origin.url, type=sample_data.type_visit2, - ) - assert actual_ov3 == origin_visit3 - - new_type = "npm" - assert new_type not in [sample_data.type_visit1, sample_data.type_visit2] - - assert ( - swh_storage.origin_visit_get_latest( - origin.url, type=new_type, # no visit matching that type - ) - is None + actual_visit = swh_storage.origin_visit_get_latest(origin.url, type="git") + assert actual_visit == ov1 + actual_visit = swh_storage.origin_visit_get_latest(origin.url, type="hg") + assert actual_visit == ov3 + actual_visit_unknown_type = swh_storage.origin_visit_get_latest( + origin.url, type="npm", # no visit matching that type ) + assert actual_visit_unknown_type is None def test_origin_visit_get_latest(self, swh_storage, sample_data): empty_snapshot, complete_snapshot = sample_data.snapshots[1:3] @@ -1976,152 +1949,177 @@ swh_storage.origin_add([origin]) visit1 = OriginVisit( - origin=origin.url, - date=sample_data.date_visit1, - type=sample_data.type_visit1, + origin=origin.url, date=sample_data.date_visit1, type="git", ) visit2 = OriginVisit( - origin=origin.url, - date=sample_data.date_visit2, - type=sample_data.type_visit2, - ) - # Add a visit with the same date as the previous one - visit3 = OriginVisit( - origin=origin.url, - date=sample_data.date_visit2, - type=sample_data.type_visit2, + origin=origin.url, date=sample_data.date_visit2, type="hg", ) + date_now = round_to_milliseconds(now()) + visit3 = OriginVisit(origin=origin.url, date=date_now, type="hg",) + assert visit1.date < visit2.date + assert visit2.date < visit3.date ov1, ov2, ov3 = swh_storage.origin_visit_add([visit1, visit2, visit3]) - origin_visit1 = swh_storage.origin_visit_get_by(origin.url, ov1.visit) - origin_visit2 = swh_storage.origin_visit_get_by(origin.url, ov2.visit) - origin_visit3 = swh_storage.origin_visit_get_by(origin.url, ov3.visit) - # Two visits, both with no snapshot - assert origin_visit3 == swh_storage.origin_visit_get_latest(origin.url) - assert ( - swh_storage.origin_visit_get_latest(origin.url, require_snapshot=True) - is None + # no filters, latest visit is the last one (whose date is most recent) + actual_visit = swh_storage.origin_visit_get_latest(origin.url) + assert actual_visit == ov3 + + # 3 visits, none has snapshot so nothing is returned + actual_visit = swh_storage.origin_visit_get_latest( + origin.url, require_snapshot=True ) + assert actual_visit is None - # Add snapshot to visit1; require_snapshot=True makes it return - # visit1 and require_snapshot=False still returns visit2 + # visit are created with "created" status, so nothing will get returned + actual_visit = swh_storage.origin_visit_get_latest( + origin.url, allowed_statuses=["partial"] + ) + assert actual_visit is None + + # visit are created with "created" status, so most recent again + actual_visit = swh_storage.origin_visit_get_latest( + origin.url, allowed_statuses=["created"] + ) + assert actual_visit == ov3 + # Add snapshot to visit1; require_snapshot=True makes it return first visit swh_storage.snapshot_add([complete_snapshot]) - swh_storage.origin_visit_status_add( - [ - OriginVisitStatus( - origin=origin.url, - visit=ov1.visit, - date=now(), - status="ongoing", - snapshot=complete_snapshot.id, - ) - ] + visit_status_with_snapshot = OriginVisitStatus( + origin=origin.url, + visit=ov1.visit, + date=round_to_milliseconds(now()), + status="ongoing", + snapshot=complete_snapshot.id, ) + swh_storage.origin_visit_status_add([visit_status_with_snapshot]) + # only the first visit has a snapshot now actual_visit = swh_storage.origin_visit_get_latest( origin.url, require_snapshot=True ) - assert actual_visit == { - **origin_visit1, - "snapshot": complete_snapshot.id, - "status": "ongoing", # visit1 has status created now - } + assert actual_visit == ov1 + + # only the first visit has a status ongoing now + actual_visit = swh_storage.origin_visit_get_latest( + origin.url, allowed_statuses=["ongoing"] + ) + assert actual_visit == ov1 - assert origin_visit3 == swh_storage.origin_visit_get_latest(origin.url) + actual_visit_status = swh_storage.origin_visit_status_get_latest( + origin.url, ov1.visit, require_snapshot=True + ) + assert actual_visit_status == visit_status_with_snapshot + + # ... and require_snapshot=False (defaults) still returns latest visit (3rd) + actual_visit = swh_storage.origin_visit_get_latest( + origin.url, require_snapshot=False + ) + assert actual_visit == ov3 + # no specific filter, this returns as before the latest visit + actual_visit = swh_storage.origin_visit_get_latest(origin.url) + assert actual_visit == ov3 # Status filter: all three visits are status=ongoing, so no visit # returned - assert ( - swh_storage.origin_visit_get_latest(origin.url, allowed_statuses=["full"]) - is None + actual_visit = swh_storage.origin_visit_get_latest( + origin.url, allowed_statuses=["full"] ) + assert actual_visit is None + visit_status1_full = OriginVisitStatus( + origin=origin.url, + visit=ov1.visit, + date=round_to_milliseconds(now()), + status="full", + snapshot=complete_snapshot.id, + ) # Mark the first visit as completed and check status filter again - swh_storage.origin_visit_status_add( - [ - OriginVisitStatus( - origin=origin.url, - visit=ov1.visit, - date=now(), - status="full", - snapshot=complete_snapshot.id, - ) - ] + swh_storage.origin_visit_status_add([visit_status1_full]) + + # only the first visit has the full status + actual_visit = swh_storage.origin_visit_get_latest( + origin.url, allowed_statuses=["full"] ) + assert actual_visit == ov1 - assert { - **origin_visit1, - "snapshot": complete_snapshot.id, - "status": "full", - } == swh_storage.origin_visit_get_latest(origin.url, allowed_statuses=["full"]) + actual_visit_status = swh_storage.origin_visit_status_get_latest( + origin.url, ov1.visit, allowed_statuses=["full"] + ) + assert actual_visit_status == visit_status1_full - assert origin_visit3 == swh_storage.origin_visit_get_latest(origin.url) + # no specific filter, this returns as before the latest visit + actual_visit = swh_storage.origin_visit_get_latest(origin.url) + assert actual_visit == ov3 # Add snapshot to visit2 and check that the new snapshot is returned swh_storage.snapshot_add([empty_snapshot]) - swh_storage.origin_visit_status_add( - [ - OriginVisitStatus( - origin=origin.url, - visit=ov2.visit, - date=now(), - status="ongoing", - snapshot=empty_snapshot.id, - ) - ] + visit_status2_full = OriginVisitStatus( + origin=origin.url, + visit=ov2.visit, + date=round_to_milliseconds(now()), + status="ongoing", + snapshot=empty_snapshot.id, ) - assert { - **origin_visit2, - "snapshot": empty_snapshot.id, - "status": "ongoing", - } == swh_storage.origin_visit_get_latest(origin.url, require_snapshot=True) + swh_storage.origin_visit_status_add([visit_status2_full]) + actual_visit = swh_storage.origin_visit_get_latest( + origin.url, require_snapshot=True + ) + # 2nd visit is most recent with a snapshot + assert actual_visit == ov2 + actual_visit_status = swh_storage.origin_visit_status_get_latest( + origin.url, ov2.visit, require_snapshot=True + ) + assert actual_visit_status == visit_status2_full - assert origin_visit3 == swh_storage.origin_visit_get_latest(origin.url) + # no specific filter, this returns as before the latest visit, 3rd one + actual_origin = swh_storage.origin_visit_get_latest(origin.url) + assert actual_origin == ov3 - # Check that the status filter is still working - assert { - **origin_visit1, - "snapshot": complete_snapshot.id, - "status": "full", - } == swh_storage.origin_visit_get_latest(origin.url, allowed_statuses=["full"]) + # full status is still the first visit + actual_visit = swh_storage.origin_visit_get_latest( + origin.url, allowed_statuses=["full"] + ) + assert actual_visit == ov1 # Add snapshot to visit3 (same date as visit2) - swh_storage.origin_visit_status_add( - [ - OriginVisitStatus( - origin=origin.url, - visit=ov3.visit, - date=now(), - status="ongoing", - snapshot=complete_snapshot.id, - ) - ] + visit_status3_with_snapshot = OriginVisitStatus( + origin=origin.url, + visit=ov3.visit, + date=round_to_milliseconds(now()), + status="ongoing", + snapshot=complete_snapshot.id, + ) + swh_storage.origin_visit_status_add([visit_status3_with_snapshot]) + + # full status is still the first visit + actual_visit = swh_storage.origin_visit_get_latest( + origin.url, allowed_statuses=["full"], require_snapshot=True, + ) + assert actual_visit == ov1 + + actual_visit_status = swh_storage.origin_visit_status_get_latest( + origin.url, + visit=actual_visit.visit, + allowed_statuses=["full"], + require_snapshot=True, ) - assert { - **origin_visit1, - "snapshot": complete_snapshot.id, - "status": "full", - } == swh_storage.origin_visit_get_latest(origin.url, allowed_statuses=["full"]) - assert { - **origin_visit1, - "snapshot": complete_snapshot.id, - "status": "full", - } == swh_storage.origin_visit_get_latest( - origin.url, allowed_statuses=["full"], require_snapshot=True - ) - assert { - **origin_visit3, - "snapshot": complete_snapshot.id, - "status": "ongoing", - } == swh_storage.origin_visit_get_latest(origin.url) - - assert { - **origin_visit3, - "snapshot": complete_snapshot.id, - "status": "ongoing", - } == swh_storage.origin_visit_get_latest(origin.url, require_snapshot=True) + assert actual_visit_status == visit_status1_full + + # most recent is still the 3rd visit + actual_visit = swh_storage.origin_visit_get_latest(origin.url) + assert actual_visit == ov3 + + # 3rd visit has a snapshot now, so it's elected + actual_visit = swh_storage.origin_visit_get_latest( + origin.url, require_snapshot=True + ) + assert actual_visit == ov3 + + actual_visit_status = swh_storage.origin_visit_status_get_latest( + origin.url, ov3.visit, require_snapshot=True + ) + assert actual_visit_status == visit_status3_with_snapshot def test_origin_visit_status_get_latest(self, swh_storage, sample_data): snapshot = sample_data.snapshots[2] @@ -2146,8 +2144,7 @@ ) swh_storage.snapshot_add([snapshot]) - date_now = now() - date_now = round_to_milliseconds(date_now) + date_now = round_to_milliseconds(now()) assert sample_data.date_visit1 < sample_data.date_visit2 assert sample_data.date_visit2 < date_now