diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py --- a/swh/storage/cassandra/storage.py +++ b/swh/storage/cassandra/storage.py @@ -924,6 +924,7 @@ def origin_visit_get_latest( self, origin: str, + type: Optional[str] = None, allowed_statuses: Optional[List[str]] = None, require_snapshot: bool = False, ) -> Optional[Dict[str, Any]]: @@ -933,6 +934,8 @@ for row in rows: visit = self._format_origin_visit_row(row) updated_visit = self._origin_visit_apply_last_status(visit) + if type is not None and updated_visit["type"] != type: + continue if allowed_statuses and updated_visit["status"] not in allowed_statuses: continue if require_snapshot and updated_visit["snapshot"] is None: diff --git a/swh/storage/db.py b/swh/storage/db.py --- a/swh/storage/db.py +++ b/swh/storage/db.py @@ -6,7 +6,7 @@ import datetime import random import select -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, Iterable, List, Optional, Tuple from swh.core.db import BaseDb from swh.core.db.db_utils import stored_procedure, jsonize @@ -671,13 +671,19 @@ return bool(cur.fetchone()) def origin_visit_get_latest( - self, origin_id: str, allowed_statuses=None, require_snapshot=False, cur=None + self, + origin_id: str, + type: Optional[str], + allowed_statuses: Optional[Iterable[str]] = None, + require_snapshot: bool = False, + cur=None, ): """Retrieve the most recent origin_visit of the given origin, with optional filters. Args: origin_id: the origin concerned + type: Optional visit type to filter on allowed_statuses: the visit statuses allowed for the returned visit require_snapshot (bool): If True, only a visit with a known snapshot will be returned. @@ -697,6 +703,10 @@ query_parts.append("WHERE o.url = %s") query_params: List[Any] = [origin_id] + if type is not None: + query_parts.append("AND ov.type = %s") + query_params.append(type) + if require_snapshot: query_parts.append("AND ovs.snapshot is not null") diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py --- a/swh/storage/in_memory.py +++ b/swh/storage/in_memory.py @@ -918,6 +918,7 @@ def origin_visit_get_latest( self, origin: str, + type: Optional[str] = None, allowed_statuses: Optional[List[str]] = None, require_snapshot: bool = False, ) -> Optional[Dict[str, Any]]: @@ -932,6 +933,8 @@ if visit is not None ] + if type is not None: + visits = [visit for visit in visits if visit.type == type] if allowed_statuses is not None: visits = [visit for visit in visits if visit.status in allowed_statuses] if require_snapshot: diff --git a/swh/storage/interface.py b/swh/storage/interface.py --- a/swh/storage/interface.py +++ b/swh/storage/interface.py @@ -866,6 +866,7 @@ def origin_visit_get_latest( self, origin: str, + type: Optional[str] = None, allowed_statuses: Optional[List[str]] = None, require_snapshot: bool = False, ) -> Optional[Dict[str, Any]]: @@ -875,6 +876,8 @@ Args: origin: origin URL + type: Optional visit type to filter on (e.g git, tar, dsc, svn, + hg, npm, pypi, ...) allowed_statuses: list of visit statuses considered to find the latest visit. For instance, ``allowed_statuses=['full']`` will only consider visits that @@ -893,6 +896,7 @@ - **metadata**: Data associated to the visit - **snapshot** (Optional[sha1_git]): identifier of the snapshot associated to the visit + """ ... diff --git a/swh/storage/storage.py b/swh/storage/storage.py --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -984,6 +984,7 @@ def origin_visit_get_latest( self, origin: str, + type: Optional[str] = None, allowed_statuses: Optional[List[str]] = None, require_snapshot: bool = False, db=None, @@ -991,6 +992,7 @@ ) -> Optional[Dict[str, Any]]: row = db.origin_visit_get_latest( origin, + type=type, allowed_statuses=allowed_statuses, require_snapshot=require_snapshot, cur=cur, diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py --- a/swh/storage/tests/test_storage.py +++ b/swh/storage/tests/test_storage.py @@ -1784,6 +1784,7 @@ ) assert origin2.url != origin1.url assert origin_visit2 + assert origin_visit2["type"] == data.type_visit2 assert origin_visit2["status"] == "ongoing" assert origin_visit2["snapshot"] is None assert origin_visit2["metadata"] == {"intrinsic": "something"} @@ -2001,16 +2002,17 @@ assert actual_origin_visit is None def test_origin_visit_get_latest(self, swh_storage): - origin_url = swh_storage.origin_add_one(data.origin) + origin = Origin.from_dict(data.origin) + swh_storage.origin_add_one(origin) visit1 = OriginVisit( - origin=origin_url, + origin=origin.url, date=data.date_visit1, type=data.type_visit1, status="ongoing", snapshot=None, ) visit2 = OriginVisit( - origin=origin_url, + origin=origin.url, date=data.date_visit2, type=data.type_visit2, status="ongoing", @@ -2018,49 +2020,76 @@ ) # Add a visit with the same date as the previous one visit3 = OriginVisit( - origin=origin_url, + origin=origin.url, date=data.date_visit2, type=data.type_visit2, status="ongoing", snapshot=None, ) + assert data.type_visit1 != data.type_visit2 + assert data.date_visit1 < data.date_visit2 + ov1, ov2, ov3 = swh_storage.origin_visit_add([visit1, visit2, visit3]) + origin_visit1 = swh_storage.origin_visit_get_by(origin.url, ov1.visit) + origin_visit2 = swh_storage.origin_visit_get_by(origin.url, ov2.visit) + origin_visit3 = swh_storage.origin_visit_get_by(origin.url, ov3.visit) + + assert data.type_visit1 != data.type_visit2 + + # Check type filter is ok + actual_ov1 = swh_storage.origin_visit_get_latest( + origin.url, type=data.type_visit1, + ) + assert actual_ov1 == origin_visit1 + + actual_ov3 = swh_storage.origin_visit_get_latest( + origin.url, type=data.type_visit2, + ) + assert actual_ov3 == origin_visit3 + + new_type = "npm" + assert new_type not in [data.type_visit1, data.type_visit2] + + assert ( + swh_storage.origin_visit_get_latest( + origin.url, type=new_type, # no visit matching that type + ) + is None + ) - origin_visit1 = swh_storage.origin_visit_get_by(origin_url, ov1.visit) - origin_visit2 = swh_storage.origin_visit_get_by(origin_url, ov2.visit) - origin_visit3 = swh_storage.origin_visit_get_by(origin_url, ov3.visit) # Two visits, both with no snapshot - assert origin_visit3 == swh_storage.origin_visit_get_latest(origin_url) + assert origin_visit3 == swh_storage.origin_visit_get_latest(origin.url) assert ( - swh_storage.origin_visit_get_latest(origin_url, require_snapshot=True) + swh_storage.origin_visit_get_latest(origin.url, require_snapshot=True) is None ) # Add snapshot to visit1; require_snapshot=True makes it return # visit1 and require_snapshot=False still returns visit2 - swh_storage.snapshot_add([data.complete_snapshot]) + complete_snapshot = Snapshot.from_dict(data.complete_snapshot) + swh_storage.snapshot_add([complete_snapshot]) swh_storage.origin_visit_status_add( [ OriginVisitStatus( - origin=origin_url, + origin=origin.url, visit=ov1.visit, date=now(), status="ongoing", - snapshot=data.complete_snapshot["id"], + snapshot=complete_snapshot.id, ) ] ) assert { **origin_visit1, - "snapshot": data.complete_snapshot["id"], - } == swh_storage.origin_visit_get_latest(origin_url, require_snapshot=True) + "snapshot": complete_snapshot.id, + } == swh_storage.origin_visit_get_latest(origin.url, require_snapshot=True) - assert origin_visit3 == swh_storage.origin_visit_get_latest(origin_url) + assert origin_visit3 == swh_storage.origin_visit_get_latest(origin.url) # Status filter: all three visits are status=ongoing, so no visit # returned assert ( - swh_storage.origin_visit_get_latest(origin_url, allowed_statuses=["full"]) + swh_storage.origin_visit_get_latest(origin.url, allowed_statuses=["full"]) is None ) @@ -2068,86 +2097,85 @@ swh_storage.origin_visit_status_add( [ OriginVisitStatus( - origin=origin_url, + origin=origin.url, visit=ov1.visit, date=now(), status="full", - snapshot=data.complete_snapshot["id"], + snapshot=complete_snapshot.id, ) ] ) assert { **origin_visit1, - "snapshot": data.complete_snapshot["id"], + "snapshot": complete_snapshot.id, "status": "full", - } == swh_storage.origin_visit_get_latest(origin_url, allowed_statuses=["full"]) + } == swh_storage.origin_visit_get_latest(origin.url, allowed_statuses=["full"]) - assert origin_visit3 == swh_storage.origin_visit_get_latest(origin_url) + assert origin_visit3 == swh_storage.origin_visit_get_latest(origin.url) # Add snapshot to visit2 and check that the new snapshot is returned - swh_storage.snapshot_add([data.empty_snapshot]) + empty_snapshot = Snapshot.from_dict(data.empty_snapshot) + swh_storage.snapshot_add([empty_snapshot]) swh_storage.origin_visit_status_add( [ OriginVisitStatus( - origin=origin_url, + origin=origin.url, visit=ov2.visit, date=now(), status="ongoing", - snapshot=data.empty_snapshot["id"], + snapshot=empty_snapshot.id, ) ] ) assert { **origin_visit2, - "snapshot": data.empty_snapshot["id"], - } == swh_storage.origin_visit_get_latest(origin_url, require_snapshot=True) + "snapshot": empty_snapshot.id, + } == swh_storage.origin_visit_get_latest(origin.url, require_snapshot=True) - assert origin_visit3 == swh_storage.origin_visit_get_latest(origin_url) + assert origin_visit3 == swh_storage.origin_visit_get_latest(origin.url) # Check that the status filter is still working assert { **origin_visit1, - "snapshot": data.complete_snapshot["id"], + "snapshot": complete_snapshot.id, "status": "full", - } == swh_storage.origin_visit_get_latest(origin_url, allowed_statuses=["full"]) + } == swh_storage.origin_visit_get_latest(origin.url, allowed_statuses=["full"]) # Add snapshot to visit3 (same date as visit2) - swh_storage.snapshot_add([data.complete_snapshot]) - swh_storage.origin_visit_status_add( [ OriginVisitStatus( - origin=origin_url, + origin=origin.url, visit=ov3.visit, date=now(), status="ongoing", - snapshot=data.complete_snapshot["id"], + snapshot=complete_snapshot.id, ) ] ) assert { **origin_visit1, - "snapshot": data.complete_snapshot["id"], + "snapshot": complete_snapshot.id, "status": "full", - } == swh_storage.origin_visit_get_latest(origin_url, allowed_statuses=["full"]) + } == swh_storage.origin_visit_get_latest(origin.url, allowed_statuses=["full"]) assert { **origin_visit1, - "snapshot": data.complete_snapshot["id"], + "snapshot": complete_snapshot.id, "status": "full", } == swh_storage.origin_visit_get_latest( - origin_url, allowed_statuses=["full"], require_snapshot=True + origin.url, allowed_statuses=["full"], require_snapshot=True ) assert { **origin_visit3, - "snapshot": data.complete_snapshot["id"], - } == swh_storage.origin_visit_get_latest(origin_url) + "snapshot": complete_snapshot.id, + } == swh_storage.origin_visit_get_latest(origin.url) assert { **origin_visit3, - "snapshot": data.complete_snapshot["id"], - } == swh_storage.origin_visit_get_latest(origin_url, require_snapshot=True) + "snapshot": complete_snapshot.id, + } == swh_storage.origin_visit_get_latest(origin.url, require_snapshot=True) def test_origin_visit_status_get_latest(self, swh_storage): origin1 = Origin.from_dict(data.origin)