Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7123111
D3620.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
23 KB
Subscribers
None
D3620.diff
View Options
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -885,7 +885,7 @@
type: Optional[str] = None,
allowed_statuses: Optional[List[str]] = None,
require_snapshot: bool = False,
- ) -> Optional[Dict[str, Any]]:
+ ) -> Optional[OriginVisit]:
# TODO: Do not fetch all visits
rows = self._cql_runner.origin_visit_get_all(origin)
latest_visit = None
@@ -908,7 +908,14 @@
latest_visit = updated_visit
- return latest_visit
+ if latest_visit is None:
+ return None
+ return OriginVisit(
+ origin=latest_visit["origin"],
+ visit=latest_visit["visit"],
+ date=latest_visit["date"],
+ type=latest_visit["type"],
+ )
def origin_visit_status_get_latest(
self,
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -916,29 +916,33 @@
type: Optional[str] = None,
allowed_statuses: Optional[List[str]] = None,
require_snapshot: bool = False,
- ) -> Optional[Dict[str, Any]]:
+ ) -> Optional[OriginVisit]:
ori = self._origins.get(origin)
if not ori:
return None
- visits = self._origin_visits[ori.url]
- visits = [
- self._origin_visit_get_updated(visit.origin, visit.visit)
- for visit in visits
- if visit is not None
- ]
+ visits = sorted(
+ self._origin_visits[ori.url], key=lambda v: v.date, reverse=True,
+ )
+ for visit in visits:
+ if type is not None and visit.type != type:
+ continue
+ visit_statuses = self._origin_visit_statuses[origin, visit.visit]
- if type is not None:
- visits = [visit for visit in visits if visit["type"] == type]
- if allowed_statuses is not None:
- visits = [visit for visit in visits if visit["status"] in allowed_statuses]
- if require_snapshot:
- visits = [visit for visit in visits if visit["snapshot"]]
+ if allowed_statuses is not None:
+ visit_statuses = [
+ vs for vs in visit_statuses if vs.status in allowed_statuses
+ ]
+ if require_snapshot:
+ visit_statuses = [vs for vs in visit_statuses if vs.snapshot]
- visit = max(visits, key=lambda v: (v["date"], v["visit"]), default=None)
- if visit is None:
- return None
- return visit
+ if visit_statuses: # we found visit statuses matching criteria
+ visit_status = max(visit_statuses, key=lambda vs: (vs.date, vs.visit))
+ assert visit.origin == visit_status.origin
+ assert visit.visit == visit_status.visit
+ return visit
+
+ return None
def origin_visit_status_get_latest(
self,
diff --git a/swh/storage/interface.py b/swh/storage/interface.py
--- a/swh/storage/interface.py
+++ b/swh/storage/interface.py
@@ -853,7 +853,7 @@
type: Optional[str] = None,
allowed_statuses: Optional[List[str]] = None,
require_snapshot: bool = False,
- ) -> Optional[Dict[str, Any]]:
+ ) -> Optional[OriginVisit]:
"""Get the latest origin visit for the given origin, optionally
looking only for those with one of the given allowed_statuses
or for those with a snapshot.
@@ -870,16 +870,9 @@
will be returned.
Returns:
- dict: a dict with the following keys:
-
- - **origin**: the URL of the origin
- - **visit**: origin visit id
- - **type**: type of loader used for the visit
- - **date**: timestamp of such visit
- - **status**: Visit's new status
- - **metadata**: Data associated to the visit
- - **snapshot** (Optional[sha1_git]): identifier of the snapshot
- associated to the visit
+ OriginVisit matching the criteria if found, None otherwise. Note that as
+ OriginVisit no longer held reference on the visit status or snapshot, you
+ may want to use origin_visit_status_get_latest for those information.
"""
...
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -921,7 +921,7 @@
require_snapshot: bool = False,
db=None,
cur=None,
- ) -> Optional[Dict[str, Any]]:
+ ) -> Optional[OriginVisit]:
row = db.origin_visit_get_latest(
origin,
type=type,
@@ -930,7 +930,14 @@
cur=cur,
)
if row:
- return dict(zip(db.origin_visit_get_cols, row))
+ row_d = dict(zip(db.origin_visit_get_cols, row))
+ visit = OriginVisit(
+ origin=row_d["origin"],
+ visit=row_d["visit"],
+ date=row_d["date"],
+ type=row_d["type"],
+ )
+ return visit
return None
@timed
diff --git a/swh/storage/tests/algos/test_snapshot.py b/swh/storage/tests/algos/test_snapshot.py
--- a/swh/storage/tests/algos/test_snapshot.py
+++ b/swh/storage/tests/algos/test_snapshot.py
@@ -63,7 +63,6 @@
ov1 = swh_storage.origin_visit_get_latest(origin.url)
assert ov1 is not None
- visit_id = ov1["visit"]
# visit references a snapshot but the snapshot does not exist in backend for some
# reason
@@ -72,7 +71,7 @@
[
OriginVisitStatus(
origin=origin.url,
- visit=visit_id,
+ visit=ov1.visit,
date=origin_visit2.date,
status="partial",
snapshot=complete_snapshot.id,
@@ -93,7 +92,6 @@
swh_storage.origin_visit_add([visit1])
ov1 = swh_storage.origin_visit_get_latest(origin.url)
- visit_id = ov1["visit"]
# Add snapshot to visit1, latest snapshot = visit 1 snapshot
complete_snapshot = sample_data.snapshots[2]
@@ -103,7 +101,7 @@
[
OriginVisitStatus(
origin=origin.url,
- visit=visit_id,
+ visit=ov1.visit,
date=visit2.date,
status="partial",
snapshot=None,
@@ -124,7 +122,7 @@
[
OriginVisitStatus(
origin=origin.url,
- visit=visit_id,
+ visit=ov1.visit,
date=date_now,
status="full",
snapshot=complete_snapshot.id,
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -1647,7 +1647,7 @@
snapshot=None,
)
- date_visit_now = now()
+ date_visit_now = round_to_milliseconds(now())
visit_status1 = OriginVisitStatus(
origin=ov1.origin,
visit=ov1.visit,
@@ -1656,7 +1656,7 @@
snapshot=snapshot.id,
)
- date_visit_now = now()
+ date_visit_now = round_to_milliseconds(now())
visit_status2 = OriginVisitStatus(
origin=ov2.origin,
visit=ov2.visit,
@@ -1667,21 +1667,18 @@
)
swh_storage.origin_visit_status_add([visit_status1, visit_status2])
- origin_visit1 = swh_storage.origin_visit_get_latest(
- origin1.url, require_snapshot=True
+ visit = swh_storage.origin_visit_get_latest(origin1.url, require_snapshot=True)
+ visit_status = swh_storage.origin_visit_status_get_latest(
+ origin1.url, visit.visit, require_snapshot=True
)
- assert origin_visit1
- assert origin_visit1["status"] == "full"
- assert origin_visit1["snapshot"] == snapshot.id
+ assert visit_status == visit_status1
- origin_visit2 = swh_storage.origin_visit_get_latest(
- origin2.url, require_snapshot=False
+ visit = swh_storage.origin_visit_get_latest(origin2.url, require_snapshot=False)
+ visit_status = swh_storage.origin_visit_status_get_latest(
+ origin2.url, visit.visit, require_snapshot=False
)
assert origin2.url != origin1.url
- assert origin_visit2
- assert origin_visit2["status"] == "ongoing"
- assert origin_visit2["snapshot"] is None
- assert origin_visit2["metadata"] == {"intrinsic": "something"}
+ assert visit_status == visit_status2
actual_objects = list(swh_storage.journal_writer.journal.objects)
@@ -1924,51 +1921,27 @@
origin = sample_data.origin
swh_storage.origin_add([origin])
visit1 = OriginVisit(
- origin=origin.url,
- date=sample_data.date_visit1,
- type=sample_data.type_visit1,
+ origin=origin.url, date=sample_data.date_visit1, type="git",
)
visit2 = OriginVisit(
- origin=origin.url,
- date=sample_data.date_visit2,
- type=sample_data.type_visit2,
- )
- # Add a visit with the same date as the previous one
- visit3 = OriginVisit(
- origin=origin.url,
- date=sample_data.date_visit2,
- type=sample_data.type_visit2,
+ origin=origin.url, date=sample_data.date_visit2, type="hg",
)
- assert sample_data.type_visit1 != sample_data.type_visit2
+ date_now = round_to_milliseconds(now())
+ visit3 = OriginVisit(origin=origin.url, date=date_now, type="hg",)
assert sample_data.date_visit1 < sample_data.date_visit2
+ assert sample_data.date_visit2 < date_now
ov1, ov2, ov3 = swh_storage.origin_visit_add([visit1, visit2, visit3])
- origin_visit1 = swh_storage.origin_visit_get_by(origin.url, ov1.visit)
- origin_visit3 = swh_storage.origin_visit_get_by(origin.url, ov3.visit)
-
- assert sample_data.type_visit1 != sample_data.type_visit2
-
# Check type filter is ok
- actual_ov1 = swh_storage.origin_visit_get_latest(
- origin.url, type=sample_data.type_visit1,
- )
- assert actual_ov1 == origin_visit1
-
- actual_ov3 = swh_storage.origin_visit_get_latest(
- origin.url, type=sample_data.type_visit2,
- )
- assert actual_ov3 == origin_visit3
-
- new_type = "npm"
- assert new_type not in [sample_data.type_visit1, sample_data.type_visit2]
-
- assert (
- swh_storage.origin_visit_get_latest(
- origin.url, type=new_type, # no visit matching that type
- )
- is None
+ actual_visit = swh_storage.origin_visit_get_latest(origin.url, type="git")
+ assert actual_visit == ov1
+ actual_visit = swh_storage.origin_visit_get_latest(origin.url, type="hg")
+ assert actual_visit == ov3
+ actual_visit_unknown_type = swh_storage.origin_visit_get_latest(
+ origin.url, type="npm", # no visit matching that type
)
+ assert actual_visit_unknown_type is None
def test_origin_visit_get_latest(self, swh_storage, sample_data):
empty_snapshot, complete_snapshot = sample_data.snapshots[1:3]
@@ -1976,152 +1949,177 @@
swh_storage.origin_add([origin])
visit1 = OriginVisit(
- origin=origin.url,
- date=sample_data.date_visit1,
- type=sample_data.type_visit1,
+ origin=origin.url, date=sample_data.date_visit1, type="git",
)
visit2 = OriginVisit(
- origin=origin.url,
- date=sample_data.date_visit2,
- type=sample_data.type_visit2,
- )
- # Add a visit with the same date as the previous one
- visit3 = OriginVisit(
- origin=origin.url,
- date=sample_data.date_visit2,
- type=sample_data.type_visit2,
+ origin=origin.url, date=sample_data.date_visit2, type="hg",
)
+ date_now = round_to_milliseconds(now())
+ visit3 = OriginVisit(origin=origin.url, date=date_now, type="hg",)
+ assert visit1.date < visit2.date
+ assert visit2.date < visit3.date
ov1, ov2, ov3 = swh_storage.origin_visit_add([visit1, visit2, visit3])
- origin_visit1 = swh_storage.origin_visit_get_by(origin.url, ov1.visit)
- origin_visit2 = swh_storage.origin_visit_get_by(origin.url, ov2.visit)
- origin_visit3 = swh_storage.origin_visit_get_by(origin.url, ov3.visit)
- # Two visits, both with no snapshot
- assert origin_visit3 == swh_storage.origin_visit_get_latest(origin.url)
- assert (
- swh_storage.origin_visit_get_latest(origin.url, require_snapshot=True)
- is None
+ # no filters, latest visit is the last one (whose date is most recent)
+ actual_visit = swh_storage.origin_visit_get_latest(origin.url)
+ assert actual_visit == ov3
+
+ # 3 visits, none has snapshot so nothing is returned
+ actual_visit = swh_storage.origin_visit_get_latest(
+ origin.url, require_snapshot=True
)
+ assert actual_visit is None
- # Add snapshot to visit1; require_snapshot=True makes it return
- # visit1 and require_snapshot=False still returns visit2
+ # visit are created with "created" status, so nothing will get returned
+ actual_visit = swh_storage.origin_visit_get_latest(
+ origin.url, allowed_statuses=["partial"]
+ )
+ assert actual_visit is None
+
+ # visit are created with "created" status, so most recent again
+ actual_visit = swh_storage.origin_visit_get_latest(
+ origin.url, allowed_statuses=["created"]
+ )
+ assert actual_visit == ov3
+ # Add snapshot to visit1; require_snapshot=True makes it return first visit
swh_storage.snapshot_add([complete_snapshot])
- swh_storage.origin_visit_status_add(
- [
- OriginVisitStatus(
- origin=origin.url,
- visit=ov1.visit,
- date=now(),
- status="ongoing",
- snapshot=complete_snapshot.id,
- )
- ]
+ visit_status_with_snapshot = OriginVisitStatus(
+ origin=origin.url,
+ visit=ov1.visit,
+ date=round_to_milliseconds(now()),
+ status="ongoing",
+ snapshot=complete_snapshot.id,
)
+ swh_storage.origin_visit_status_add([visit_status_with_snapshot])
+ # only the first visit has a snapshot now
actual_visit = swh_storage.origin_visit_get_latest(
origin.url, require_snapshot=True
)
- assert actual_visit == {
- **origin_visit1,
- "snapshot": complete_snapshot.id,
- "status": "ongoing", # visit1 has status created now
- }
+ assert actual_visit == ov1
+
+ # only the first visit has a status ongoing now
+ actual_visit = swh_storage.origin_visit_get_latest(
+ origin.url, allowed_statuses=["ongoing"]
+ )
+ assert actual_visit == ov1
- assert origin_visit3 == swh_storage.origin_visit_get_latest(origin.url)
+ actual_visit_status = swh_storage.origin_visit_status_get_latest(
+ origin.url, ov1.visit, require_snapshot=True
+ )
+ assert actual_visit_status == visit_status_with_snapshot
+
+ # ... and require_snapshot=False (defaults) still returns latest visit (3rd)
+ actual_visit = swh_storage.origin_visit_get_latest(
+ origin.url, require_snapshot=False
+ )
+ assert actual_visit == ov3
+ # no specific filter, this returns as before the latest visit
+ actual_visit = swh_storage.origin_visit_get_latest(origin.url)
+ assert actual_visit == ov3
# Status filter: all three visits are status=ongoing, so no visit
# returned
- assert (
- swh_storage.origin_visit_get_latest(origin.url, allowed_statuses=["full"])
- is None
+ actual_visit = swh_storage.origin_visit_get_latest(
+ origin.url, allowed_statuses=["full"]
)
+ assert actual_visit is None
+ visit_status1_full = OriginVisitStatus(
+ origin=origin.url,
+ visit=ov1.visit,
+ date=round_to_milliseconds(now()),
+ status="full",
+ snapshot=complete_snapshot.id,
+ )
# Mark the first visit as completed and check status filter again
- swh_storage.origin_visit_status_add(
- [
- OriginVisitStatus(
- origin=origin.url,
- visit=ov1.visit,
- date=now(),
- status="full",
- snapshot=complete_snapshot.id,
- )
- ]
+ swh_storage.origin_visit_status_add([visit_status1_full])
+
+ # only the first visit has the full status
+ actual_visit = swh_storage.origin_visit_get_latest(
+ origin.url, allowed_statuses=["full"]
)
+ assert actual_visit == ov1
- assert {
- **origin_visit1,
- "snapshot": complete_snapshot.id,
- "status": "full",
- } == swh_storage.origin_visit_get_latest(origin.url, allowed_statuses=["full"])
+ actual_visit_status = swh_storage.origin_visit_status_get_latest(
+ origin.url, ov1.visit, allowed_statuses=["full"]
+ )
+ assert actual_visit_status == visit_status1_full
- assert origin_visit3 == swh_storage.origin_visit_get_latest(origin.url)
+ # no specific filter, this returns as before the latest visit
+ actual_visit = swh_storage.origin_visit_get_latest(origin.url)
+ assert actual_visit == ov3
# Add snapshot to visit2 and check that the new snapshot is returned
swh_storage.snapshot_add([empty_snapshot])
- swh_storage.origin_visit_status_add(
- [
- OriginVisitStatus(
- origin=origin.url,
- visit=ov2.visit,
- date=now(),
- status="ongoing",
- snapshot=empty_snapshot.id,
- )
- ]
+ visit_status2_full = OriginVisitStatus(
+ origin=origin.url,
+ visit=ov2.visit,
+ date=round_to_milliseconds(now()),
+ status="ongoing",
+ snapshot=empty_snapshot.id,
)
- assert {
- **origin_visit2,
- "snapshot": empty_snapshot.id,
- "status": "ongoing",
- } == swh_storage.origin_visit_get_latest(origin.url, require_snapshot=True)
+ swh_storage.origin_visit_status_add([visit_status2_full])
+ actual_visit = swh_storage.origin_visit_get_latest(
+ origin.url, require_snapshot=True
+ )
+ # 2nd visit is most recent with a snapshot
+ assert actual_visit == ov2
+ actual_visit_status = swh_storage.origin_visit_status_get_latest(
+ origin.url, ov2.visit, require_snapshot=True
+ )
+ assert actual_visit_status == visit_status2_full
- assert origin_visit3 == swh_storage.origin_visit_get_latest(origin.url)
+ # no specific filter, this returns as before the latest visit, 3rd one
+ actual_origin = swh_storage.origin_visit_get_latest(origin.url)
+ assert actual_origin == ov3
- # Check that the status filter is still working
- assert {
- **origin_visit1,
- "snapshot": complete_snapshot.id,
- "status": "full",
- } == swh_storage.origin_visit_get_latest(origin.url, allowed_statuses=["full"])
+ # full status is still the first visit
+ actual_visit = swh_storage.origin_visit_get_latest(
+ origin.url, allowed_statuses=["full"]
+ )
+ assert actual_visit == ov1
# Add snapshot to visit3 (same date as visit2)
- swh_storage.origin_visit_status_add(
- [
- OriginVisitStatus(
- origin=origin.url,
- visit=ov3.visit,
- date=now(),
- status="ongoing",
- snapshot=complete_snapshot.id,
- )
- ]
+ visit_status3_with_snapshot = OriginVisitStatus(
+ origin=origin.url,
+ visit=ov3.visit,
+ date=round_to_milliseconds(now()),
+ status="ongoing",
+ snapshot=complete_snapshot.id,
+ )
+ swh_storage.origin_visit_status_add([visit_status3_with_snapshot])
+
+ # full status is still the first visit
+ actual_visit = swh_storage.origin_visit_get_latest(
+ origin.url, allowed_statuses=["full"], require_snapshot=True,
+ )
+ assert actual_visit == ov1
+
+ actual_visit_status = swh_storage.origin_visit_status_get_latest(
+ origin.url,
+ visit=actual_visit.visit,
+ allowed_statuses=["full"],
+ require_snapshot=True,
)
- assert {
- **origin_visit1,
- "snapshot": complete_snapshot.id,
- "status": "full",
- } == swh_storage.origin_visit_get_latest(origin.url, allowed_statuses=["full"])
- assert {
- **origin_visit1,
- "snapshot": complete_snapshot.id,
- "status": "full",
- } == swh_storage.origin_visit_get_latest(
- origin.url, allowed_statuses=["full"], require_snapshot=True
- )
- assert {
- **origin_visit3,
- "snapshot": complete_snapshot.id,
- "status": "ongoing",
- } == swh_storage.origin_visit_get_latest(origin.url)
-
- assert {
- **origin_visit3,
- "snapshot": complete_snapshot.id,
- "status": "ongoing",
- } == swh_storage.origin_visit_get_latest(origin.url, require_snapshot=True)
+ assert actual_visit_status == visit_status1_full
+
+ # most recent is still the 3rd visit
+ actual_visit = swh_storage.origin_visit_get_latest(origin.url)
+ assert actual_visit == ov3
+
+ # 3rd visit has a snapshot now, so it's elected
+ actual_visit = swh_storage.origin_visit_get_latest(
+ origin.url, require_snapshot=True
+ )
+ assert actual_visit == ov3
+
+ actual_visit_status = swh_storage.origin_visit_status_get_latest(
+ origin.url, ov3.visit, require_snapshot=True
+ )
+ assert actual_visit_status == visit_status3_with_snapshot
def test_origin_visit_status_get_latest(self, swh_storage, sample_data):
snapshot = sample_data.snapshots[2]
@@ -2146,8 +2144,7 @@
)
swh_storage.snapshot_add([snapshot])
- date_now = now()
- date_now = round_to_milliseconds(date_now)
+ date_now = round_to_milliseconds(now())
assert sample_data.date_visit1 < sample_data.date_visit2
assert sample_data.date_visit2 < date_now
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Tue, Dec 17, 9:56 PM (2 d, 8 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3216489
Attached To
D3620: storage*: type origin_visit_get_latest endpoint result
Event Timeline
Log In to Comment