Page MenuHomeSoftware Heritage

D3620.diff
No OneTemporary

D3620.diff

diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -885,7 +885,7 @@
type: Optional[str] = None,
allowed_statuses: Optional[List[str]] = None,
require_snapshot: bool = False,
- ) -> Optional[Dict[str, Any]]:
+ ) -> Optional[OriginVisit]:
# TODO: Do not fetch all visits
rows = self._cql_runner.origin_visit_get_all(origin)
latest_visit = None
@@ -908,7 +908,14 @@
latest_visit = updated_visit
- return latest_visit
+ if latest_visit is None:
+ return None
+ return OriginVisit(
+ origin=latest_visit["origin"],
+ visit=latest_visit["visit"],
+ date=latest_visit["date"],
+ type=latest_visit["type"],
+ )
def origin_visit_status_get_latest(
self,
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -916,29 +916,33 @@
type: Optional[str] = None,
allowed_statuses: Optional[List[str]] = None,
require_snapshot: bool = False,
- ) -> Optional[Dict[str, Any]]:
+ ) -> Optional[OriginVisit]:
ori = self._origins.get(origin)
if not ori:
return None
- visits = self._origin_visits[ori.url]
- visits = [
- self._origin_visit_get_updated(visit.origin, visit.visit)
- for visit in visits
- if visit is not None
- ]
+ visits = sorted(
+ self._origin_visits[ori.url], key=lambda v: v.date, reverse=True,
+ )
+ for visit in visits:
+ if type is not None and visit.type != type:
+ continue
+ visit_statuses = self._origin_visit_statuses[origin, visit.visit]
- if type is not None:
- visits = [visit for visit in visits if visit["type"] == type]
- if allowed_statuses is not None:
- visits = [visit for visit in visits if visit["status"] in allowed_statuses]
- if require_snapshot:
- visits = [visit for visit in visits if visit["snapshot"]]
+ if allowed_statuses is not None:
+ visit_statuses = [
+ vs for vs in visit_statuses if vs.status in allowed_statuses
+ ]
+ if require_snapshot:
+ visit_statuses = [vs for vs in visit_statuses if vs.snapshot]
- visit = max(visits, key=lambda v: (v["date"], v["visit"]), default=None)
- if visit is None:
- return None
- return visit
+ if visit_statuses: # we found visit statuses matching criteria
+ visit_status = max(visit_statuses, key=lambda vs: (vs.date, vs.visit))
+ assert visit.origin == visit_status.origin
+ assert visit.visit == visit_status.visit
+ return visit
+
+ return None
def origin_visit_status_get_latest(
self,
diff --git a/swh/storage/interface.py b/swh/storage/interface.py
--- a/swh/storage/interface.py
+++ b/swh/storage/interface.py
@@ -853,7 +853,7 @@
type: Optional[str] = None,
allowed_statuses: Optional[List[str]] = None,
require_snapshot: bool = False,
- ) -> Optional[Dict[str, Any]]:
+ ) -> Optional[OriginVisit]:
"""Get the latest origin visit for the given origin, optionally
looking only for those with one of the given allowed_statuses
or for those with a snapshot.
@@ -870,16 +870,9 @@
will be returned.
Returns:
- dict: a dict with the following keys:
-
- - **origin**: the URL of the origin
- - **visit**: origin visit id
- - **type**: type of loader used for the visit
- - **date**: timestamp of such visit
- - **status**: Visit's new status
- - **metadata**: Data associated to the visit
- - **snapshot** (Optional[sha1_git]): identifier of the snapshot
- associated to the visit
+ OriginVisit matching the criteria if found, None otherwise. Note that as
+ OriginVisit no longer held reference on the visit status or snapshot, you
+ may want to use origin_visit_status_get_latest for those information.
"""
...
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -921,7 +921,7 @@
require_snapshot: bool = False,
db=None,
cur=None,
- ) -> Optional[Dict[str, Any]]:
+ ) -> Optional[OriginVisit]:
row = db.origin_visit_get_latest(
origin,
type=type,
@@ -930,7 +930,14 @@
cur=cur,
)
if row:
- return dict(zip(db.origin_visit_get_cols, row))
+ row_d = dict(zip(db.origin_visit_get_cols, row))
+ visit = OriginVisit(
+ origin=row_d["origin"],
+ visit=row_d["visit"],
+ date=row_d["date"],
+ type=row_d["type"],
+ )
+ return visit
return None
@timed
diff --git a/swh/storage/tests/algos/test_snapshot.py b/swh/storage/tests/algos/test_snapshot.py
--- a/swh/storage/tests/algos/test_snapshot.py
+++ b/swh/storage/tests/algos/test_snapshot.py
@@ -63,7 +63,6 @@
ov1 = swh_storage.origin_visit_get_latest(origin.url)
assert ov1 is not None
- visit_id = ov1["visit"]
# visit references a snapshot but the snapshot does not exist in backend for some
# reason
@@ -72,7 +71,7 @@
[
OriginVisitStatus(
origin=origin.url,
- visit=visit_id,
+ visit=ov1.visit,
date=origin_visit2.date,
status="partial",
snapshot=complete_snapshot.id,
@@ -93,7 +92,6 @@
swh_storage.origin_visit_add([visit1])
ov1 = swh_storage.origin_visit_get_latest(origin.url)
- visit_id = ov1["visit"]
# Add snapshot to visit1, latest snapshot = visit 1 snapshot
complete_snapshot = sample_data.snapshots[2]
@@ -103,7 +101,7 @@
[
OriginVisitStatus(
origin=origin.url,
- visit=visit_id,
+ visit=ov1.visit,
date=visit2.date,
status="partial",
snapshot=None,
@@ -124,7 +122,7 @@
[
OriginVisitStatus(
origin=origin.url,
- visit=visit_id,
+ visit=ov1.visit,
date=date_now,
status="full",
snapshot=complete_snapshot.id,
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -1647,7 +1647,7 @@
snapshot=None,
)
- date_visit_now = now()
+ date_visit_now = round_to_milliseconds(now())
visit_status1 = OriginVisitStatus(
origin=ov1.origin,
visit=ov1.visit,
@@ -1656,7 +1656,7 @@
snapshot=snapshot.id,
)
- date_visit_now = now()
+ date_visit_now = round_to_milliseconds(now())
visit_status2 = OriginVisitStatus(
origin=ov2.origin,
visit=ov2.visit,
@@ -1667,21 +1667,18 @@
)
swh_storage.origin_visit_status_add([visit_status1, visit_status2])
- origin_visit1 = swh_storage.origin_visit_get_latest(
- origin1.url, require_snapshot=True
+ visit = swh_storage.origin_visit_get_latest(origin1.url, require_snapshot=True)
+ visit_status = swh_storage.origin_visit_status_get_latest(
+ origin1.url, visit.visit, require_snapshot=True
)
- assert origin_visit1
- assert origin_visit1["status"] == "full"
- assert origin_visit1["snapshot"] == snapshot.id
+ assert visit_status == visit_status1
- origin_visit2 = swh_storage.origin_visit_get_latest(
- origin2.url, require_snapshot=False
+ visit = swh_storage.origin_visit_get_latest(origin2.url, require_snapshot=False)
+ visit_status = swh_storage.origin_visit_status_get_latest(
+ origin2.url, visit.visit, require_snapshot=False
)
assert origin2.url != origin1.url
- assert origin_visit2
- assert origin_visit2["status"] == "ongoing"
- assert origin_visit2["snapshot"] is None
- assert origin_visit2["metadata"] == {"intrinsic": "something"}
+ assert visit_status == visit_status2
actual_objects = list(swh_storage.journal_writer.journal.objects)
@@ -1924,51 +1921,27 @@
origin = sample_data.origin
swh_storage.origin_add([origin])
visit1 = OriginVisit(
- origin=origin.url,
- date=sample_data.date_visit1,
- type=sample_data.type_visit1,
+ origin=origin.url, date=sample_data.date_visit1, type="git",
)
visit2 = OriginVisit(
- origin=origin.url,
- date=sample_data.date_visit2,
- type=sample_data.type_visit2,
- )
- # Add a visit with the same date as the previous one
- visit3 = OriginVisit(
- origin=origin.url,
- date=sample_data.date_visit2,
- type=sample_data.type_visit2,
+ origin=origin.url, date=sample_data.date_visit2, type="hg",
)
- assert sample_data.type_visit1 != sample_data.type_visit2
+ date_now = round_to_milliseconds(now())
+ visit3 = OriginVisit(origin=origin.url, date=date_now, type="hg",)
assert sample_data.date_visit1 < sample_data.date_visit2
+ assert sample_data.date_visit2 < date_now
ov1, ov2, ov3 = swh_storage.origin_visit_add([visit1, visit2, visit3])
- origin_visit1 = swh_storage.origin_visit_get_by(origin.url, ov1.visit)
- origin_visit3 = swh_storage.origin_visit_get_by(origin.url, ov3.visit)
-
- assert sample_data.type_visit1 != sample_data.type_visit2
-
# Check type filter is ok
- actual_ov1 = swh_storage.origin_visit_get_latest(
- origin.url, type=sample_data.type_visit1,
- )
- assert actual_ov1 == origin_visit1
-
- actual_ov3 = swh_storage.origin_visit_get_latest(
- origin.url, type=sample_data.type_visit2,
- )
- assert actual_ov3 == origin_visit3
-
- new_type = "npm"
- assert new_type not in [sample_data.type_visit1, sample_data.type_visit2]
-
- assert (
- swh_storage.origin_visit_get_latest(
- origin.url, type=new_type, # no visit matching that type
- )
- is None
+ actual_visit = swh_storage.origin_visit_get_latest(origin.url, type="git")
+ assert actual_visit == ov1
+ actual_visit = swh_storage.origin_visit_get_latest(origin.url, type="hg")
+ assert actual_visit == ov3
+ actual_visit_unknown_type = swh_storage.origin_visit_get_latest(
+ origin.url, type="npm", # no visit matching that type
)
+ assert actual_visit_unknown_type is None
def test_origin_visit_get_latest(self, swh_storage, sample_data):
empty_snapshot, complete_snapshot = sample_data.snapshots[1:3]
@@ -1976,152 +1949,177 @@
swh_storage.origin_add([origin])
visit1 = OriginVisit(
- origin=origin.url,
- date=sample_data.date_visit1,
- type=sample_data.type_visit1,
+ origin=origin.url, date=sample_data.date_visit1, type="git",
)
visit2 = OriginVisit(
- origin=origin.url,
- date=sample_data.date_visit2,
- type=sample_data.type_visit2,
- )
- # Add a visit with the same date as the previous one
- visit3 = OriginVisit(
- origin=origin.url,
- date=sample_data.date_visit2,
- type=sample_data.type_visit2,
+ origin=origin.url, date=sample_data.date_visit2, type="hg",
)
+ date_now = round_to_milliseconds(now())
+ visit3 = OriginVisit(origin=origin.url, date=date_now, type="hg",)
+ assert visit1.date < visit2.date
+ assert visit2.date < visit3.date
ov1, ov2, ov3 = swh_storage.origin_visit_add([visit1, visit2, visit3])
- origin_visit1 = swh_storage.origin_visit_get_by(origin.url, ov1.visit)
- origin_visit2 = swh_storage.origin_visit_get_by(origin.url, ov2.visit)
- origin_visit3 = swh_storage.origin_visit_get_by(origin.url, ov3.visit)
- # Two visits, both with no snapshot
- assert origin_visit3 == swh_storage.origin_visit_get_latest(origin.url)
- assert (
- swh_storage.origin_visit_get_latest(origin.url, require_snapshot=True)
- is None
+ # no filters, latest visit is the last one (whose date is most recent)
+ actual_visit = swh_storage.origin_visit_get_latest(origin.url)
+ assert actual_visit == ov3
+
+ # 3 visits, none has snapshot so nothing is returned
+ actual_visit = swh_storage.origin_visit_get_latest(
+ origin.url, require_snapshot=True
)
+ assert actual_visit is None
- # Add snapshot to visit1; require_snapshot=True makes it return
- # visit1 and require_snapshot=False still returns visit2
+ # visit are created with "created" status, so nothing will get returned
+ actual_visit = swh_storage.origin_visit_get_latest(
+ origin.url, allowed_statuses=["partial"]
+ )
+ assert actual_visit is None
+
+ # visit are created with "created" status, so most recent again
+ actual_visit = swh_storage.origin_visit_get_latest(
+ origin.url, allowed_statuses=["created"]
+ )
+ assert actual_visit == ov3
+ # Add snapshot to visit1; require_snapshot=True makes it return first visit
swh_storage.snapshot_add([complete_snapshot])
- swh_storage.origin_visit_status_add(
- [
- OriginVisitStatus(
- origin=origin.url,
- visit=ov1.visit,
- date=now(),
- status="ongoing",
- snapshot=complete_snapshot.id,
- )
- ]
+ visit_status_with_snapshot = OriginVisitStatus(
+ origin=origin.url,
+ visit=ov1.visit,
+ date=round_to_milliseconds(now()),
+ status="ongoing",
+ snapshot=complete_snapshot.id,
)
+ swh_storage.origin_visit_status_add([visit_status_with_snapshot])
+ # only the first visit has a snapshot now
actual_visit = swh_storage.origin_visit_get_latest(
origin.url, require_snapshot=True
)
- assert actual_visit == {
- **origin_visit1,
- "snapshot": complete_snapshot.id,
- "status": "ongoing", # visit1 has status created now
- }
+ assert actual_visit == ov1
+
+ # only the first visit has a status ongoing now
+ actual_visit = swh_storage.origin_visit_get_latest(
+ origin.url, allowed_statuses=["ongoing"]
+ )
+ assert actual_visit == ov1
- assert origin_visit3 == swh_storage.origin_visit_get_latest(origin.url)
+ actual_visit_status = swh_storage.origin_visit_status_get_latest(
+ origin.url, ov1.visit, require_snapshot=True
+ )
+ assert actual_visit_status == visit_status_with_snapshot
+
+ # ... and require_snapshot=False (defaults) still returns latest visit (3rd)
+ actual_visit = swh_storage.origin_visit_get_latest(
+ origin.url, require_snapshot=False
+ )
+ assert actual_visit == ov3
+ # no specific filter, this returns as before the latest visit
+ actual_visit = swh_storage.origin_visit_get_latest(origin.url)
+ assert actual_visit == ov3
# Status filter: all three visits are status=ongoing, so no visit
# returned
- assert (
- swh_storage.origin_visit_get_latest(origin.url, allowed_statuses=["full"])
- is None
+ actual_visit = swh_storage.origin_visit_get_latest(
+ origin.url, allowed_statuses=["full"]
)
+ assert actual_visit is None
+ visit_status1_full = OriginVisitStatus(
+ origin=origin.url,
+ visit=ov1.visit,
+ date=round_to_milliseconds(now()),
+ status="full",
+ snapshot=complete_snapshot.id,
+ )
# Mark the first visit as completed and check status filter again
- swh_storage.origin_visit_status_add(
- [
- OriginVisitStatus(
- origin=origin.url,
- visit=ov1.visit,
- date=now(),
- status="full",
- snapshot=complete_snapshot.id,
- )
- ]
+ swh_storage.origin_visit_status_add([visit_status1_full])
+
+ # only the first visit has the full status
+ actual_visit = swh_storage.origin_visit_get_latest(
+ origin.url, allowed_statuses=["full"]
)
+ assert actual_visit == ov1
- assert {
- **origin_visit1,
- "snapshot": complete_snapshot.id,
- "status": "full",
- } == swh_storage.origin_visit_get_latest(origin.url, allowed_statuses=["full"])
+ actual_visit_status = swh_storage.origin_visit_status_get_latest(
+ origin.url, ov1.visit, allowed_statuses=["full"]
+ )
+ assert actual_visit_status == visit_status1_full
- assert origin_visit3 == swh_storage.origin_visit_get_latest(origin.url)
+ # no specific filter, this returns as before the latest visit
+ actual_visit = swh_storage.origin_visit_get_latest(origin.url)
+ assert actual_visit == ov3
# Add snapshot to visit2 and check that the new snapshot is returned
swh_storage.snapshot_add([empty_snapshot])
- swh_storage.origin_visit_status_add(
- [
- OriginVisitStatus(
- origin=origin.url,
- visit=ov2.visit,
- date=now(),
- status="ongoing",
- snapshot=empty_snapshot.id,
- )
- ]
+ visit_status2_full = OriginVisitStatus(
+ origin=origin.url,
+ visit=ov2.visit,
+ date=round_to_milliseconds(now()),
+ status="ongoing",
+ snapshot=empty_snapshot.id,
)
- assert {
- **origin_visit2,
- "snapshot": empty_snapshot.id,
- "status": "ongoing",
- } == swh_storage.origin_visit_get_latest(origin.url, require_snapshot=True)
+ swh_storage.origin_visit_status_add([visit_status2_full])
+ actual_visit = swh_storage.origin_visit_get_latest(
+ origin.url, require_snapshot=True
+ )
+ # 2nd visit is most recent with a snapshot
+ assert actual_visit == ov2
+ actual_visit_status = swh_storage.origin_visit_status_get_latest(
+ origin.url, ov2.visit, require_snapshot=True
+ )
+ assert actual_visit_status == visit_status2_full
- assert origin_visit3 == swh_storage.origin_visit_get_latest(origin.url)
+ # no specific filter, this returns as before the latest visit, 3rd one
+ actual_origin = swh_storage.origin_visit_get_latest(origin.url)
+ assert actual_origin == ov3
- # Check that the status filter is still working
- assert {
- **origin_visit1,
- "snapshot": complete_snapshot.id,
- "status": "full",
- } == swh_storage.origin_visit_get_latest(origin.url, allowed_statuses=["full"])
+ # full status is still the first visit
+ actual_visit = swh_storage.origin_visit_get_latest(
+ origin.url, allowed_statuses=["full"]
+ )
+ assert actual_visit == ov1
# Add snapshot to visit3 (same date as visit2)
- swh_storage.origin_visit_status_add(
- [
- OriginVisitStatus(
- origin=origin.url,
- visit=ov3.visit,
- date=now(),
- status="ongoing",
- snapshot=complete_snapshot.id,
- )
- ]
+ visit_status3_with_snapshot = OriginVisitStatus(
+ origin=origin.url,
+ visit=ov3.visit,
+ date=round_to_milliseconds(now()),
+ status="ongoing",
+ snapshot=complete_snapshot.id,
+ )
+ swh_storage.origin_visit_status_add([visit_status3_with_snapshot])
+
+ # full status is still the first visit
+ actual_visit = swh_storage.origin_visit_get_latest(
+ origin.url, allowed_statuses=["full"], require_snapshot=True,
+ )
+ assert actual_visit == ov1
+
+ actual_visit_status = swh_storage.origin_visit_status_get_latest(
+ origin.url,
+ visit=actual_visit.visit,
+ allowed_statuses=["full"],
+ require_snapshot=True,
)
- assert {
- **origin_visit1,
- "snapshot": complete_snapshot.id,
- "status": "full",
- } == swh_storage.origin_visit_get_latest(origin.url, allowed_statuses=["full"])
- assert {
- **origin_visit1,
- "snapshot": complete_snapshot.id,
- "status": "full",
- } == swh_storage.origin_visit_get_latest(
- origin.url, allowed_statuses=["full"], require_snapshot=True
- )
- assert {
- **origin_visit3,
- "snapshot": complete_snapshot.id,
- "status": "ongoing",
- } == swh_storage.origin_visit_get_latest(origin.url)
-
- assert {
- **origin_visit3,
- "snapshot": complete_snapshot.id,
- "status": "ongoing",
- } == swh_storage.origin_visit_get_latest(origin.url, require_snapshot=True)
+ assert actual_visit_status == visit_status1_full
+
+ # most recent is still the 3rd visit
+ actual_visit = swh_storage.origin_visit_get_latest(origin.url)
+ assert actual_visit == ov3
+
+ # 3rd visit has a snapshot now, so it's elected
+ actual_visit = swh_storage.origin_visit_get_latest(
+ origin.url, require_snapshot=True
+ )
+ assert actual_visit == ov3
+
+ actual_visit_status = swh_storage.origin_visit_status_get_latest(
+ origin.url, ov3.visit, require_snapshot=True
+ )
+ assert actual_visit_status == visit_status3_with_snapshot
def test_origin_visit_status_get_latest(self, swh_storage, sample_data):
snapshot = sample_data.snapshots[2]
@@ -2146,8 +2144,7 @@
)
swh_storage.snapshot_add([snapshot])
- date_now = now()
- date_now = round_to_milliseconds(date_now)
+ date_now = round_to_milliseconds(now())
assert sample_data.date_visit1 < sample_data.date_visit2
assert sample_data.date_visit2 < date_now

File Metadata

Mime Type
text/plain
Expires
Tue, Dec 17, 9:56 PM (2 d, 14 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3216489

Event Timeline