Page MenuHomeSoftware Heritage

D3314.id11745.diff
No OneTemporary

D3314.id11745.diff

diff --git a/swh/storage/algos/snapshot.py b/swh/storage/algos/snapshot.py
--- a/swh/storage/algos/snapshot.py
+++ b/swh/storage/algos/snapshot.py
@@ -1,8 +1,14 @@
-# Copyright (C) 2018 The Software Heritage developers
+# Copyright (C) 2018-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+from typing import Iterable, Optional
+
+from swh.model.model import Snapshot
+
+from swh.storage.algos.origin import origin_get_latest_visit_status
+
def snapshot_get_all_branches(storage, snapshot_id):
"""Get all the branches for a given snapshot
@@ -28,3 +34,61 @@
next_branch = data.get("next_branch")
return ret
+
+
+def snapshot_get_latest(
+    storage,
+    origin: str,
+    allowed_statuses: Optional[Iterable[str]] = None,
+    branches_count: Optional[int] = None,
+) -> Optional[Snapshot]:
+    """Get the latest snapshot for the given origin, optionally only from visits
+    that have one of the given allowed_statuses.
+
+    The branches of the snapshot are iterated in the lexicographical
+    order of their names.
+
+    Args:
+        storage: Storage instance
+        origin: the origin's URL
+        allowed_statuses: list of visit statuses considered to find the latest
+            snapshot for the visit. For instance, ``allowed_statuses=['full']``
+            will only consider visits that have successfully run to completion.
+        branches_count: number of branches to retrieve. When None (the default),
+            the snapshot is returned with all its branches; when set to a positive
+            integer, the snapshot is partial and limited to that many branches.
+
+    Raises:
+        ValueError: if branches_count is neither None nor a positive integer
+
+    Returns:
+        The snapshot object if one is found matching the criteria or None.
+    """
+    if branches_count is not None and (  # not truthiness: 0 must be rejected too
+        not isinstance(branches_count, int) or branches_count <= 0
+    ):
+        raise ValueError(
+            "Parameter branches_count must be a positive integer. "
+            f"Current value is {branches_count}"
+        )
+
+    visit_and_status = origin_get_latest_visit_status(
+        storage, origin, allowed_statuses=allowed_statuses, require_snapshot=True
+    )
+    if not visit_and_status:
+        return None
+
+    _, visit_status = visit_and_status
+    snapshot_id = visit_status.snapshot
+    if not snapshot_id:
+        return None
+
+    if branches_count is None:
+        snapshot = snapshot_get_all_branches(storage, snapshot_id)
+    else:  # partial snapshot: strip the pagination key before conversion
+        snapshot = storage.snapshot_get_branches(
+            snapshot_id, branches_count=branches_count
+        )
+        snapshot.pop("next_branch", None)
+
+    return Snapshot.from_dict(snapshot)
diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py
--- a/swh/storage/cassandra/storage.py
+++ b/swh/storage/cassandra/storage.py
@@ -583,17 +583,6 @@
return self.snapshot_get(visit["snapshot"])
- def snapshot_get_latest(self, origin, allowed_statuses=None):
- visit = self.origin_visit_get_latest(
- origin, allowed_statuses=allowed_statuses, require_snapshot=True
- )
-
- if visit:
- assert visit["snapshot"]
- if self._cql_runner.snapshot_missing([visit["snapshot"]]):
- raise StorageArgumentException("Visit references unknown snapshot")
- return self.snapshot_get_branches(visit["snapshot"])
-
def snapshot_count_branches(self, snapshot_id):
if self._cql_runner.snapshot_missing([snapshot_id]):
# Makes sure we don't fetch branches for a snapshot that is
diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py
--- a/swh/storage/in_memory.py
+++ b/swh/storage/in_memory.py
@@ -583,22 +583,6 @@
else:
return None
- def snapshot_get_latest(self, origin, allowed_statuses=None):
- origin_url = self._get_origin_url(origin)
- if not origin_url:
- return
-
- visit = self.origin_visit_get_latest(
- origin_url, allowed_statuses=allowed_statuses, require_snapshot=True
- )
- if visit and visit["snapshot"]:
- snapshot = self.snapshot_get(visit["snapshot"])
- if not snapshot:
- raise StorageArgumentException(
- "last origin visit references an unknown snapshot"
- )
- return snapshot
-
def snapshot_count_branches(self, snapshot_id):
snapshot = self._snapshots[snapshot_id]
return collections.Counter(
diff --git a/swh/storage/interface.py b/swh/storage/interface.py
--- a/swh/storage/interface.py
+++ b/swh/storage/interface.py
@@ -695,37 +695,6 @@
"""
...
- @remote_api_endpoint("snapshot/latest")
- def snapshot_get_latest(self, origin, allowed_statuses=None):
- """Get the content, possibly partial, of the latest snapshot for the
- given origin, optionally only from visits that have one of the given
- allowed_statuses
-
- The branches of the snapshot are iterated in the lexicographical
- order of their names.
-
- .. warning:: At most 1000 branches contained in the snapshot will be
- returned for performance reasons. In order to browse the whole
- set of branches, the method :meth:`snapshot_get_branches`
- should be used instead.
-
- Args:
- origin (str): the origin's URL
- allowed_statuses (list of str): list of visit statuses considered
- to find the latest snapshot for the visit. For instance,
- ``allowed_statuses=['full']`` will only consider visits that
- have successfully run to completion.
- Returns:
- dict: a dict with three keys:
- * **id**: identifier of the snapshot
- * **branches**: a dict of branches contained in the snapshot
- whose keys are the branches' names.
- * **next_branch**: the name of the first branch not returned
- or :const:`None` if the snapshot has less than 1000
- branches.
- """
- ...
-
@remote_api_endpoint("snapshot/count_branches")
def snapshot_count_branches(self, snapshot_id):
"""Count the number of branches in the snapshot with the given id
diff --git a/swh/storage/storage.py b/swh/storage/storage.py
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -735,30 +735,6 @@
return None
- @timed
- @db_transaction(statement_timeout=4000)
- def snapshot_get_latest(self, origin, allowed_statuses=None, db=None, cur=None):
- if isinstance(origin, int):
- origin = self.origin_get({"id": origin}, db=db, cur=cur)
- if not origin:
- return
- origin = origin["url"]
-
- origin_visit = self.origin_visit_get_latest(
- origin,
- allowed_statuses=allowed_statuses,
- require_snapshot=True,
- db=db,
- cur=cur,
- )
- if origin_visit and origin_visit["snapshot"]:
- snapshot = self.snapshot_get(origin_visit["snapshot"], db=db, cur=cur)
- if not snapshot:
- raise StorageArgumentException(
- "last origin visit references an unknown snapshot"
- )
- return snapshot
-
@timed
@db_transaction(statement_timeout=2000)
def snapshot_count_branches(self, snapshot_id, db=None, cur=None):
diff --git a/swh/storage/tests/algos/test_snapshot.py b/swh/storage/tests/algos/test_snapshot.py
--- a/swh/storage/tests/algos/test_snapshot.py
+++ b/swh/storage/tests/algos/test_snapshot.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2018 The Software Heritage developers
+# Copyright (C) 2018-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@@ -6,10 +6,14 @@
from hypothesis import given
import pytest
-from swh.model.identifiers import snapshot_identifier, identifier_to_bytes
from swh.model.hypothesis_strategies import snapshots, branch_names, branch_targets
+from swh.model.identifiers import snapshot_identifier, identifier_to_bytes
+from swh.model.model import Origin, OriginVisit, OriginVisitStatus, Snapshot
+
+from swh.storage.algos.snapshot import snapshot_get_all_branches, snapshot_get_latest
+from swh.storage.utils import now
-from swh.storage.algos.snapshot import snapshot_get_all_branches
+from swh.storage.tests.storage_data import data
@pytest.fixture
@@ -42,3 +46,106 @@
returned_snapshot = snapshot_get_all_branches(swh_storage, snapshot["id"])
assert snapshot == returned_snapshot
+
+
+def test_snapshot_get_latest_none(swh_storage):
+    """Looking up the latest snapshot yields None both for an unknown origin and
+    for a known origin whose only visit references no snapshot.
+
+    """
+    latest = snapshot_get_latest(swh_storage, "unknown-origin")
+    assert latest is None
+
+    known_origin = Origin.from_dict(data.origin)
+    swh_storage.origin_add_one(known_origin)
+    # register a visit that carries no snapshot: still nothing to find
+    visit_without_snapshot = OriginVisit(
+        origin=known_origin.url,
+        date=data.date_visit1,
+        type=data.type_visit1,
+        status="ongoing",
+        snapshot=None,
+    )
+    swh_storage.origin_visit_add([visit_without_snapshot])
+
+    latest = snapshot_get_latest(swh_storage, known_origin.url)
+    assert latest is None
+
+
+def test_snapshot_get_latest(swh_storage):
+ """Latest snapshot lookup follows the visit statuses and honours branches_count"""
+ origin = Origin.from_dict(data.origin)
+ swh_storage.origin_add_one(origin)
+
+ visit1 = OriginVisit(
+ origin=origin.url,
+ date=data.date_visit1,
+ type=data.type_visit1,
+ status="ongoing",
+ snapshot=None,
+ )
+ ov1 = swh_storage.origin_visit_add([visit1])[0]
+ # Store the snapshot; no visit status references it yet
+ complete_snapshot = Snapshot.from_dict(data.complete_snapshot)
+ swh_storage.snapshot_add([complete_snapshot])
+
+ swh_storage.origin_visit_status_add(
+ [
+ OriginVisitStatus(
+ origin=origin.url,
+ visit=ov1.visit,
+ date=data.date_visit2,
+ status="partial",
+ snapshot=None,
+ )
+ ]
+ )
+ assert data.date_visit1 < data.date_visit2
+
+ # the "partial" status carries no snapshot, so nothing is found
+ actual_snapshot = snapshot_get_latest(
+ swh_storage, origin.url, allowed_statuses=["partial"]
+ )
+ assert actual_snapshot is None
+
+ date_now = now()
+ assert data.date_visit2 < date_now
+ swh_storage.origin_visit_status_add(
+ [
+ OriginVisitStatus(
+ origin=origin.url,
+ visit=ov1.visit,
+ date=date_now,
+ status="full",
+ snapshot=complete_snapshot.id,
+ )
+ ]
+ )
+
+ actual_snapshot = snapshot_get_latest(swh_storage, origin.url)
+ assert actual_snapshot is not None
+ assert actual_snapshot == complete_snapshot
+ # NOTE(review): re-adds the identical visit status (idempotency check?) -- confirm
+ swh_storage.origin_visit_status_add(
+ [
+ OriginVisitStatus(
+ origin=origin.url,
+ visit=ov1.visit,
+ date=date_now,
+ status="full",
+ snapshot=complete_snapshot.id,
+ )
+ ]
+ )
+
+ actual_snapshot = snapshot_get_latest(swh_storage, origin.url)
+ assert actual_snapshot is not None
+ assert actual_snapshot == complete_snapshot
+ # branches_count=1: same snapshot id, but a single branch is returned
+ actual_snapshot = snapshot_get_latest(swh_storage, origin.url, branches_count=1)
+ assert actual_snapshot is not None
+ assert actual_snapshot.id == complete_snapshot.id
+ assert len(actual_snapshot.branches.values()) == 1
+ # a branches_count that is not a positive integer is rejected
+ with pytest.raises(ValueError, match="branches_count must be a positive integer"):
+ snapshot_get_latest(swh_storage, origin.url, branches_count="something-wrong")
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -2863,213 +2863,6 @@
("origin_visit_status", OriginVisitStatus.from_dict(data4),),
]
- def test_snapshot_get_latest(self, swh_storage):
- origin_url = swh_storage.origin_add_one(data.origin)
- visit1 = OriginVisit(
- origin=origin_url,
- date=data.date_visit1,
- type=data.type_visit1,
- status="ongoing",
- snapshot=None,
- )
- visit2 = OriginVisit(
- origin=origin_url,
- date=data.date_visit2,
- type=data.type_visit2,
- status="ongoing",
- snapshot=None,
- )
- # Add a visit with the same date as the previous one
- visit3 = OriginVisit(
- origin=origin_url,
- date=data.date_visit2,
- type=data.type_visit3,
- status="ongoing",
- snapshot=None,
- )
- ov1, ov2, ov3 = swh_storage.origin_visit_add([visit1, visit2, visit3])
- # Two visits, both with no snapshot: latest snapshot is None
- assert swh_storage.snapshot_get_latest(origin_url) is None
-
- # Add snapshot to visit1, latest snapshot = visit 1 snapshot
- swh_storage.snapshot_add([data.complete_snapshot])
- swh_storage.origin_visit_status_add(
- [
- OriginVisitStatus(
- origin=origin_url,
- visit=ov1.visit,
- date=now(),
- status="ongoing",
- snapshot=data.complete_snapshot["id"],
- )
- ]
- )
- assert {
- **data.complete_snapshot,
- "next_branch": None,
- } == swh_storage.snapshot_get_latest(origin_url)
-
- # Status filter: all three visits are status=ongoing, so no snapshot
- # returned
- assert (
- swh_storage.snapshot_get_latest(origin_url, allowed_statuses=["full"])
- is None
- )
-
- # Mark the first visit as completed and check status filter again
- swh_storage.origin_visit_status_add(
- [
- OriginVisitStatus(
- origin=origin_url,
- visit=ov1.visit,
- date=now(),
- status="full",
- snapshot=data.complete_snapshot["id"],
- )
- ]
- )
- assert {
- **data.complete_snapshot,
- "next_branch": None,
- } == swh_storage.snapshot_get_latest(origin_url, allowed_statuses=["full"])
-
- # Add snapshot to visit2 and check that the new snapshot is returned
- swh_storage.snapshot_add([data.empty_snapshot])
- swh_storage.origin_visit_status_add(
- [
- OriginVisitStatus(
- origin=origin_url,
- visit=ov2.visit,
- date=now(),
- status="ongoing",
- snapshot=data.empty_snapshot["id"],
- )
- ]
- )
- assert {
- **data.empty_snapshot,
- "next_branch": None,
- } == swh_storage.snapshot_get_latest(origin_url)
-
- # Check that the status filter is still working
- assert {
- **data.complete_snapshot,
- "next_branch": None,
- } == swh_storage.snapshot_get_latest(origin_url, allowed_statuses=["full"])
-
- # Add snapshot to visit3 (same date as visit2) and check that
- # the new snapshot is returned
- swh_storage.snapshot_add([data.complete_snapshot])
- swh_storage.origin_visit_status_add(
- [
- OriginVisitStatus(
- origin=origin_url,
- visit=ov3.visit,
- date=now(),
- status="ongoing",
- snapshot=data.complete_snapshot["id"],
- )
- ]
- )
- assert {
- **data.complete_snapshot,
- "next_branch": None,
- } == swh_storage.snapshot_get_latest(origin_url)
-
- def test_snapshot_get_latest__missing_snapshot(self, swh_storage):
- origin_url = swh_storage.origin_add_one(data.origin)
- assert swh_storage.snapshot_get_latest(origin_url) is None
- visit1 = OriginVisit(
- origin=origin_url,
- date=data.date_visit1,
- type=data.type_visit1,
- status="ongoing",
- snapshot=None,
- )
- visit2 = OriginVisit(
- origin=origin_url,
- date=data.date_visit2,
- type=data.type_visit2,
- status="ongoing",
- snapshot=None,
- )
- ov1, ov2 = swh_storage.origin_visit_add([visit1, visit2])
-
- # Two visits, both with no snapshot: latest snapshot is None
- assert swh_storage.snapshot_get_latest(origin_url) is None
-
- # Add unknown snapshot to visit1, check that the inconsistency is
- # detected
- swh_storage.origin_visit_status_add(
- [
- OriginVisitStatus(
- origin=origin_url,
- visit=ov1.visit,
- date=now(),
- status="ongoing",
- snapshot=data.complete_snapshot["id"],
- )
- ]
- )
- with pytest.raises(Exception):
- # XXX: should the exception be more specific than this?
- swh_storage.snapshot_get_latest(origin_url)
-
- # Status filter: both visits are status=ongoing, so no snapshot
- # returned
- assert (
- swh_storage.snapshot_get_latest(origin_url, allowed_statuses=["full"])
- is None
- )
-
- # Mark the first visit as completed and check status filter again
- swh_storage.origin_visit_status_add(
- [
- OriginVisitStatus(
- origin=origin_url,
- visit=ov1.visit,
- date=now(),
- status="full",
- snapshot=data.complete_snapshot["id"],
- )
- ]
- )
-
- with pytest.raises(Exception):
- # XXX: should the exception be more specific than this?
- swh_storage.snapshot_get_latest(origin_url, allowed_statuses=["full"]),
-
- # Actually add the snapshot and check status filter again
- swh_storage.snapshot_add([data.complete_snapshot])
- assert {
- **data.complete_snapshot,
- "next_branch": None,
- } == swh_storage.snapshot_get_latest(origin_url)
-
- # Add unknown snapshot to visit2 and check that the inconsistency
- # is detected
- swh_storage.origin_visit_status_add(
- [
- OriginVisitStatus(
- origin=origin_url,
- visit=ov2.visit,
- date=now(),
- status="ongoing",
- snapshot=data.snapshot["id"],
- )
- ]
- )
- with pytest.raises(Exception):
- # XXX: should the exception be more specific than this?
- swh_storage.snapshot_get_latest(origin_url)
-
- # Actually add that snapshot and check that the new one is returned
- swh_storage.snapshot_add([data.snapshot])
- assert {
- **data.snapshot,
- "next_branch": None,
- } == swh_storage.snapshot_get_latest(origin_url)
-
def test_snapshot_get_random(self, swh_storage):
swh_storage.snapshot_add(
[data.snapshot, data.empty_snapshot, data.complete_snapshot]

File Metadata

Mime Type
text/plain
Expires
Wed, Dec 18, 12:58 PM (10 h, 48 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3220536

Event Timeline