# Review notes -- commentary only. Everything from the `diff --git` header onward is
# byte-identical to the original; these notes sit ahead of the first header and are
# not part of any hunk.
#
# Purpose of the patch
#   Make snapshot_get_latest() return None when the latest visit status references a
#   snapshot id for which the storage returns nothing, instead of handing None on to
#   `snapshot.pop("next_branch")` / `Snapshot.from_dict(snapshot)`.
#
# swh/storage/algos/snapshot.py (hunk header: -1,94 +1,95)
#   * Branch taken when `branches_count` is truthy: adds `if snapshot is None: return None`
#     right after the storage.snapshot_get_branches(...) call and before
#     `snapshot.pop("next_branch")`.
#   * Final return changes from `Snapshot.from_dict(snapshot)` to
#     `Snapshot.from_dict(snapshot) if snapshot else None`, and the blank line that
#     preceded the old return is removed. snapshot_get_all_branches() does a bare
#     `return` when storage.snapshot_get() is falsy, which is the None this guards.
#
# swh/storage/tests/algos/test_snapshot.py (hunk header: -1,151 +1,174)
#   * test_snapshot_get_latest_none: two comments are reworded, then a new case is added.
#     It reads the visit id from swh_storage.origin_visit_get_latest(origin.url), adds a
#     "partial" OriginVisitStatus whose snapshot is complete_snapshot.id without any
#     snapshot_add() call in that test, and asserts snapshot_get_latest() is None both
#     with the default arguments and with branches_count=1.
#
# NOTE(review): in this copy the patch's line breaks appear to have been collapsed -- the
#   whole patch sits on five physical lines and the +/- markers are inline. The original
#   indentation and blank lines are not recoverable from here; restore the line structure
#   from the upstream commit before trying to apply it.
# NOTE(review): `if branches_count:` is a truthiness test, so branches_count=0 skips the
#   ValueError check and falls through to the all-branches path, although the docstring
#   says ValueError is raised when the value is not positive. This is a context line, not
#   something this patch changes -- candidate for a follow-up (`is not None`).
# NOTE(review): `snapshot.pop("next_branch")` has no default, unlike the
#   `ret.pop("next_branch", None)` used in snapshot_get_all_branches(). Whether
#   snapshot_get_branches() always includes that key cannot be told from this patch -- confirm.
# NOTE(review): inside the pagination loop of snapshot_get_all_branches(), `data` is
#   indexed without a None check. The guard added by this patch suggests
#   snapshot_get_branches() can return None -- TODO confirm whether the loop needs the same guard.
#
diff --git a/swh/storage/algos/snapshot.py b/swh/storage/algos/snapshot.py index 5c9aeef7..5e965f4e 100644 --- a/swh/storage/algos/snapshot.py +++ b/swh/storage/algos/snapshot.py @@ -1,94 +1,95 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Iterable, Optional from swh.model.model import Snapshot from swh.storage.algos.origin import origin_get_latest_visit_status def snapshot_get_all_branches(storage, snapshot_id): """Get all the branches for a given snapshot Args: storage (swh.storage.storage.Storage): the storage instance snapshot_id (bytes): the snapshot's identifier Returns: dict: a dict with two keys: * **id**: identifier of the snapshot * **branches**: a dict of branches contained in the snapshot whose keys are the branches' names. """ ret = storage.snapshot_get(snapshot_id) if not ret: return next_branch = ret.pop("next_branch", None) while next_branch: data = storage.snapshot_get_branches(snapshot_id, branches_from=next_branch) ret["branches"].update(data["branches"]) next_branch = data.get("next_branch") return ret def snapshot_get_latest( storage, origin: str, allowed_statuses: Optional[Iterable[str]] = None, branches_count: Optional[int] = None, ) -> Optional[Snapshot]: """Get the latest snapshot for the given origin, optionally only from visits that have one of the given allowed_statuses. The branches of the snapshot are iterated in the lexicographical order of their names. Args: storage: Storage instance origin: the origin's URL allowed_statuses: list of visit statuses considered to find the latest snapshot for the visit. For instance, ``allowed_statuses=['full']`` will only consider visits that have successfully run to completion. 
branches_count: Optional parameter to retrieve snapshot with all branches (default behavior when None) or not. If set to positive number, the snapshot will be partial with only that number of branches. Raises: ValueError if branches_count is not a positive value Returns: The snapshot object if one is found matching the criteria or None. """ visit_and_status = origin_get_latest_visit_status( storage, origin, allowed_statuses=allowed_statuses, require_snapshot=True ) if not visit_and_status: return None _, visit_status = visit_and_status snapshot_id = visit_status.snapshot if not snapshot_id: return None if branches_count: # partial snapshot if not isinstance(branches_count, int) or branches_count <= 0: raise ValueError( "Parameter branches_count must be a positive integer. " f"Current value is {branches_count}" ) snapshot = storage.snapshot_get_branches( snapshot_id, branches_count=branches_count ) + if snapshot is None: + return None snapshot.pop("next_branch") else: snapshot = snapshot_get_all_branches(storage, snapshot_id) - - return Snapshot.from_dict(snapshot) + return Snapshot.from_dict(snapshot) if snapshot else None diff --git a/swh/storage/tests/algos/test_snapshot.py b/swh/storage/tests/algos/test_snapshot.py index 7a8da207..70bef2df 100644 --- a/swh/storage/tests/algos/test_snapshot.py +++ b/swh/storage/tests/algos/test_snapshot.py @@ -1,151 +1,174 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from hypothesis import given import pytest from swh.model.hypothesis_strategies import snapshots, branch_names, branch_targets from swh.model.identifiers import snapshot_identifier, identifier_to_bytes from swh.model.model import Origin, OriginVisit, OriginVisitStatus, Snapshot from swh.storage.algos.snapshot import snapshot_get_all_branches, snapshot_get_latest 
from swh.storage.utils import now from swh.storage.tests.storage_data import data @pytest.fixture def swh_storage_backend_config(): yield { "cls": "memory", "journal_writer": None, } @given(snapshot=snapshots(min_size=0, max_size=10, only_objects=False)) def test_snapshot_small(swh_storage, snapshot): # noqa snapshot = snapshot.to_dict() swh_storage.snapshot_add([snapshot]) returned_snapshot = snapshot_get_all_branches(swh_storage, snapshot["id"]) assert snapshot == returned_snapshot @given(branch_name=branch_names(), branch_target=branch_targets(only_objects=True)) def test_snapshot_large(swh_storage, branch_name, branch_target): # noqa branch_target = branch_target.to_dict() snapshot = { "branches": {b"%s%05d" % (branch_name, i): branch_target for i in range(10000)} } snapshot["id"] = identifier_to_bytes(snapshot_identifier(snapshot)) swh_storage.snapshot_add([snapshot]) returned_snapshot = snapshot_get_all_branches(swh_storage, snapshot["id"]) assert snapshot == returned_snapshot def test_snapshot_get_latest_none(swh_storage): """Retrieve latest snapshot on unknown origin or origin without snapshot should yield no result """ + # unknown origin so None assert snapshot_get_latest(swh_storage, "unknown-origin") is None - # no snapshot on origin visit then nothing is found + # no snapshot on origin visit so None origin = Origin.from_dict(data.origin) swh_storage.origin_add_one(origin) swh_storage.origin_visit_add( [ OriginVisit( origin=origin.url, date=data.date_visit1, type=data.type_visit1, status="ongoing", snapshot=None, ) ] ) assert snapshot_get_latest(swh_storage, origin.url) is None + ov1 = swh_storage.origin_visit_get_latest(origin.url) + assert ov1 is not None + visit_id = ov1["visit"] + + # visit references a snapshot but the snapshot does not exist in backend for some + # reason + complete_snapshot = Snapshot.from_dict(data.complete_snapshot) + swh_storage.origin_visit_status_add( + [ + OriginVisitStatus( + origin=origin.url, + visit=visit_id, + 
date=data.date_visit2, + status="partial", + snapshot=complete_snapshot.id, + ) + ] + ) + # so we do not find it + assert snapshot_get_latest(swh_storage, origin.url) is None + assert snapshot_get_latest(swh_storage, origin.url, branches_count=1) is None + def test_snapshot_get_latest(swh_storage): origin = Origin.from_dict(data.origin) swh_storage.origin_add_one(origin) visit1 = OriginVisit( origin=origin.url, date=data.date_visit1, type=data.type_visit1, status="ongoing", snapshot=None, ) ov1 = swh_storage.origin_visit_add([visit1])[0] # Add snapshot to visit1, latest snapshot = visit 1 snapshot complete_snapshot = Snapshot.from_dict(data.complete_snapshot) swh_storage.snapshot_add([complete_snapshot]) swh_storage.origin_visit_status_add( [ OriginVisitStatus( origin=origin.url, visit=ov1.visit, date=data.date_visit2, status="partial", snapshot=None, ) ] ) assert data.date_visit1 < data.date_visit2 # no snapshot associated to the visit, so None actual_snapshot = snapshot_get_latest( swh_storage, origin.url, allowed_statuses=["partial"] ) assert actual_snapshot is None date_now = now() assert data.date_visit2 < date_now swh_storage.origin_visit_status_add( [ OriginVisitStatus( origin=origin.url, visit=ov1.visit, date=date_now, status="full", snapshot=complete_snapshot.id, ) ] ) actual_snapshot = snapshot_get_latest(swh_storage, origin.url) assert actual_snapshot is not None assert actual_snapshot == complete_snapshot swh_storage.origin_visit_status_add( [ OriginVisitStatus( origin=origin.url, visit=ov1.visit, date=date_now, status="full", snapshot=complete_snapshot.id, ) ] ) actual_snapshot = snapshot_get_latest(swh_storage, origin.url) assert actual_snapshot is not None assert actual_snapshot == complete_snapshot actual_snapshot = snapshot_get_latest(swh_storage, origin.url, branches_count=1) assert actual_snapshot is not None assert actual_snapshot.id == complete_snapshot.id assert len(actual_snapshot.branches.values()) == 1 with pytest.raises(ValueError, 
match="branches_count must be a positive integer"): snapshot_get_latest(swh_storage, origin.url, branches_count="something-wrong")