diff --git a/swh/storage/algos/snapshot.py b/swh/storage/algos/snapshot.py index b9cae79f..2847f817 100644 --- a/swh/storage/algos/snapshot.py +++ b/swh/storage/algos/snapshot.py @@ -1,218 +1,212 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Iterator, List, Optional, Tuple from swh.model.hashutil import hash_to_hex from swh.model.model import ( OriginVisit, OriginVisitStatus, Sha1Git, Snapshot, SnapshotBranch, TargetType, ) from swh.storage.algos.origin import ( iter_origin_visit_statuses, iter_origin_visits, origin_get_latest_visit_status, ) from swh.storage.interface import ListOrder, StorageInterface def snapshot_get_all_branches( storage: StorageInterface, snapshot_id: Sha1Git ) -> Optional[Snapshot]: """Get all the branches for a given snapshot Args: storage (swh.storage.interface.StorageInterface): the storage instance snapshot_id (bytes): the snapshot's identifier Returns: dict: a dict with two keys: * **id**: identifier of the snapshot * **branches**: a dict of branches contained in the snapshot whose keys are the branches' names. """ ret = storage.snapshot_get_branches(snapshot_id) if not ret: return None next_branch = ret["next_branch"] while next_branch: data = storage.snapshot_get_branches(snapshot_id, branches_from=next_branch) assert data, f"Snapshot {hash_to_hex(snapshot_id)} ceased to exist" ret["branches"].update(data["branches"]) next_branch = data["next_branch"] return Snapshot(id=ret["id"], branches=ret["branches"]) def snapshot_get_latest( storage: StorageInterface, origin: str, allowed_statuses: Optional[List[str]] = None, branches_count: Optional[int] = None, ) -> Optional[Snapshot]: """Get the latest snapshot for the given origin, optionally only from visits that have one of the given allowed_statuses. 
The branches of the snapshot are iterated in the lexicographical order of their names. Args: storage: Storage instance origin: the origin's URL allowed_statuses: list of visit statuses considered to find the latest snapshot for the visit. For instance, ``allowed_statuses=['full']`` will only consider visits that have successfully run to completion. branches_count: Optional parameter to retrieve snapshot with all branches (default behavior when None) or not. If set to positive number, the snapshot will be partial with only that number of branches. Raises: ValueError if branches_count is not a positive value Returns: The snapshot object if one is found matching the criteria or None. """ visit_and_status = origin_get_latest_visit_status( storage, origin, allowed_statuses=allowed_statuses, require_snapshot=True, ) if not visit_and_status: return None _, visit_status = visit_and_status snapshot_id = visit_status.snapshot if not snapshot_id: return None if branches_count: # partial snapshot if not isinstance(branches_count, int) or branches_count <= 0: raise ValueError( "Parameter branches_count must be a positive integer. " f"Current value is {branches_count}" ) snapshot = storage.snapshot_get_branches( snapshot_id, branches_count=branches_count ) if snapshot is None: return None return Snapshot(id=snapshot["id"], branches=snapshot["branches"]) else: return snapshot_get_all_branches(storage, snapshot_id) def snapshot_id_get_from_revision( storage: StorageInterface, origin: str, revision_id: bytes ) -> Optional[bytes]: """Retrieve the most recent snapshot id targeting the revision_id for the given origin. *Warning* This is a potentially highly costly operation Returns The snapshot id if found. None otherwise. """ res = visits_and_snapshots_get_from_revision(storage, origin, revision_id) # they are sorted by descending date, so we just need to return the first one, # if any. 
for (visit, status, snapshot) in res: return snapshot.id return None def visits_and_snapshots_get_from_revision( storage: StorageInterface, origin: str, revision_id: bytes ) -> Iterator[Tuple[OriginVisit, OriginVisitStatus, Snapshot]]: """Retrieve all visits, visit statuses, and matching snapshot of the given origin, such that the snapshot targets the revision_id. *Warning* This is a potentially highly costly operation Yields: Tuples of (visit, status, snapshot) """ revision = storage.revision_get([revision_id]) if not revision: return for visit in iter_origin_visits(storage, origin, order=ListOrder.DESC): assert visit.visit is not None for visit_status in iter_origin_visit_statuses( storage, origin, visit.visit, order=ListOrder.DESC ): snapshot_id = visit_status.snapshot if snapshot_id is None: continue snapshot = snapshot_get_all_branches(storage, snapshot_id) if not snapshot: continue for branch_name, branch in snapshot.branches.items(): if ( branch is not None and branch.target_type == TargetType.REVISION and branch.target == revision_id ): # snapshot found yield (visit, visit_status, snapshot) def snapshot_resolve_alias( storage: StorageInterface, snapshot_id: Sha1Git, alias_name: bytes -) -> Optional[Tuple[List[SnapshotBranch], Optional[SnapshotBranch]]]: +) -> Optional[SnapshotBranch]: """ - Resolve snapshot branch alias to its real target. + Transitively resolve snapshot branch alias to its real target, and return it; + i.e. follows every branch that is an alias, until a branch that isn't an alias + is found. Args: storage: Storage instance snapshot_id: snapshot identifier alias_name: name of the branch alias to resolve Returns: - A tuple whose first member is the list of followed branches until the alias - got resolved to a branch whose target type is not an alias, and second member - the real targeted branch. - If a dangling branch is encountered during the resolve process, second member of - the tuple will be None. 
- If the target type of the tuple second member is an alias, it means that - a cycle has been detected during the resolve process. + The first branch that isn't an alias, in the alias chain; or None if + there is no such branch (i.e. either because of an alias cycle, or a dangling + branch). """ snapshot = storage.snapshot_get_branches( snapshot_id, branches_from=alias_name, branches_count=1 ) if snapshot is None: return None if alias_name not in snapshot["branches"]: - return ([], None) + return None last_branch = snapshot["branches"][alias_name] - branches = [] - seen_aliases = {alias_name} - while ( - last_branch is not None - and last_branch.target_type == TargetType.ALIAS - and last_branch.target not in seen_aliases - ): - branches.append(last_branch) + while last_branch is not None and last_branch.target_type == TargetType.ALIAS: + if last_branch.target in seen_aliases: + return None + alias_target = last_branch.target snapshot = storage.snapshot_get_branches( snapshot_id, branches_from=alias_target, branches_count=1 ) assert snapshot is not None last_branch = snapshot["branches"].get(alias_target) seen_aliases.add(alias_target) - return (branches, last_branch) + return last_branch diff --git a/swh/storage/tests/algos/test_snapshot.py b/swh/storage/tests/algos/test_snapshot.py index d22028ef..47ed2184 100644 --- a/swh/storage/tests/algos/test_snapshot.py +++ b/swh/storage/tests/algos/test_snapshot.py @@ -1,412 +1,395 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from hypothesis import given import pytest from swh.model.hypothesis_strategies import branch_names, branch_targets, snapshots from swh.model.model import ( OriginVisit, OriginVisitStatus, Snapshot, SnapshotBranch, TargetType, ) from swh.storage.algos.snapshot import ( snapshot_get_all_branches, 
snapshot_get_latest, snapshot_id_get_from_revision, snapshot_resolve_alias, visits_and_snapshots_get_from_revision, ) from swh.storage.utils import now @pytest.fixture def swh_storage_backend_config(): yield { "cls": "memory", "journal_writer": None, } @given(snapshot=snapshots(min_size=0, max_size=10, only_objects=False)) def test_snapshot_small(swh_storage, snapshot): # noqa swh_storage.snapshot_add([snapshot]) returned_snapshot = snapshot_get_all_branches(swh_storage, snapshot.id) assert snapshot == returned_snapshot @given(branch_name=branch_names(), branch_target=branch_targets(only_objects=True)) def test_snapshot_large(swh_storage, branch_name, branch_target): # noqa snapshot = Snapshot( branches={b"%s%05d" % (branch_name, i): branch_target for i in range(10000)}, ) swh_storage.snapshot_add([snapshot]) returned_snapshot = snapshot_get_all_branches(swh_storage, snapshot.id) assert snapshot == returned_snapshot def test_snapshot_get_latest_none(swh_storage, sample_data): """Retrieve latest snapshot on unknown origin or origin without snapshot should yield no result """ # unknown origin so None assert snapshot_get_latest(swh_storage, "unknown-origin") is None # no snapshot on origin visit so None origin = sample_data.origin swh_storage.origin_add([origin]) origin_visit, origin_visit2 = sample_data.origin_visits[:2] assert origin_visit.origin == origin.url swh_storage.origin_visit_add([origin_visit]) assert snapshot_get_latest(swh_storage, origin.url) is None ov1 = swh_storage.origin_visit_get_latest(origin.url) assert ov1 is not None # visit references a snapshot but the snapshot does not exist in backend for some # reason complete_snapshot = sample_data.snapshots[2] swh_storage.origin_visit_status_add( [ OriginVisitStatus( origin=origin.url, visit=ov1.visit, date=origin_visit2.date, status="partial", snapshot=complete_snapshot.id, ) ] ) # so we do not find it assert snapshot_get_latest(swh_storage, origin.url) is None assert snapshot_get_latest(swh_storage, 
origin.url, branches_count=1) is None def test_snapshot_get_latest(swh_storage, sample_data): origin = sample_data.origin swh_storage.origin_add([origin]) visit1, visit2 = sample_data.origin_visits[:2] assert visit1.origin == origin.url swh_storage.origin_visit_add([visit1]) ov1 = swh_storage.origin_visit_get_latest(origin.url) # Add snapshot to visit1, latest snapshot = visit 1 snapshot complete_snapshot = sample_data.snapshots[2] swh_storage.snapshot_add([complete_snapshot]) swh_storage.origin_visit_status_add( [ OriginVisitStatus( origin=origin.url, visit=ov1.visit, date=visit2.date, status="partial", snapshot=None, ) ] ) assert visit1.date < visit2.date # no snapshot associated to the visit, so None actual_snapshot = snapshot_get_latest( swh_storage, origin.url, allowed_statuses=["partial"] ) assert actual_snapshot is None date_now = now() assert visit2.date < date_now swh_storage.origin_visit_status_add( [ OriginVisitStatus( origin=origin.url, visit=ov1.visit, date=date_now, status="full", snapshot=complete_snapshot.id, ) ] ) swh_storage.origin_visit_add( [OriginVisit(origin=origin.url, date=now(), type=visit1.type,)] ) actual_snapshot = snapshot_get_latest(swh_storage, origin.url) assert actual_snapshot is not None assert actual_snapshot == complete_snapshot actual_snapshot = snapshot_get_latest(swh_storage, origin.url, branches_count=1) assert actual_snapshot is not None assert actual_snapshot.id == complete_snapshot.id assert len(actual_snapshot.branches.values()) == 1 with pytest.raises(ValueError, match="branches_count must be a positive integer"): snapshot_get_latest(swh_storage, origin.url, branches_count="something-wrong") def test_snapshot_id_get_from_revision(swh_storage, sample_data): origin = sample_data.origin swh_storage.origin_add([origin]) date_visit2 = now() visit1, visit2 = sample_data.origin_visits[:2] assert visit1.origin == origin.url ov1, ov2 = swh_storage.origin_visit_add([visit1, visit2]) revision1, revision2, revision3 = 
sample_data.revisions[:3] swh_storage.revision_add([revision1, revision2]) empty_snapshot, complete_snapshot = sample_data.snapshots[1:3] swh_storage.snapshot_add([complete_snapshot]) # Add complete_snapshot to visit1 which targets revision1 ovs1, ovs2 = [ OriginVisitStatus( origin=origin.url, visit=ov1.visit, date=date_visit2, status="partial", snapshot=complete_snapshot.id, ), OriginVisitStatus( origin=origin.url, visit=ov2.visit, date=now(), status="full", snapshot=empty_snapshot.id, ), ] swh_storage.origin_visit_status_add([ovs1, ovs2]) assert ov1.date < ov2.date assert ov2.date < ovs1.date assert ovs1.date < ovs2.date # revision3 does not exist so result is None actual_snapshot_id = snapshot_id_get_from_revision( swh_storage, origin.url, revision3.id ) assert actual_snapshot_id is None # no snapshot targets revision2 for origin.url so result is None actual_snapshot_id = snapshot_id_get_from_revision( swh_storage, origin.url, revision2.id ) assert actual_snapshot_id is None # complete_snapshot targets at least revision1 actual_snapshot_id = snapshot_id_get_from_revision( swh_storage, origin.url, revision1.id ) assert actual_snapshot_id == complete_snapshot.id def test_visit_and_snapshot_get_from_revision(swh_storage, sample_data): origin = sample_data.origin swh_storage.origin_add([origin]) date_visit2 = now() visit1, visit2 = sample_data.origin_visits[:2] assert visit1.origin == origin.url ov1, ov2 = swh_storage.origin_visit_add([visit1, visit2]) revision1, revision2, revision3 = sample_data.revisions[:3] swh_storage.revision_add([revision1, revision2]) empty_snapshot, complete_snapshot = sample_data.snapshots[1:3] swh_storage.snapshot_add([complete_snapshot]) # Add complete_snapshot to visit1 which targets revision1 ovs1, ovs2 = [ OriginVisitStatus( origin=origin.url, visit=ov1.visit, date=date_visit2, status="partial", snapshot=complete_snapshot.id, ), OriginVisitStatus( origin=origin.url, visit=ov2.visit, date=now(), status="full", 
snapshot=empty_snapshot.id, ), ] swh_storage.origin_visit_status_add([ovs1, ovs2]) assert ov1.date < ov2.date assert ov2.date < ovs1.date assert ovs1.date < ovs2.date # revision3 does not exist so result is None actual_snapshot_id = snapshot_id_get_from_revision( swh_storage, origin.url, revision3.id ) assert actual_snapshot_id is None # no snapshot targets revision2 for origin.url so result is None res = list( visits_and_snapshots_get_from_revision(swh_storage, origin.url, revision2.id) ) assert res == [] # complete_snapshot targets at least revision1 res = list( visits_and_snapshots_get_from_revision(swh_storage, origin.url, revision1.id) ) assert res == [(ov1, ovs1, complete_snapshot)] def test_snapshot_resolve_aliases_unknown_snapshot(swh_storage): assert snapshot_resolve_alias(swh_storage, b"foo", b"HEAD") is None def test_snapshot_resolve_aliases_no_aliases(swh_storage): snapshot = Snapshot(branches={}) swh_storage.snapshot_add([snapshot]) - assert snapshot_resolve_alias(swh_storage, snapshot.id, b"HEAD") == ([], None) + assert snapshot_resolve_alias(swh_storage, snapshot.id, b"HEAD") is None def test_snapshot_resolve_alias(swh_storage, sample_data): rev_branch_name = b"revision_branch" rel_branch_name = b"release_branch" rev_alias1_name = b"rev_alias1" rev_alias2_name = b"rev_alias2" rev_alias3_name = b"rev_alias3" rel_alias_name = b"rel_alias" rev_branch_info = SnapshotBranch( target=sample_data.revisions[0].id, target_type=TargetType.REVISION, ) rel_branch_info = SnapshotBranch( target=sample_data.releases[0].id, target_type=TargetType.RELEASE, ) rev_alias1_branch_info = SnapshotBranch( target=rev_branch_name, target_type=TargetType.ALIAS ) rev_alias2_branch_info = SnapshotBranch( target=rev_alias1_name, target_type=TargetType.ALIAS ) rev_alias3_branch_info = SnapshotBranch( target=rev_alias2_name, target_type=TargetType.ALIAS ) rel_alias_branch_info = SnapshotBranch( target=rel_branch_name, target_type=TargetType.ALIAS ) snapshot = Snapshot( branches={ 
rev_branch_name: rev_branch_info, rel_branch_name: rel_branch_info, rev_alias1_name: rev_alias1_branch_info, rev_alias2_name: rev_alias2_branch_info, rev_alias3_name: rev_alias3_branch_info, rel_alias_name: rel_alias_branch_info, } ) swh_storage.snapshot_add([snapshot]) - for alias_name, expected_branches in ( - (rev_alias1_name, ([rev_alias1_branch_info], rev_branch_info)), - ( - rev_alias2_name, - ([rev_alias2_branch_info, rev_alias1_branch_info], rev_branch_info), - ), - ( - rev_alias3_name, - ( - [ - rev_alias3_branch_info, - rev_alias2_branch_info, - rev_alias1_branch_info, - ], - rev_branch_info, - ), - ), - (rel_alias_name, ([rel_alias_branch_info], rel_branch_info)), + for alias_name, expected_branch in ( + (rev_alias1_name, rev_branch_info), + (rev_alias2_name, rev_branch_info,), + (rev_alias3_name, rev_branch_info,), + (rel_alias_name, rel_branch_info), ): - branches = snapshot_resolve_alias(swh_storage, snapshot.id, alias_name) - assert branches == expected_branches + assert ( + snapshot_resolve_alias(swh_storage, snapshot.id, alias_name) + == expected_branch + ) def test_snapshot_resolve_alias_dangling_branch(swh_storage): dangling_branch_name = b"dangling_branch" alias_name = b"rev_alias" alias_branch = SnapshotBranch( target=dangling_branch_name, target_type=TargetType.ALIAS ) snapshot = Snapshot( branches={dangling_branch_name: None, alias_name: alias_branch,} ) swh_storage.snapshot_add([snapshot]) - branches = snapshot_resolve_alias(swh_storage, snapshot.id, alias_name) - assert branches == ([alias_branch], None) + assert snapshot_resolve_alias(swh_storage, snapshot.id, alias_name) is None def test_snapshot_resolve_alias_missing_branch(swh_storage): missing_branch_name = b"missing_branch" alias_name = b"rev_alias" alias_branch = SnapshotBranch( target=missing_branch_name, target_type=TargetType.ALIAS ) snapshot = Snapshot(id=b"42" * 10, branches={alias_name: alias_branch,}) swh_storage.snapshot_add([snapshot]) - branches = 
snapshot_resolve_alias(swh_storage, snapshot.id, alias_name) - assert branches == ([alias_branch], None) + assert snapshot_resolve_alias(swh_storage, snapshot.id, alias_name) is None def test_snapshot_resolve_alias_cycle_found(swh_storage): alias1_name = b"alias_1" alias2_name = b"alias_2" alias3_name = b"alias_3" alias4_name = b"alias_4" alias1_branch_info = SnapshotBranch( target=alias2_name, target_type=TargetType.ALIAS ) alias2_branch_info = SnapshotBranch( target=alias3_name, target_type=TargetType.ALIAS ) alias3_branch_info = SnapshotBranch( target=alias4_name, target_type=TargetType.ALIAS ) alias4_branch_info = SnapshotBranch( target=alias2_name, target_type=TargetType.ALIAS ) snapshot = Snapshot( branches={ alias1_name: alias1_branch_info, alias2_name: alias2_branch_info, alias3_name: alias3_branch_info, alias4_name: alias4_branch_info, } ) swh_storage.snapshot_add([snapshot]) - branches = snapshot_resolve_alias(swh_storage, snapshot.id, alias1_name) - assert branches == ( - [alias1_branch_info, alias2_branch_info, alias3_branch_info], - alias4_branch_info, - ) + assert snapshot_resolve_alias(swh_storage, snapshot.id, alias1_name) is None