diff --git a/swh/storage/algos/origin.py b/swh/storage/algos/origin.py index f52a14ad..9a9d98b6 100644 --- a/swh/storage/algos/origin.py +++ b/swh/storage/algos/origin.py @@ -1,95 +1,95 @@ -# Copyright (C) 2019-2020 The Software Heritage developers +# Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -from typing import Iterator, List, Optional, Tuple +from typing import Iterator, List, Optional from swh.core.api.classes import stream_results from swh.model.model import Origin, OriginVisit, OriginVisitStatus from swh.storage.interface import ListOrder, StorageInterface def iter_origins(storage: StorageInterface, limit: int = 10000,) -> Iterator[Origin]: """Iterates over origins in the storage. Args: storage: the storage object used for queries. limit: maximum number of origins per page Yields: origin model objects from the storage in page of `limit` origins """ yield from stream_results(storage.origin_list, limit=limit) def origin_get_latest_visit_status( storage: StorageInterface, origin_url: str, type: Optional[str] = None, allowed_statuses: Optional[List[str]] = None, require_snapshot: bool = False, -) -> Optional[Tuple[OriginVisit, OriginVisitStatus]]: +) -> Optional[OriginVisitStatus]: """Get the status of the latest origin visit of an origin. Optionally, a combination of criteria can be provided, origin type, allowed statuses or if a visit has a snapshot. If no visit (or no visit status) matching the criteria is found, returns None. Otherwise, returns the latest matching origin visit status. Args: storage: A storage backend origin_url: origin URL type: Optional visit type to filter on (e.g git, tar, dsc, svn, hg, npm, pypi, ...) allowed_statuses: list of visit statuses considered to find the latest visit.
For instance, ``allowed_statuses=['full']`` will only consider visits that have successfully run to completion. require_snapshot: If True, only a visit with a snapshot will be returned. Returns: the latest visit status model object if the visit *and* the visit status exist (and match the search criteria), None otherwise. """ visit = storage.origin_visit_get_latest( origin_url, type=type, allowed_statuses=allowed_statuses, require_snapshot=require_snapshot, ) - result: Optional[Tuple[OriginVisit, OriginVisitStatus]] = None + result: Optional[OriginVisitStatus] = None if visit: assert visit.visit is not None visit_status = storage.origin_visit_status_get_latest( origin_url, visit.visit, allowed_statuses=allowed_statuses, require_snapshot=require_snapshot, ) if visit_status: - result = visit, visit_status + result = visit_status return result def iter_origin_visits( storage: StorageInterface, origin: str, order: ListOrder = ListOrder.ASC ) -> Iterator[OriginVisit]: """Iter over origin visits from an origin """ yield from stream_results(storage.origin_visit_get, origin, order=order) def iter_origin_visit_statuses( storage: StorageInterface, origin: str, visit: int, order: ListOrder = ListOrder.ASC ) -> Iterator[OriginVisitStatus]: """Iter over origin visit status from an origin visit """ yield from stream_results( storage.origin_visit_status_get, origin, visit, order=order ) diff --git a/swh/storage/algos/snapshot.py b/swh/storage/algos/snapshot.py index 2847f817..79803c27 100644 --- a/swh/storage/algos/snapshot.py +++ b/swh/storage/algos/snapshot.py @@ -1,212 +1,210 @@ # Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from typing import Iterator, List, Optional, Tuple from swh.model.hashutil import hash_to_hex from swh.model.model import ( OriginVisit,
OriginVisitStatus, Sha1Git, Snapshot, SnapshotBranch, TargetType, ) from swh.storage.algos.origin import ( iter_origin_visit_statuses, iter_origin_visits, origin_get_latest_visit_status, ) from swh.storage.interface import ListOrder, StorageInterface def snapshot_get_all_branches( storage: StorageInterface, snapshot_id: Sha1Git ) -> Optional[Snapshot]: """Get all the branches for a given snapshot Args: storage (swh.storage.interface.StorageInterface): the storage instance snapshot_id (bytes): the snapshot's identifier Returns: Snapshot: a snapshot model object (None if the storage returns nothing for ``snapshot_id``) with: * **id**: identifier of the snapshot * **branches**: a dict of branches contained in the snapshot whose keys are the branches' names. """ ret = storage.snapshot_get_branches(snapshot_id) if not ret: return None next_branch = ret["next_branch"] while next_branch: data = storage.snapshot_get_branches(snapshot_id, branches_from=next_branch) assert data, f"Snapshot {hash_to_hex(snapshot_id)} ceased to exist" ret["branches"].update(data["branches"]) next_branch = data["next_branch"] return Snapshot(id=ret["id"], branches=ret["branches"]) def snapshot_get_latest( storage: StorageInterface, origin: str, allowed_statuses: Optional[List[str]] = None, branches_count: Optional[int] = None, ) -> Optional[Snapshot]: """Get the latest snapshot for the given origin, optionally only from visits that have one of the given allowed_statuses. The branches of the snapshot are iterated in the lexicographical order of their names. Args: storage: Storage instance origin: the origin's URL allowed_statuses: list of visit statuses considered to find the latest snapshot for the visit. For instance, ``allowed_statuses=['full']`` will only consider visits that have successfully run to completion. branches_count: Optional parameter to retrieve snapshot with all branches (default behavior when None) or not. If set to positive number, the snapshot will be partial with only that number of branches.
Raises: ValueError if branches_count is not a positive value Returns: The snapshot object if one is found matching the criteria or None. """ - visit_and_status = origin_get_latest_visit_status( + visit_status = origin_get_latest_visit_status( storage, origin, allowed_statuses=allowed_statuses, require_snapshot=True, ) - - if not visit_and_status: + if not visit_status: return None - _, visit_status = visit_and_status snapshot_id = visit_status.snapshot if not snapshot_id: return None if branches_count: # partial snapshot if not isinstance(branches_count, int) or branches_count <= 0: raise ValueError( "Parameter branches_count must be a positive integer. " f"Current value is {branches_count}" ) snapshot = storage.snapshot_get_branches( snapshot_id, branches_count=branches_count ) if snapshot is None: return None return Snapshot(id=snapshot["id"], branches=snapshot["branches"]) else: return snapshot_get_all_branches(storage, snapshot_id) def snapshot_id_get_from_revision( storage: StorageInterface, origin: str, revision_id: bytes ) -> Optional[bytes]: """Retrieve the most recent snapshot id targeting the revision_id for the given origin. *Warning* This is a potentially highly costly operation Returns The snapshot id if found. None otherwise. """ res = visits_and_snapshots_get_from_revision(storage, origin, revision_id) # they are sorted by descending date, so we just need to return the first one, # if any. for (visit, status, snapshot) in res: return snapshot.id return None def visits_and_snapshots_get_from_revision( storage: StorageInterface, origin: str, revision_id: bytes ) -> Iterator[Tuple[OriginVisit, OriginVisitStatus, Snapshot]]: """Retrieve all visits, visit statuses, and matching snapshot of the given origin, such that the snapshot targets the revision_id. 
*Warning* This is a potentially highly costly operation Yields: Tuples of (visit, status, snapshot) """ revision = storage.revision_get([revision_id]) if not revision: return for visit in iter_origin_visits(storage, origin, order=ListOrder.DESC): assert visit.visit is not None for visit_status in iter_origin_visit_statuses( storage, origin, visit.visit, order=ListOrder.DESC ): snapshot_id = visit_status.snapshot if snapshot_id is None: continue snapshot = snapshot_get_all_branches(storage, snapshot_id) if not snapshot: continue for branch_name, branch in snapshot.branches.items(): if ( branch is not None and branch.target_type == TargetType.REVISION and branch.target == revision_id ): # snapshot found yield (visit, visit_status, snapshot) def snapshot_resolve_alias( storage: StorageInterface, snapshot_id: Sha1Git, alias_name: bytes ) -> Optional[SnapshotBranch]: """ Transitively resolve snapshot branch alias to its real target, and return it; ie. follows every branch that is an alias, until a branch that isn't an alias is found. Args: storage: Storage instance snapshot_id: snapshot identifier alias_name: name of the branch alias to resolve Returns: The first branch that isn't an alias, in the alias chain; or None if there is no such branch (ie. either because of a cycle alias, or a dangling branch). 
""" snapshot = storage.snapshot_get_branches( snapshot_id, branches_from=alias_name, branches_count=1 ) if snapshot is None: return None if alias_name not in snapshot["branches"]: return None last_branch = snapshot["branches"][alias_name] seen_aliases = {alias_name} while last_branch is not None and last_branch.target_type == TargetType.ALIAS: if last_branch.target in seen_aliases: return None alias_target = last_branch.target snapshot = storage.snapshot_get_branches( snapshot_id, branches_from=alias_target, branches_count=1 ) assert snapshot is not None last_branch = snapshot["branches"].get(alias_target) seen_aliases.add(alias_target) return last_branch diff --git a/swh/storage/tests/algos/test_origin.py b/swh/storage/tests/algos/test_origin.py index f7b8bef8..80bced32 100644 --- a/swh/storage/tests/algos/test_origin.py +++ b/swh/storage/tests/algos/test_origin.py @@ -1,359 +1,356 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import datetime from swh.model.model import Origin, OriginVisit, OriginVisitStatus from swh.storage.algos.origin import ( iter_origin_visit_statuses, iter_origin_visits, iter_origins, origin_get_latest_visit_status, ) from swh.storage.interface import ListOrder from swh.storage.tests.storage_tests import round_to_milliseconds from swh.storage.utils import now def test_iter_origins(swh_storage): origins = [ Origin(url="bar"), Origin(url="qux"), Origin(url="quuz"), ] assert swh_storage.origin_add(origins) == {"origin:add": 3} # this returns all the origins, only the number of paged called is different assert list(iter_origins(swh_storage)) == origins assert list(iter_origins(swh_storage, limit=1)) == origins assert list(iter_origins(swh_storage, limit=2)) == origins def test_origin_get_latest_visit_status_none(swh_storage, sample_data): 
"""Looking up unknown objects should return nothing """ # unknown origin so no result assert origin_get_latest_visit_status(swh_storage, "unknown-origin") is None # unknown type so no result origin = sample_data.origin origin_visit = sample_data.origin_visit assert origin_visit.origin == origin.url swh_storage.origin_add([origin]) swh_storage.origin_visit_add([origin_visit])[0] assert origin_visit.type != "unknown" actual_origin_visit = origin_get_latest_visit_status( swh_storage, origin.url, type="unknown" ) assert actual_origin_visit is None actual_origin_visit = origin_get_latest_visit_status( swh_storage, origin.url, require_snapshot=True ) assert actual_origin_visit is None def init_storage_with_origin_visits(swh_storage, sample_data): """Initialize storage with origin/origin-visit/origin-visit-status """ snapshot = sample_data.snapshots[2] origin1, origin2 = sample_data.origins[:2] swh_storage.origin_add([origin1, origin2]) ov1, ov2 = swh_storage.origin_visit_add( [ OriginVisit( origin=origin1.url, date=sample_data.date_visit1, type=sample_data.type_visit1, ), OriginVisit( origin=origin2.url, date=sample_data.date_visit2, type=sample_data.type_visit2, ), ] ) swh_storage.snapshot_add([snapshot]) date_now = now() date_now = round_to_milliseconds(date_now) assert sample_data.date_visit1 < sample_data.date_visit2 assert sample_data.date_visit2 < date_now # origin visit status 1 for origin visit 1 ovs11 = OriginVisitStatus( origin=ov1.origin, visit=ov1.visit, date=ov1.date + datetime.timedelta(seconds=10), # so it's not ignored type=ov1.type, status="partial", snapshot=None, ) # origin visit status 2 for origin visit 1 ovs12 = OriginVisitStatus( origin=ov1.origin, visit=ov1.visit, date=sample_data.date_visit2, type=ov1.type, status="ongoing", snapshot=None, ) # origin visit status 1 for origin visit 2 ovs21 = OriginVisitStatus( origin=ov2.origin, visit=ov2.visit, date=ov2.date + datetime.timedelta(seconds=10), # so it's not ignored type=ov2.type, status="ongoing", 
snapshot=None, ) # origin visit status 2 for origin visit 2 ovs22 = OriginVisitStatus( origin=ov2.origin, visit=ov2.visit, date=date_now, type=ov2.type, status="full", snapshot=snapshot.id, metadata={"something": "wicked"}, ) swh_storage.origin_visit_status_add([ovs11, ovs12, ovs21, ovs22]) return { "origin": [origin1, origin2], "origin_visit": [ov1, ov2], "origin_visit_status": [ovs11, ovs12, ovs21, ovs22], } def test_origin_get_latest_visit_status_filter_type(swh_storage, sample_data): """Filtering origin visit per types should yield consistent results """ objects = init_storage_with_origin_visits(swh_storage, sample_data) origin1, origin2 = objects["origin"] ov1, ov2 = objects["origin_visit"] ovs11, ovs12, _, ovs22 = objects["origin_visit_status"] # no visit for origin1 url with type_visit2 assert ( origin_get_latest_visit_status( swh_storage, origin1.url, type=sample_data.type_visit2 ) is None ) # no visit for origin2 url with type_visit1 assert ( origin_get_latest_visit_status( swh_storage, origin2.url, type=sample_data.type_visit1 ) is None ) # Two visits, both with no snapshot, take the most recent - actual_ov1, actual_ovs12 = origin_get_latest_visit_status( + actual_ovs12 = origin_get_latest_visit_status( swh_storage, origin1.url, type=sample_data.type_visit1 ) - assert isinstance(actual_ov1, OriginVisit) assert isinstance(actual_ovs12, OriginVisitStatus) - assert actual_ov1.origin == ov1.origin - assert actual_ov1.visit == ov1.visit - assert actual_ov1.type == sample_data.type_visit1 assert actual_ovs12 == ovs12 + assert actual_ovs12.origin == ov1.origin + assert actual_ovs12.visit == ov1.visit + assert actual_ovs12.type == sample_data.type_visit1 # take the most recent visit with type_visit2 - actual_ov2, actual_ovs22 = origin_get_latest_visit_status( + actual_ovs22 = origin_get_latest_visit_status( swh_storage, origin2.url, type=sample_data.type_visit2 ) - assert isinstance(actual_ov2, OriginVisit) assert isinstance(actual_ovs22, OriginVisitStatus) - 
assert actual_ov2.origin == ov2.origin - assert actual_ov2.visit == ov2.visit - assert actual_ov2.type == sample_data.type_visit2 assert actual_ovs22 == ovs22 + assert actual_ovs22.origin == ov2.origin + assert actual_ovs22.visit == ov2.visit + assert actual_ovs22.type == sample_data.type_visit2 def test_origin_get_latest_visit_status_filter_status(swh_storage, sample_data): objects = init_storage_with_origin_visits(swh_storage, sample_data) origin1, origin2 = objects["origin"] ov1, ov2 = objects["origin_visit"] ovs11, ovs12, _, ovs22 = objects["origin_visit_status"] # no partial status for that origin visit assert ( origin_get_latest_visit_status( swh_storage, origin2.url, allowed_statuses=["partial"] ) is None ) # only 1 partial for that visit - actual_ov1, actual_ovs11 = origin_get_latest_visit_status( + actual_ovs11 = origin_get_latest_visit_status( swh_storage, origin1.url, allowed_statuses=["partial"] ) - assert actual_ov1.origin == ov1.origin - assert actual_ov1.visit == ov1.visit - assert actual_ov1.type == sample_data.type_visit1 assert actual_ovs11 == ovs11 + assert actual_ovs11.origin == ov1.origin + assert actual_ovs11.visit == ov1.visit + assert actual_ovs11.type == sample_data.type_visit1 # both status exist, take the latest one - actual_ov1, actual_ovs12 = origin_get_latest_visit_status( + actual_ovs12 = origin_get_latest_visit_status( swh_storage, origin1.url, allowed_statuses=["partial", "ongoing"] ) - assert actual_ov1.origin == ov1.origin - assert actual_ov1.visit == ov1.visit - assert actual_ov1.type == sample_data.type_visit1 assert actual_ovs12 == ovs12 + assert actual_ovs12.origin == ov1.origin + assert actual_ovs12.visit == ov1.visit + assert actual_ovs12.type == sample_data.type_visit1 - assert isinstance(actual_ov1, OriginVisit) assert isinstance(actual_ovs12, OriginVisitStatus) - assert actual_ov1.origin == ov1.origin - assert actual_ov1.visit == ov1.visit - assert actual_ov1.type == sample_data.type_visit1 assert actual_ovs12 == ovs12 + 
assert actual_ovs12.origin == ov1.origin + assert actual_ovs12.visit == ov1.visit + assert actual_ovs12.type == sample_data.type_visit1 # take the most recent visit with a "full" status - actual_ov2, actual_ovs22 = origin_get_latest_visit_status( + actual_ovs22 = origin_get_latest_visit_status( swh_storage, origin2.url, allowed_statuses=["full"] ) - assert actual_ov2.origin == ov2.origin - assert actual_ov2.visit == ov2.visit - assert actual_ov2.type == sample_data.type_visit2 assert actual_ovs22 == ovs22 + assert actual_ovs22.origin == ov2.origin + assert actual_ovs22.visit == ov2.visit + assert actual_ovs22.type == sample_data.type_visit2 def test_origin_get_latest_visit_status_filter_snapshot(swh_storage, sample_data): objects = init_storage_with_origin_visits(swh_storage, sample_data) origin1, origin2 = objects["origin"] _, ov2 = objects["origin_visit"] _, _, _, ovs22 = objects["origin_visit_status"] # there is no visit with snapshot yet for that origin assert ( origin_get_latest_visit_status(swh_storage, origin1.url, require_snapshot=True) is None ) # the visit status with a snapshot (status "full") is elected - actual_ov2, actual_ovs22 = origin_get_latest_visit_status( + actual_ovs22 = origin_get_latest_visit_status( swh_storage, origin2.url, require_snapshot=True ) - assert actual_ov2.origin == ov2.origin - assert actual_ov2.visit == ov2.visit - assert actual_ov2.type == ov2.type assert actual_ovs22 == ovs22 + assert actual_ovs22.origin == ov2.origin + assert actual_ovs22.visit == ov2.visit + assert actual_ovs22.type == ov2.type date_now = now() # Add another visit swh_storage.origin_visit_add( [OriginVisit(origin=origin2.url, date=date_now, type=sample_data.type_visit2,),] ) # Requiring the latest visit with a snapshot, we still find the previous visit - ov2, ovs22 = origin_get_latest_visit_status( + ovs22 = origin_get_latest_visit_status( swh_storage, origin2.url, require_snapshot=True ) - assert actual_ov2.origin == ov2.origin - assert actual_ov2.visit == ov2.visit -
assert actual_ov2.type == ov2.type assert actual_ovs22 == ovs22 + assert actual_ovs22.origin == ov2.origin + assert actual_ovs22.visit == ov2.visit + assert actual_ovs22.type == ov2.type def test_iter_origin_visits(swh_storage, sample_data): """Iter over origin visits for an origin returns all visits""" origin1, origin2 = sample_data.origins[:2] swh_storage.origin_add([origin1, origin2]) date_past = now() - datetime.timedelta(weeks=20) new_visits = [] for visit_id in range(20): new_visits.append( OriginVisit( origin=origin1.url, date=date_past + datetime.timedelta(days=visit_id), type="git", ) ) visits = swh_storage.origin_visit_add(new_visits) reversed_visits = list(reversed(visits)) # no limit, order asc actual_visits = list(iter_origin_visits(swh_storage, origin1.url)) assert actual_visits == visits # no limit, order desc actual_visits = list( iter_origin_visits(swh_storage, origin1.url, order=ListOrder.DESC) ) assert actual_visits == reversed_visits # no result actual_visits = list(iter_origin_visits(swh_storage, origin2.url)) assert actual_visits == [] def test_iter_origin_visit_status(swh_storage, sample_data): origin1, origin2 = sample_data.origins[:2] swh_storage.origin_add([origin1]) ov1 = swh_storage.origin_visit_add([sample_data.origin_visit])[0] assert ov1.origin == origin1.url date_past = now() - datetime.timedelta(weeks=20) ovs1 = OriginVisitStatus( origin=ov1.origin, visit=ov1.visit, date=ov1.date, type=ov1.type, status="created", snapshot=None, ) new_visit_statuses = [ovs1] for i in range(20): status_date = date_past + datetime.timedelta(days=i) new_visit_statuses.append( OriginVisitStatus( origin=ov1.origin, visit=ov1.visit, date=status_date, type=ov1.type, status="created", snapshot=None, ) ) swh_storage.origin_visit_status_add(new_visit_statuses) reversed_visit_statuses = list(reversed(new_visit_statuses)) # order asc actual_visit_statuses = list( iter_origin_visit_statuses(swh_storage, ov1.origin, ov1.visit) ) assert actual_visit_statuses == 
new_visit_statuses # order desc actual_visit_statuses = list( iter_origin_visit_statuses( swh_storage, ov1.origin, ov1.visit, order=ListOrder.DESC ) ) assert actual_visit_statuses == reversed_visit_statuses # no result actual_visit_statuses = list( iter_origin_visit_statuses(swh_storage, origin2.url, ov1.visit) ) assert actual_visit_statuses == []