diff --git a/swh/storage/algos/origin.py b/swh/storage/algos/origin.py
--- a/swh/storage/algos/origin.py
+++ b/swh/storage/algos/origin.py
@@ -1,8 +1,11 @@
-# Copyright (C) 2019 The Software Heritage developers
+# Copyright (C) 2019-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+from typing import Optional, Iterable, Tuple
+from swh.model.model import OriginVisit, OriginVisitStatus
+
 
 def iter_origins(storage, origin_from=1, origin_to=None, batch_size=10000):
     """Iterates over all origins in the storage.
@@ -33,3 +36,48 @@
             yield origin
         if origin_to and start > origin_to:
             break
+
+
+def origin_get_latest_visit_status(
+    storage,
+    origin_url: str,
+    type: Optional[str] = None,
+    allowed_statuses: Optional[Iterable[str]] = None,
+    require_snapshot: bool = False,
+) -> Optional[Tuple[OriginVisit, OriginVisitStatus]]:
+    """Get the latest origin visit and visit status information for a given origin,
+    optionally looking only for those with one of the given allowed_statuses or for
+    those with a snapshot.
+
+    Only the latest visit is inspected; if no status of it matches, this returns None.
+
+    Args:
+        storage: A storage backend
+        origin_url: origin URL
+        type: Optional visit type to filter on (e.g git, tar, dsc, svn,
+            hg, npm, pypi, ...)
+        allowed_statuses: list of visit statuses considered
+            to find the latest visit. For instance,
+            ``allowed_statuses=['full']`` will only consider visits that
+            have successfully run to completion.
+        require_snapshot: If True, only a visit with a snapshot
+            will be returned.
+
+    Returns:
+        a tuple of (visit, visit_status) model object if the visit *and* the visit
+        status exist, None otherwise.
+
+    """
+    visit_d = storage.origin_visit_get_latest(origin_url, type=type)
+    if not visit_d:
+        return None
+    visit = OriginVisit.from_dict(visit_d)
+    visit_status = storage.origin_visit_status_get_latest(
+        origin_url,
+        visit.visit,
+        allowed_statuses=allowed_statuses,
+        require_snapshot=require_snapshot,
+    )
+    if not visit_status:
+        return None
+    return (visit, visit_status)
diff --git a/swh/storage/tests/algos/test_origin.py b/swh/storage/tests/algos/test_origin.py
--- a/swh/storage/tests/algos/test_origin.py
+++ b/swh/storage/tests/algos/test_origin.py
@@ -7,7 +7,13 @@
 
 from unittest.mock import patch
 
-from swh.storage.algos.origin import iter_origins
+from swh.model.model import Origin, OriginVisit, OriginVisitStatus, Snapshot
+
+from swh.storage.algos.origin import iter_origins, origin_get_latest_visit_status
+from swh.storage.utils import now
+
+from swh.storage.tests.test_storage import round_to_milliseconds
+from swh.storage.tests.storage_data import data
 
 
 def assert_list_eq(left, right, msg=None):
@@ -69,3 +75,232 @@
 
     list(iter_origins(swh_storage, batch_size=42))
     mock_origin_get_range.assert_called_with(origin_from=1, origin_count=42)
+
+
+def test_origin_get_latest_status_none(swh_storage):
+    """Looking up unknown objects should return nothing
+
+    """
+    # unknown origin so no result
+    assert origin_get_latest_visit_status(swh_storage, "unknown-origin") is None
+
+    # unknown type so no result
+    origin = Origin.from_dict(data.origin)
+    swh_storage.origin_add_one(origin)
+    swh_storage.origin_visit_add(
+        [
+            OriginVisit(
+                origin=origin.url,
+                date=data.date_visit1,
+                type="git",
+                status="ongoing",
+                snapshot=None,
+            ),
+        ]
+    )[0]
+    actual_origin_visit = origin_get_latest_visit_status(
+        swh_storage, origin.url, type="unknown"
+    )
+    assert actual_origin_visit is None
+
+    actual_origin_visit = origin_get_latest_visit_status(
+        swh_storage, origin.url, require_snapshot=True
+    )
+    assert actual_origin_visit is None
+
+    actual_origin_visit = origin_get_latest_visit_status(
+        swh_storage, origin.url, allowed_statuses=["unknown"]
+    )
+    assert actual_origin_visit is None
+
+
+def init_storage_with_origin_visits(swh_storage):
+    """Initialize storage with origin/origin-visit/origin-visit-status
+
+    """
+    origin1 = Origin.from_dict(data.origin)
+    origin2 = Origin.from_dict(data.origin2)
+    swh_storage.origin_add([origin1, origin2])
+
+    ov1, ov2 = swh_storage.origin_visit_add(
+        [
+            OriginVisit(
+                origin=origin1.url,
+                date=data.date_visit1,
+                type=data.type_visit1,
+                status="ongoing",
+                snapshot=None,
+            ),
+            OriginVisit(
+                origin=origin2.url,
+                date=data.date_visit2,
+                type=data.type_visit2,
+                status="ongoing",
+                snapshot=None,
+            ),
+        ]
+    )
+
+    snapshot = Snapshot.from_dict(data.complete_snapshot)
+    swh_storage.snapshot_add([snapshot])
+
+    date_now = now()
+    date_now = round_to_milliseconds(date_now)
+    assert data.date_visit1 < data.date_visit2
+    assert data.date_visit2 < date_now
+
+    # origin visit status 1 for origin visit 1
+    ovs11 = OriginVisitStatus(
+        origin=origin1.url,
+        visit=ov1.visit,
+        date=data.date_visit1,
+        status="partial",
+        snapshot=None,
+    )
+    # origin visit status 2 for origin visit 1
+    ovs12 = OriginVisitStatus(
+        origin=origin1.url,
+        visit=ov1.visit,
+        date=data.date_visit2,
+        status="ongoing",
+        snapshot=None,
+    )
+    # origin visit status 1 for origin visit 2
+    ovs21 = OriginVisitStatus(
+        origin=origin2.url,
+        visit=ov2.visit,
+        date=data.date_visit2,
+        status="ongoing",
+        snapshot=None,
+    )
+    # origin visit status 2 for origin visit 2
+    ovs22 = OriginVisitStatus(
+        origin=origin2.url,
+        visit=ov2.visit,
+        date=date_now,
+        status="full",
+        snapshot=snapshot.id,
+        metadata={"something": "wicked"},
+    )
+
+    swh_storage.origin_visit_status_add([ovs11, ovs12, ovs21, ovs22])
+    return {
+        "origin": [origin1, origin2],
+        "origin_visit": [ov1, ov2],
+        "origin_visit_status": [ovs11, ovs12, ovs21, ovs22],
+    }
+
+
+def test_origin_get_latest_status_filter_type(swh_storage):
+    """Filtering origin visit per types should yield consistent results
+
+    """
+    objects = init_storage_with_origin_visits(swh_storage)
+    origin1, origin2 = objects["origin"]
+    ov1, ov2 = objects["origin_visit"]
+    ovs11, ovs12, _, ovs22 = objects["origin_visit_status"]
+
+    # no visit for origin1 url with type_visit2
+    assert (
+        origin_get_latest_visit_status(swh_storage, origin1.url, type=data.type_visit2)
+        is None
+    )
+
+    # no visit for origin2 url with type_visit1
+    assert (
+        origin_get_latest_visit_status(swh_storage, origin2.url, type=data.type_visit1)
+        is None
+    )
+
+    # Two visit statuses, both with no snapshot, take the most recent
+    actual_ov1, actual_ovs12 = origin_get_latest_visit_status(
+        swh_storage, origin1.url, type=data.type_visit1
+    )
+    assert isinstance(actual_ov1, OriginVisit)
+    assert isinstance(actual_ovs12, OriginVisitStatus)
+    assert actual_ov1.origin == ov1.origin
+    assert actual_ov1.visit == ov1.visit
+    assert actual_ov1.type == data.type_visit1
+    assert actual_ovs12 == ovs12
+
+    # take the most recent status of the type_visit2 visit
+    actual_ov2, actual_ovs22 = origin_get_latest_visit_status(
+        swh_storage, origin2.url, type=data.type_visit2
+    )
+    assert isinstance(actual_ov2, OriginVisit)
+    assert isinstance(actual_ovs22, OriginVisitStatus)
+    assert actual_ov2.origin == ov2.origin
+    assert actual_ov2.visit == ov2.visit
+    assert actual_ov2.type == data.type_visit2
+    assert actual_ovs22 == ovs22
+
+
+def test_origin_get_latest_status_filter_status(swh_storage):
+    objects = init_storage_with_origin_visits(swh_storage)
+    origin1, origin2 = objects["origin"]
+    ov1, ov2 = objects["origin_visit"]
+    ovs11, ovs12, _, ovs22 = objects["origin_visit_status"]
+
+    # no failed status for that visit
+    assert (
+        origin_get_latest_visit_status(
+            swh_storage, origin2.url, allowed_statuses=["failed"]
+        )
+        is None
+    )
+
+    # only 1 partial for that visit
+    actual_ov1, actual_ovs11 = origin_get_latest_visit_status(
+        swh_storage, origin1.url, allowed_statuses=["partial"]
+    )
+    assert actual_ov1.origin == ov1.origin
+    assert actual_ov1.visit == ov1.visit
+    assert actual_ov1.type == data.type_visit1
+    assert actual_ovs11 == ovs11
+
+    # both status exist, take the latest one
+    actual_ov1, actual_ovs12 = origin_get_latest_visit_status(
+        swh_storage, origin1.url, allowed_statuses=["partial", "ongoing"]
+    )
+    assert actual_ov1.origin == ov1.origin
+    assert actual_ov1.visit == ov1.visit
+    assert actual_ov1.type == data.type_visit1
+    assert actual_ovs12 == ovs12
+
+    assert isinstance(actual_ov1, OriginVisit)
+    assert isinstance(actual_ovs12, OriginVisitStatus)
+    assert actual_ov1.origin == ov1.origin
+    assert actual_ov1.visit == ov1.visit
+    assert actual_ov1.type == data.type_visit1
+    assert actual_ovs12 == ovs12
+
+    # only the "full" status of origin2's visit matches
+    actual_ov2, actual_ovs22 = origin_get_latest_visit_status(
+        swh_storage, origin2.url, allowed_statuses=["full"]
+    )
+    assert actual_ov2.origin == ov2.origin
+    assert actual_ov2.visit == ov2.visit
+    assert actual_ov2.type == data.type_visit2
+    assert actual_ovs22 == ovs22
+
+
+def test_origin_get_latest_status_filter_snapshot(swh_storage):
+    objects = init_storage_with_origin_visits(swh_storage)
+    origin1, origin2 = objects["origin"]
+    _, ov2 = objects["origin_visit"]
+    _, _, _, ovs22 = objects["origin_visit_status"]
+
+    # no status of origin1's visit has a snapshot
+    assert (
+        origin_get_latest_visit_status(swh_storage, origin1.url, require_snapshot=True)
+        is None
+    )
+
+    # only the "full" status of origin2's visit carries a snapshot
+    actual_ov2, actual_ovs22 = origin_get_latest_visit_status(
+        swh_storage, origin2.url, require_snapshot=True
+    )
+    assert actual_ov2.origin == ov2.origin
+    assert actual_ov2.visit == ov2.visit
+    assert actual_ov2.type == ov2.type
+    assert actual_ovs22 == ovs22