diff --git a/swh/storage/algos/snapshot.py b/swh/storage/algos/snapshot.py --- a/swh/storage/algos/snapshot.py +++ b/swh/storage/algos/snapshot.py @@ -1,8 +1,14 @@ -# Copyright (C) 2018 The Software Heritage developers +# Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from typing import Iterable, Optional + +from swh.model.model import Snapshot + +from swh.storage.algos.origin import origin_get_latest_visit_status + def snapshot_get_all_branches(storage, snapshot_id): """Get all the branches for a given snapshot @@ -28,3 +34,61 @@ next_branch = data.get("next_branch") return ret + + +def snapshot_get_latest( + storage, + origin: str, + allowed_statuses: Optional[Iterable[str]] = None, + branches_count: Optional[int] = None, +) -> Optional[Snapshot]: + """Get the latest snapshot for the given origin, optionally only from visits that have + one of the given allowed_statuses. + + The branches of the snapshot are iterated in the lexicographical + order of their names. + + Args: + storage: Storage instance + origin: the origin's URL + allowed_statuses: list of visit statuses considered + to find the latest snapshot for the visit. For instance, + ``allowed_statuses=['full']`` will only consider visits that + have successfully run to completion. + branches_count: Optional parameter to retrieve snapshot with all branches + (default behavior when None) or not. If set to positive number, the snapshot + will be partial with only that number of branches. + + Raises: + ValueError if branches_count is not a positive value + + Returns: + The snapshot object if one is found matching the criteria or None. + + """ + visit_and_status = origin_get_latest_visit_status( + storage, origin, allowed_statuses=allowed_statuses, require_snapshot=True + ) + + if not visit_and_status: + return None + + _, visit_status = visit_and_status + snapshot_id = visit_status.snapshot + if not snapshot_id: + return None + + if branches_count: # partial snapshot + if not isinstance(branches_count, int) or branches_count <= 0: + raise ValueError( + "Parameter branches_count must be a positive integer. " + f"Current value is {branches_count}" + ) + snapshot = storage.snapshot_get_branches( + snapshot_id, branches_count=branches_count + ) + snapshot.pop("next_branch") + else: + snapshot = snapshot_get_all_branches(storage, snapshot_id) + + return Snapshot.from_dict(snapshot) diff --git a/swh/storage/cassandra/storage.py b/swh/storage/cassandra/storage.py --- a/swh/storage/cassandra/storage.py +++ b/swh/storage/cassandra/storage.py @@ -583,17 +583,6 @@ return self.snapshot_get(visit["snapshot"]) - def snapshot_get_latest(self, origin, allowed_statuses=None): - visit = self.origin_visit_get_latest( - origin, allowed_statuses=allowed_statuses, require_snapshot=True - ) - - if visit: - assert visit["snapshot"] - if self._cql_runner.snapshot_missing([visit["snapshot"]]): - raise StorageArgumentException("Visit references unknown snapshot") - return self.snapshot_get_branches(visit["snapshot"]) - def snapshot_count_branches(self, snapshot_id): if self._cql_runner.snapshot_missing([snapshot_id]): # Makes sure we don't fetch branches for a snapshot that is diff --git a/swh/storage/in_memory.py b/swh/storage/in_memory.py --- a/swh/storage/in_memory.py +++ b/swh/storage/in_memory.py @@ -583,22 +583,6 @@ else: return None - def snapshot_get_latest(self, origin, allowed_statuses=None): - origin_url = self._get_origin_url(origin) - if not origin_url: - return - - visit = self.origin_visit_get_latest( - origin_url, allowed_statuses=allowed_statuses, require_snapshot=True - ) - if visit and visit["snapshot"]: - snapshot = self.snapshot_get(visit["snapshot"]) - if not snapshot: - raise StorageArgumentException( - "last origin visit references an unknown snapshot" - ) - return snapshot - def snapshot_count_branches(self, snapshot_id): snapshot = self._snapshots[snapshot_id] return collections.Counter( diff --git a/swh/storage/interface.py b/swh/storage/interface.py --- a/swh/storage/interface.py +++ b/swh/storage/interface.py @@ -695,37 +695,6 @@ """ ... - @remote_api_endpoint("snapshot/latest") - def snapshot_get_latest(self, origin, allowed_statuses=None): - """Get the content, possibly partial, of the latest snapshot for the - given origin, optionally only from visits that have one of the given - allowed_statuses - - The branches of the snapshot are iterated in the lexicographical - order of their names. - - .. warning:: At most 1000 branches contained in the snapshot will be - returned for performance reasons. In order to browse the whole - set of branches, the method :meth:`snapshot_get_branches` - should be used instead. - - Args: - origin (str): the origin's URL - allowed_statuses (list of str): list of visit statuses considered - to find the latest snapshot for the visit. For instance, - ``allowed_statuses=['full']`` will only consider visits that - have successfully run to completion. - Returns: - dict: a dict with three keys: - * **id**: identifier of the snapshot - * **branches**: a dict of branches contained in the snapshot - whose keys are the branches' names. - * **next_branch**: the name of the first branch not returned - or :const:`None` if the snapshot has less than 1000 - branches. - """ - ... - @remote_api_endpoint("snapshot/count_branches") def snapshot_count_branches(self, snapshot_id): """Count the number of branches in the snapshot with the given id diff --git a/swh/storage/storage.py b/swh/storage/storage.py --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -735,30 +735,6 @@ return None - @timed - @db_transaction(statement_timeout=4000) - def snapshot_get_latest(self, origin, allowed_statuses=None, db=None, cur=None): - if isinstance(origin, int): - origin = self.origin_get({"id": origin}, db=db, cur=cur) - if not origin: - return - origin = origin["url"] - - origin_visit = self.origin_visit_get_latest( - origin, - allowed_statuses=allowed_statuses, - require_snapshot=True, - db=db, - cur=cur, - ) - if origin_visit and origin_visit["snapshot"]: - snapshot = self.snapshot_get(origin_visit["snapshot"], db=db, cur=cur) - if not snapshot: - raise StorageArgumentException( - "last origin visit references an unknown snapshot" - ) - return snapshot - @timed @db_transaction(statement_timeout=2000) def snapshot_count_branches(self, snapshot_id, db=None, cur=None): diff --git a/swh/storage/tests/algos/test_snapshot.py b/swh/storage/tests/algos/test_snapshot.py --- a/swh/storage/tests/algos/test_snapshot.py +++ b/swh/storage/tests/algos/test_snapshot.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018 The Software Heritage developers +# Copyright (C) 2018-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -6,10 +6,14 @@ from hypothesis import given import pytest -from swh.model.identifiers import snapshot_identifier, identifier_to_bytes from swh.model.hypothesis_strategies import snapshots, branch_names, branch_targets +from swh.model.identifiers import snapshot_identifier, identifier_to_bytes +from swh.model.model import Origin, OriginVisit, OriginVisitStatus, Snapshot + +from swh.storage.algos.snapshot import snapshot_get_all_branches, snapshot_get_latest +from swh.storage.utils import now -from swh.storage.algos.snapshot import snapshot_get_all_branches +from swh.storage.tests.storage_data import data @pytest.fixture @@ -42,3 +46,106 @@ returned_snapshot = snapshot_get_all_branches(swh_storage, snapshot["id"]) assert snapshot == returned_snapshot + + +def test_snapshot_get_latest_none(swh_storage): + """Retrieve latest snapshot on unknown origin or origin without snapshot should + yield no result + + """ + assert snapshot_get_latest(swh_storage, "unknown-origin") is None + + # no snapshot on origin visit then nothing is found + origin = Origin.from_dict(data.origin) + swh_storage.origin_add_one(origin) + swh_storage.origin_visit_add( + [ + OriginVisit( + origin=origin.url, + date=data.date_visit1, + type=data.type_visit1, + status="ongoing", + snapshot=None, + ) + ] + ) + assert snapshot_get_latest(swh_storage, origin.url) is None + + +def test_snapshot_get_latest(swh_storage): + origin = Origin.from_dict(data.origin) + swh_storage.origin_add_one(origin) + + visit1 = OriginVisit( + origin=origin.url, + date=data.date_visit1, + type=data.type_visit1, + status="ongoing", + snapshot=None, + ) + ov1 = swh_storage.origin_visit_add([visit1])[0] + + # Add snapshot to visit1, latest snapshot = visit 1 snapshot + complete_snapshot = Snapshot.from_dict(data.complete_snapshot) + swh_storage.snapshot_add([complete_snapshot]) + + swh_storage.origin_visit_status_add( + [ + OriginVisitStatus( + origin=origin.url, + visit=ov1.visit, + date=data.date_visit2, + status="partial", + snapshot=None, + ) + ] + ) + assert data.date_visit1 < data.date_visit2 + + # no snapshot associated to the visit, so None + actual_snapshot = snapshot_get_latest( + swh_storage, origin.url, allowed_statuses=["partial"] + ) + assert actual_snapshot is None + + date_now = now() + assert data.date_visit2 < date_now + swh_storage.origin_visit_status_add( + [ + OriginVisitStatus( + origin=origin.url, + visit=ov1.visit, + date=date_now, + status="full", + snapshot=complete_snapshot.id, + ) + ] + ) + + actual_snapshot = snapshot_get_latest(swh_storage, origin.url) + assert actual_snapshot is not None + assert actual_snapshot == complete_snapshot + + swh_storage.origin_visit_status_add( + [ + OriginVisitStatus( + origin=origin.url, + visit=ov1.visit, + date=date_now, + status="full", + snapshot=complete_snapshot.id, + ) + ] + ) + + actual_snapshot = snapshot_get_latest(swh_storage, origin.url) + assert actual_snapshot is not None + assert actual_snapshot == complete_snapshot + + actual_snapshot = snapshot_get_latest(swh_storage, origin.url, branches_count=1) + assert actual_snapshot is not None + assert actual_snapshot.id == complete_snapshot.id + assert len(actual_snapshot.branches.values()) == 1 + + with pytest.raises(ValueError, match="branches_count must be a positive integer"): + snapshot_get_latest(swh_storage, origin.url, branches_count="something-wrong") diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py --- a/swh/storage/tests/test_storage.py +++ b/swh/storage/tests/test_storage.py @@ -2863,213 +2863,6 @@ ("origin_visit_status", OriginVisitStatus.from_dict(data4),), ] - def test_snapshot_get_latest(self, swh_storage): - origin_url = swh_storage.origin_add_one(data.origin) - visit1 = OriginVisit( - origin=origin_url, - date=data.date_visit1, - type=data.type_visit1, - status="ongoing", - snapshot=None, - ) - visit2 = OriginVisit( - origin=origin_url, - date=data.date_visit2, - type=data.type_visit2, - status="ongoing", - snapshot=None, - ) - # Add a visit with the same date as the previous one - visit3 = OriginVisit( - origin=origin_url, - date=data.date_visit2, - type=data.type_visit3, - status="ongoing", - snapshot=None, - ) - ov1, ov2, ov3 = swh_storage.origin_visit_add([visit1, visit2, visit3]) - # Two visits, both with no snapshot: latest snapshot is None - assert swh_storage.snapshot_get_latest(origin_url) is None - - # Add snapshot to visit1, latest snapshot = visit 1 snapshot - swh_storage.snapshot_add([data.complete_snapshot]) - swh_storage.origin_visit_status_add( - [ - OriginVisitStatus( - origin=origin_url, - visit=ov1.visit, - date=now(), - status="ongoing", - snapshot=data.complete_snapshot["id"], - ) - ] - ) - assert { - **data.complete_snapshot, - "next_branch": None, - } == swh_storage.snapshot_get_latest(origin_url) - - # Status filter: all three visits are status=ongoing, so no snapshot - # returned - assert ( - swh_storage.snapshot_get_latest(origin_url, allowed_statuses=["full"]) - is None - ) - - # Mark the first visit as completed and check status filter again - swh_storage.origin_visit_status_add( - [ - OriginVisitStatus( - origin=origin_url, - visit=ov1.visit, - date=now(), - status="full", - snapshot=data.complete_snapshot["id"], - ) - ] - ) - assert { - **data.complete_snapshot, - "next_branch": None, - } == swh_storage.snapshot_get_latest(origin_url, allowed_statuses=["full"]) - - # Add snapshot to visit2 and check that the new snapshot is returned - swh_storage.snapshot_add([data.empty_snapshot]) - swh_storage.origin_visit_status_add( - [ - OriginVisitStatus( - origin=origin_url, - visit=ov2.visit, - date=now(), - status="ongoing", - snapshot=data.empty_snapshot["id"], - ) - ] - ) - assert { - **data.empty_snapshot, - "next_branch": None, - } == swh_storage.snapshot_get_latest(origin_url) - - # Check that the status filter is still working - assert { - **data.complete_snapshot, - "next_branch": None, - } == swh_storage.snapshot_get_latest(origin_url, allowed_statuses=["full"]) - - # Add snapshot to visit3 (same date as visit2) and check that - # the new snapshot is returned - swh_storage.snapshot_add([data.complete_snapshot]) - swh_storage.origin_visit_status_add( - [ - OriginVisitStatus( - origin=origin_url, - visit=ov3.visit, - date=now(), - status="ongoing", - snapshot=data.complete_snapshot["id"], - ) - ] - ) - assert { - **data.complete_snapshot, - "next_branch": None, - } == swh_storage.snapshot_get_latest(origin_url) - - def test_snapshot_get_latest__missing_snapshot(self, swh_storage): - origin_url = swh_storage.origin_add_one(data.origin) - assert swh_storage.snapshot_get_latest(origin_url) is None - visit1 = OriginVisit( - origin=origin_url, - date=data.date_visit1, - type=data.type_visit1, - status="ongoing", - snapshot=None, - ) - visit2 = OriginVisit( - origin=origin_url, - date=data.date_visit2, - type=data.type_visit2, - status="ongoing", - snapshot=None, - ) - ov1, ov2 = swh_storage.origin_visit_add([visit1, visit2]) - - # Two visits, both with no snapshot: latest snapshot is None - assert swh_storage.snapshot_get_latest(origin_url) is None - - # Add unknown snapshot to visit1, check that the inconsistency is - # detected - swh_storage.origin_visit_status_add( - [ - OriginVisitStatus( - origin=origin_url, - visit=ov1.visit, - date=now(), - status="ongoing", - snapshot=data.complete_snapshot["id"], - ) - ] - ) - with pytest.raises(Exception): - # XXX: should the exception be more specific than this? - swh_storage.snapshot_get_latest(origin_url) - - # Status filter: both visits are status=ongoing, so no snapshot - # returned - assert ( - swh_storage.snapshot_get_latest(origin_url, allowed_statuses=["full"]) - is None - ) - - # Mark the first visit as completed and check status filter again - swh_storage.origin_visit_status_add( - [ - OriginVisitStatus( - origin=origin_url, - visit=ov1.visit, - date=now(), - status="full", - snapshot=data.complete_snapshot["id"], - ) - ] - ) - - with pytest.raises(Exception): - # XXX: should the exception be more specific than this? - swh_storage.snapshot_get_latest(origin_url, allowed_statuses=["full"]), - - # Actually add the snapshot and check status filter again - swh_storage.snapshot_add([data.complete_snapshot]) - assert { - **data.complete_snapshot, - "next_branch": None, - } == swh_storage.snapshot_get_latest(origin_url) - - # Add unknown snapshot to visit2 and check that the inconsistency - # is detected - swh_storage.origin_visit_status_add( - [ - OriginVisitStatus( - origin=origin_url, - visit=ov2.visit, - date=now(), - status="ongoing", - snapshot=data.snapshot["id"], - ) - ] - ) - with pytest.raises(Exception): - # XXX: should the exception be more specific than this? - swh_storage.snapshot_get_latest(origin_url) - - # Actually add that snapshot and check that the new one is returned - swh_storage.snapshot_add([data.snapshot]) - assert { - **data.snapshot, - "next_branch": None, - } == swh_storage.snapshot_get_latest(origin_url) - def test_snapshot_get_random(self, swh_storage): swh_storage.snapshot_add( [data.snapshot, data.empty_snapshot, data.complete_snapshot]