diff --git a/mypy.ini b/mypy.ini --- a/mypy.ini +++ b/mypy.ini @@ -14,8 +14,5 @@ # [mypy-add_your_lib_here.*] # ignore_missing_imports = True -[mypy-swh.loader.*] -ignore_missing_imports = True - [mypy-breezy.*] -ignore_missing_imports = True \ No newline at end of file +ignore_missing_imports = True diff --git a/swh/loader/bzr/loader.py b/swh/loader/bzr/loader.py --- a/swh/loader/bzr/loader.py +++ b/swh/loader/bzr/loader.py @@ -23,7 +23,7 @@ from swh.loader.core.loader import BaseLoader from swh.loader.core.utils import clean_dangling_folders, clone_with_timeout -from swh.model import from_disk +from swh.model import from_disk, swhids from swh.model.model import ( Content, ExtID, @@ -40,6 +40,7 @@ Timestamp, TimestampWithTimezone, ) +from swh.storage.algos.snapshot import snapshot_get_latest from swh.storage.interface import StorageInterface TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.bzr.from_disk" @@ -159,6 +160,9 @@ # Revisions that are pointed to, but don't exist in the current branch # Rare, but exist usually for cross-VCS references. self._ghosts: Set[BzrRevisionId] = set() + # Exists if in an incremental run, is the latest saved revision from + # this origin + self._latest_head: Optional[BzrRevisionId] = None self._load_status = "eventful" self.origin_url = url @@ -189,6 +193,9 @@ """Second step executed by the loader to prepare some state needed by the loader. """ + latest_snapshot = snapshot_get_latest(self.storage, self.origin_url) + if latest_snapshot: + self._set_recorded_state(latest_snapshot) def load_status(self) -> Dict[str, str]: """Detailed loading status. @@ -203,6 +210,37 @@ "status": self._load_status, } + def _set_recorded_state(self, latest_snapshot: Snapshot) -> None: + head = latest_snapshot.branches[b"trunk"] + bzr_head = self._get_extids_for_targets([head.target])[0].extid + self._latest_head = BzrRevisionId(bzr_head) + + def _get_extids_for_targets(self, targets: List[Sha1Git]) -> List[ExtID]: + """Get all Bzr ExtIDs for the targets in the latest snapshot""" + extids = [] + for extid in self.storage.extid_get_from_target( + swhids.ObjectType.REVISION, + targets, + extid_type=EXTID_TYPE, + extid_version=EXTID_VERSION, + ): + extids.append(extid) + self._revision_id_to_sha1git[ + BzrRevisionId(extid.extid) + ] = extid.target.object_id + + if extids: + # Filter out dangling extids, we need to load their target again + revisions_missing = self.storage.revision_missing( + [extid.target.object_id for extid in extids] + ) + extids = [ + extid + for extid in extids + if extid.target.object_id not in revisions_missing + ] + return extids + def cleanup(self) -> None: if self.repo is not None: self.repo.unlock() @@ -438,7 +476,28 @@ ancestry.append((rev, parents)) sorter = tsort.TopoSorter(ancestry) - return sorter.sorted() + all_revisions = sorter.sorted() + if self._latest_head is not None: + # Breezy does not offer a generic querying system, so we do the + # filtering ourselves, which is simple enough given that bzr does + # not have multiple heads per branch + found = False + new_revisions = [] + # Filter out revisions until we reach the one we've already seen + for rev in all_revisions: + if not found: + if rev == self._latest_head: + found = True + else: + new_revisions.append(rev) + if not found and all_revisions: + # The previously saved head has been uncommitted, reload + # everything + msg = "Previous head (%s) not found, loading all revisions" + self.log.debug(msg, self._latest_head) + return all_revisions + return new_revisions + return all_revisions def _iterate_ancestors(self, rev: BzrRevision) -> Iterator[BzrRevisionId]: """Return an iterator of this revision's ancestors""" @@ -503,7 +562,18 @@ def _get_revision_id_from_bzr_id(self, bzr_id: BzrRevisionId) -> Sha1Git: """Return the git sha1 of a revision given its bazaar revision id.""" - return self._revision_id_to_sha1git[bzr_id] + from_cache = self._revision_id_to_sha1git.get(bzr_id) + if from_cache is not None: + return from_cache + # The parent was not loaded in this run, get it from storage + from_storage = self.storage.extid_get_from_extid( + EXTID_TYPE, ids=[bzr_id], version=EXTID_VERSION + ) + + if len(from_storage) != 1: + msg = "Expected 1 match from storage for bzr node %r, got %d" + raise LookupError(msg % (bzr_id.hex(), len(from_storage))) + return from_storage[0].target.object_id @property def branch(self) -> BzrBranch: diff --git a/swh/loader/bzr/tests/test_loader.py b/swh/loader/bzr/tests/test_loader.py --- a/swh/loader/bzr/tests/test_loader.py +++ b/swh/loader/bzr/tests/test_loader.py @@ -2,8 +2,10 @@ # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +import os from pathlib import Path +from breezy.builtins import cmd_uncommit import pytest import swh.loader.bzr.loader as loader_mod @@ -47,6 +49,14 @@ # - No match from storage (wrong topo sort or broken rev) +def do_uncommit(repo_url): + """Remove the latest revision from the given bzr repo""" + uncommit_cmd = cmd_uncommit() + with open(os.devnull, "w") as f: + uncommit_cmd.outf = f + uncommit_cmd.run(repo_url) + + @pytest.mark.parametrize("do_clone", [False, True]) def test_nominal(swh_storage, datadir, tmp_path, do_clone): archive_path = Path(datadir, "nominal.tgz") @@ -295,3 +305,106 @@ directory[b"a/decently"] directory[b"a"] directory[b"another_node"] + + +def test_incremental_noop(swh_storage, datadir, tmp_path): + """Check that nothing happens if we try to load a repo twice in a row""" + archive_path = Path(datadir, "nominal.tgz") + repo_url = prepare_repository_from_archive(archive_path, "nominal", tmp_path) + + loader = BazaarLoader(swh_storage, repo_url, directory=repo_url) + res = loader.load() + assert res == {"status": "eventful"} + loader = BazaarLoader(swh_storage, repo_url, directory=repo_url) + res = loader.load() + assert res == {"status": "uneventful"} + + +def test_incremental_nominal(swh_storage, datadir, tmp_path): + """Check that an updated repository does update after the second run, but + is still a noop in the third run.""" + archive_path = Path(datadir, "nominal.tgz") + repo_url = prepare_repository_from_archive(archive_path, "nominal", tmp_path) + + # remove 2 latest commits + do_uncommit(repo_url) + do_uncommit(repo_url) + + loader = BazaarLoader(swh_storage, repo_url, directory=repo_url) + res = loader.load() + assert res == {"status": "eventful"} + stats = get_stats(swh_storage) + assert stats == { + "content": 6, + "directory": 4, + "origin": 1, + "origin_visit": 1, + "release": 2, + "revision": 4, + "skipped_content": 0, + "snapshot": 1, + } + + # Load the complete repo now + repo_url = prepare_repository_from_archive(archive_path, "nominal", tmp_path) + + loader = BazaarLoader(swh_storage, repo_url, directory=repo_url) + res = loader.load() + assert res == {"status": "eventful"} + + stats = get_stats(swh_storage) + expected_stats = { + "content": 7, + "directory": 7, + "origin": 1, + "origin_visit": 2, + "release": 3, + "revision": 6, + "skipped_content": 0, + "snapshot": 2, + } + + assert stats == expected_stats + + # Nothing should change + loader = BazaarLoader(swh_storage, repo_url, directory=repo_url) + res = loader.load() + assert res == {"status": "uneventful"} + + stats = get_stats(swh_storage) + assert stats == {**expected_stats, "origin_visit": 2 + 1} + + +def test_incremental_uncommitted_head(swh_storage, datadir, tmp_path): + """Check that doing an incremental run with the saved head missing does not + error out but instead loads everything correctly""" + archive_path = Path(datadir, "nominal.tgz") + repo_url = prepare_repository_from_archive(archive_path, "nominal", tmp_path) + + loader = BazaarLoader(swh_storage, repo_url, directory=repo_url) + res = loader.load() + assert res == {"status": "eventful"} + stats = get_stats(swh_storage) + expected_stats = { + "content": 7, + "directory": 7, + "origin": 1, + "origin_visit": 1, + "release": 3, + "revision": 6, + "skipped_content": 0, + "snapshot": 1, + } + + assert stats == expected_stats + + # Remove the previously saved head + do_uncommit(repo_url) + + loader = BazaarLoader(swh_storage, repo_url, directory=repo_url) + res = loader.load() + assert res == {"status": "eventful"} + + # Everything is loaded correctly + stats = get_stats(swh_storage) + assert stats == {**expected_stats, "origin_visit": 1 + 1, "snapshot": 1 + 1}