Page Menu — Home — Software Heritage

D7059.diff
No OneTemporary

D7059.diff

diff --git a/swh/loader/bzr/loader.py b/swh/loader/bzr/loader.py
--- a/swh/loader/bzr/loader.py
+++ b/swh/loader/bzr/loader.py
@@ -23,7 +23,7 @@
from swh.loader.core.loader import BaseLoader
from swh.loader.core.utils import clean_dangling_folders, clone_with_timeout
-from swh.model import from_disk
+from swh.model import from_disk, swhids
from swh.model.model import (
Content,
ExtID,
@@ -40,6 +40,7 @@
Timestamp,
TimestampWithTimezone,
)
+from swh.storage.algos.snapshot import snapshot_get_latest
from swh.storage.interface import StorageInterface
TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.bzr.from_disk"
@@ -159,6 +160,9 @@
# Revisions that are pointed to, but don't exist in the current branch
# Rare, but exist usually for cross-VCS references.
self._ghosts: Set[BzrRevisionId] = set()
+    # Set only during an incremental run: the latest revision already
+    # saved for this origin
+ self._latest_head: Optional[BzrRevisionId] = None
self._load_status = "eventful"
self.origin_url = url
@@ -189,6 +193,9 @@
"""Second step executed by the loader to prepare some state needed by
the loader.
"""
+ latest_snapshot = snapshot_get_latest(self.storage, self.origin_url)
+ if latest_snapshot:
+ self._set_recorded_state(latest_snapshot)
def load_status(self) -> Dict[str, str]:
"""Detailed loading status.
@@ -203,6 +210,37 @@
"status": self._load_status,
}
+ def _set_recorded_state(self, latest_snapshot: Snapshot) -> None:
+ head = latest_snapshot.branches[b"trunk"]
+ bzr_head = self._get_extids_for_targets([head.target])[0].extid
+ self._latest_head = BzrRevisionId(bzr_head)
+
+ def _get_extids_for_targets(self, targets: List[Sha1Git]) -> List[ExtID]:
+ """Get all Bzr ExtIDs for the targets in the latest snapshot"""
+ extids = []
+ for extid in self.storage.extid_get_from_target(
+ swhids.ObjectType.REVISION,
+ targets,
+ extid_type=EXTID_TYPE,
+ extid_version=EXTID_VERSION,
+ ):
+ extids.append(extid)
+ self._revision_id_to_sha1git[
+ BzrRevisionId(extid.extid)
+ ] = extid.target.object_id
+
+ if extids:
+ # Filter out dangling extids, we need to load their target again
+ revisions_missing = self.storage.revision_missing(
+ [extid.target.object_id for extid in extids]
+ )
+ extids = [
+ extid
+ for extid in extids
+ if extid.target.object_id not in revisions_missing
+ ]
+ return extids
+
def cleanup(self) -> None:
if self.repo is not None:
self.repo.unlock()
@@ -438,7 +476,28 @@
ancestry.append((rev, parents))
sorter = tsort.TopoSorter(ancestry)
- return sorter.sorted()
+ all_revisions = sorter.sorted()
+ if self._latest_head is not None:
+ # Breezy does not offer a generic querying system, so we do the
+ # filtering ourselves, which is simple enough given that bzr does
+ # not have multiple heads per branch
+ found = False
+ new_revisions = []
+ # Filter out revisions until we reach the one we've already seen
+ for rev in all_revisions:
+            if not found:
+                if rev == self._latest_head:
+                    found = True
+            else:
+                new_revisions.append(rev)
+ if not found and all_revisions:
+ # The previously saved head has been uncommitted, reload
+ # everything
+ msg = "Previous head (%s) not found, loading all revisions"
+ self.log.debug(msg, self._latest_head)
+ return all_revisions
+ return new_revisions
+ return all_revisions
def _iterate_ancestors(self, rev: BzrRevision) -> Iterator[BzrRevisionId]:
"""Return an iterator of this revision's ancestors"""
@@ -503,7 +562,18 @@
def _get_revision_id_from_bzr_id(self, bzr_id: BzrRevisionId) -> Sha1Git:
"""Return the git sha1 of a revision given its bazaar revision id."""
- return self._revision_id_to_sha1git[bzr_id]
+ from_cache = self._revision_id_to_sha1git.get(bzr_id)
+ if from_cache is not None:
+ return from_cache
+ # The parent was not loaded in this run, get it from storage
+ from_storage = self.storage.extid_get_from_extid(
+ EXTID_TYPE, ids=[bzr_id], version=EXTID_VERSION
+ )
+
+ if len(from_storage) != 1:
+ msg = "Expected 1 match from storage for bzr node %r, got %d"
+ raise LookupError(msg % (bzr_id.hex(), len(from_storage)))
+ return from_storage[0].target.object_id
@property
def branch(self) -> BzrBranch:
diff --git a/swh/loader/bzr/tests/test_loader.py b/swh/loader/bzr/tests/test_loader.py
--- a/swh/loader/bzr/tests/test_loader.py
+++ b/swh/loader/bzr/tests/test_loader.py
@@ -2,8 +2,10 @@
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import os
from pathlib import Path
+from breezy.builtins import cmd_uncommit
import pytest
import swh.loader.bzr.loader as loader_mod
@@ -47,6 +49,14 @@
# - No match from storage (wrong topo sort or broken rev)
+def do_uncommit(repo_url):
+ """Remove the latest revision from the given bzr repo"""
+ uncommit_cmd = cmd_uncommit()
+ with open(os.devnull, "w") as f:
+ uncommit_cmd.outf = f
+ uncommit_cmd.run(repo_url)
+
+
@pytest.mark.parametrize("do_clone", [False, True])
def test_nominal(swh_storage, datadir, tmp_path, do_clone):
archive_path = Path(datadir, "nominal.tgz")
@@ -295,3 +305,106 @@
directory[b"a/decently"]
directory[b"a"]
directory[b"another_node"]
+
+
+def test_incremental_noop(swh_storage, datadir, tmp_path):
+ """Check that nothing happens if we try to load a repo twice in a row"""
+ archive_path = Path(datadir, "nominal.tgz")
+ repo_url = prepare_repository_from_archive(archive_path, "nominal", tmp_path)
+
+ loader = BazaarLoader(swh_storage, repo_url, directory=repo_url)
+ res = loader.load()
+ assert res == {"status": "eventful"}
+ loader = BazaarLoader(swh_storage, repo_url, directory=repo_url)
+ res = loader.load()
+ assert res == {"status": "uneventful"}
+
+
+def test_incremental_nominal(swh_storage, datadir, tmp_path):
+ """Check that an updated repository does update after the second run, but
+ is still a noop in the third run."""
+ archive_path = Path(datadir, "nominal.tgz")
+ repo_url = prepare_repository_from_archive(archive_path, "nominal", tmp_path)
+
+ # remove 2 latest commits
+ do_uncommit(repo_url)
+ do_uncommit(repo_url)
+
+ loader = BazaarLoader(swh_storage, repo_url, directory=repo_url)
+ res = loader.load()
+ assert res == {"status": "eventful"}
+ stats = get_stats(swh_storage)
+ assert stats == {
+ "content": 6,
+ "directory": 4,
+ "origin": 1,
+ "origin_visit": 1,
+ "release": 2,
+ "revision": 4,
+ "skipped_content": 0,
+ "snapshot": 1,
+ }
+
+ # Load the complete repo now
+ repo_url = prepare_repository_from_archive(archive_path, "nominal", tmp_path)
+
+ loader = BazaarLoader(swh_storage, repo_url, directory=repo_url)
+ res = loader.load()
+ assert res == {"status": "eventful"}
+
+ stats = get_stats(swh_storage)
+ expected_stats = {
+ "content": 7,
+ "directory": 7,
+ "origin": 1,
+ "origin_visit": 2,
+ "release": 3,
+ "revision": 6,
+ "skipped_content": 0,
+ "snapshot": 2,
+ }
+
+ assert stats == expected_stats
+
+ # Nothing should change
+ loader = BazaarLoader(swh_storage, repo_url, directory=repo_url)
+ res = loader.load()
+ assert res == {"status": "uneventful"}
+
+ stats = get_stats(swh_storage)
+ assert stats == {**expected_stats, "origin_visit": 2 + 1}
+
+
+def test_incremental_uncommitted_head(swh_storage, datadir, tmp_path):
+ """Check that doing an incremental run with the saved head missing does not
+ error out but instead loads everything correctly"""
+ archive_path = Path(datadir, "nominal.tgz")
+ repo_url = prepare_repository_from_archive(archive_path, "nominal", tmp_path)
+
+ loader = BazaarLoader(swh_storage, repo_url, directory=repo_url)
+ res = loader.load()
+ assert res == {"status": "eventful"}
+ stats = get_stats(swh_storage)
+ expected_stats = {
+ "content": 7,
+ "directory": 7,
+ "origin": 1,
+ "origin_visit": 1,
+ "release": 3,
+ "revision": 6,
+ "skipped_content": 0,
+ "snapshot": 1,
+ }
+
+ assert stats == expected_stats
+
+ # Remove the previously saved head
+ do_uncommit(repo_url)
+
+ loader = BazaarLoader(swh_storage, repo_url, directory=repo_url)
+ res = loader.load()
+ assert res == {"status": "eventful"}
+
+ # Everything is loaded correctly
+ stats = get_stats(swh_storage)
+ assert stats == {**expected_stats, "origin_visit": 1 + 1, "snapshot": 1 + 1}

File Metadata

Mime Type
text/plain
Expires
Dec 21 2024, 6:46 PM (11 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3222159

Event Timeline