Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7124765
D7059.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
9 KB
Subscribers
None
D7059.diff
View Options
diff --git a/swh/loader/bzr/loader.py b/swh/loader/bzr/loader.py
--- a/swh/loader/bzr/loader.py
+++ b/swh/loader/bzr/loader.py
@@ -23,7 +23,7 @@
from swh.loader.core.loader import BaseLoader
from swh.loader.core.utils import clean_dangling_folders, clone_with_timeout
-from swh.model import from_disk
+from swh.model import from_disk, swhids
from swh.model.model import (
Content,
ExtID,
@@ -40,6 +40,7 @@
Timestamp,
TimestampWithTimezone,
)
+from swh.storage.algos.snapshot import snapshot_get_latest
from swh.storage.interface import StorageInterface
TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.bzr.from_disk"
@@ -159,6 +160,9 @@
# Revisions that are pointed to, but don't exist in the current branch
# Rare, but exist usually for cross-VCS references.
self._ghosts: Set[BzrRevisionId] = set()
+ # Exists if in an incremental run, is the latest saved revision from
+ # this origin
+ self._latest_head: Optional[BzrRevisionId] = None
self._load_status = "eventful"
self.origin_url = url
@@ -189,6 +193,9 @@
"""Second step executed by the loader to prepare some state needed by
the loader.
"""
+ latest_snapshot = snapshot_get_latest(self.storage, self.origin_url)
+ if latest_snapshot:
+ self._set_recorded_state(latest_snapshot)
def load_status(self) -> Dict[str, str]:
"""Detailed loading status.
@@ -203,6 +210,37 @@
"status": self._load_status,
}
+ def _set_recorded_state(self, latest_snapshot: Snapshot) -> None:
+ head = latest_snapshot.branches[b"trunk"]
+ bzr_head = self._get_extids_for_targets([head.target])[0].extid
+ self._latest_head = BzrRevisionId(bzr_head)
+
+ def _get_extids_for_targets(self, targets: List[Sha1Git]) -> List[ExtID]:
+ """Get all Bzr ExtIDs for the targets in the latest snapshot"""
+ extids = []
+ for extid in self.storage.extid_get_from_target(
+ swhids.ObjectType.REVISION,
+ targets,
+ extid_type=EXTID_TYPE,
+ extid_version=EXTID_VERSION,
+ ):
+ extids.append(extid)
+ self._revision_id_to_sha1git[
+ BzrRevisionId(extid.extid)
+ ] = extid.target.object_id
+
+ if extids:
+ # Filter out dangling extids, we need to load their target again
+ revisions_missing = self.storage.revision_missing(
+ [extid.target.object_id for extid in extids]
+ )
+ extids = [
+ extid
+ for extid in extids
+ if extid.target.object_id not in revisions_missing
+ ]
+ return extids
+
def cleanup(self) -> None:
if self.repo is not None:
self.repo.unlock()
@@ -438,7 +476,28 @@
ancestry.append((rev, parents))
sorter = tsort.TopoSorter(ancestry)
- return sorter.sorted()
+ all_revisions = sorter.sorted()
+ if self._latest_head is not None:
+ # Breezy does not offer a generic querying system, so we do the
+ # filtering ourselves, which is simple enough given that bzr does
+ # not have multiple heads per branch
+ found = False
+ new_revisions = []
+ # Filter out revisions until we reach the one we've already seen
+ for rev in all_revisions:
+ if not found:
+ if rev == self._latest_head:
+ found = True
+ else:
+ new_revisions.append(rev)
+ if not found and all_revisions:
+ # The previously saved head has been uncommitted, reload
+ # everything
+ msg = "Previous head (%s) not found, loading all revisions"
+ self.log.debug(msg, self._latest_head)
+ return all_revisions
+ return new_revisions
+ return all_revisions
def _iterate_ancestors(self, rev: BzrRevision) -> Iterator[BzrRevisionId]:
"""Return an iterator of this revision's ancestors"""
@@ -503,7 +562,18 @@
def _get_revision_id_from_bzr_id(self, bzr_id: BzrRevisionId) -> Sha1Git:
"""Return the git sha1 of a revision given its bazaar revision id."""
- return self._revision_id_to_sha1git[bzr_id]
+ from_cache = self._revision_id_to_sha1git.get(bzr_id)
+ if from_cache is not None:
+ return from_cache
+ # The parent was not loaded in this run, get it from storage
+ from_storage = self.storage.extid_get_from_extid(
+ EXTID_TYPE, ids=[bzr_id], version=EXTID_VERSION
+ )
+
+ if len(from_storage) != 1:
+ msg = "Expected 1 match from storage for bzr node %r, got %d"
+ raise LookupError(msg % (bzr_id.hex(), len(from_storage)))
+ return from_storage[0].target.object_id
@property
def branch(self) -> BzrBranch:
diff --git a/swh/loader/bzr/tests/test_loader.py b/swh/loader/bzr/tests/test_loader.py
--- a/swh/loader/bzr/tests/test_loader.py
+++ b/swh/loader/bzr/tests/test_loader.py
@@ -2,8 +2,10 @@
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
+import os
from pathlib import Path
+from breezy.builtins import cmd_uncommit
import pytest
import swh.loader.bzr.loader as loader_mod
@@ -47,6 +49,14 @@
# - No match from storage (wrong topo sort or broken rev)
+def do_uncommit(repo_url):
+ """Remove the latest revision from the given bzr repo"""
+ uncommit_cmd = cmd_uncommit()
+ with open(os.devnull, "w") as f:
+ uncommit_cmd.outf = f
+ uncommit_cmd.run(repo_url)
+
+
@pytest.mark.parametrize("do_clone", [False, True])
def test_nominal(swh_storage, datadir, tmp_path, do_clone):
archive_path = Path(datadir, "nominal.tgz")
@@ -295,3 +305,106 @@
directory[b"a/decently"]
directory[b"a"]
directory[b"another_node"]
+
+
+def test_incremental_noop(swh_storage, datadir, tmp_path):
+ """Check that nothing happens if we try to load a repo twice in a row"""
+ archive_path = Path(datadir, "nominal.tgz")
+ repo_url = prepare_repository_from_archive(archive_path, "nominal", tmp_path)
+
+ loader = BazaarLoader(swh_storage, repo_url, directory=repo_url)
+ res = loader.load()
+ assert res == {"status": "eventful"}
+ loader = BazaarLoader(swh_storage, repo_url, directory=repo_url)
+ res = loader.load()
+ assert res == {"status": "uneventful"}
+
+
+def test_incremental_nominal(swh_storage, datadir, tmp_path):
+ """Check that an updated repository does update after the second run, but
+ is still a noop in the third run."""
+ archive_path = Path(datadir, "nominal.tgz")
+ repo_url = prepare_repository_from_archive(archive_path, "nominal", tmp_path)
+
+ # remove 2 latest commits
+ do_uncommit(repo_url)
+ do_uncommit(repo_url)
+
+ loader = BazaarLoader(swh_storage, repo_url, directory=repo_url)
+ res = loader.load()
+ assert res == {"status": "eventful"}
+ stats = get_stats(swh_storage)
+ assert stats == {
+ "content": 6,
+ "directory": 4,
+ "origin": 1,
+ "origin_visit": 1,
+ "release": 2,
+ "revision": 4,
+ "skipped_content": 0,
+ "snapshot": 1,
+ }
+
+ # Load the complete repo now
+ repo_url = prepare_repository_from_archive(archive_path, "nominal", tmp_path)
+
+ loader = BazaarLoader(swh_storage, repo_url, directory=repo_url)
+ res = loader.load()
+ assert res == {"status": "eventful"}
+
+ stats = get_stats(swh_storage)
+ expected_stats = {
+ "content": 7,
+ "directory": 7,
+ "origin": 1,
+ "origin_visit": 2,
+ "release": 3,
+ "revision": 6,
+ "skipped_content": 0,
+ "snapshot": 2,
+ }
+
+ assert stats == expected_stats
+
+ # Nothing should change
+ loader = BazaarLoader(swh_storage, repo_url, directory=repo_url)
+ res = loader.load()
+ assert res == {"status": "uneventful"}
+
+ stats = get_stats(swh_storage)
+ assert stats == {**expected_stats, "origin_visit": 2 + 1}
+
+
+def test_incremental_uncommitted_head(swh_storage, datadir, tmp_path):
+ """Check that doing an incremental run with the saved head missing does not
+ error out but instead loads everything correctly"""
+ archive_path = Path(datadir, "nominal.tgz")
+ repo_url = prepare_repository_from_archive(archive_path, "nominal", tmp_path)
+
+ loader = BazaarLoader(swh_storage, repo_url, directory=repo_url)
+ res = loader.load()
+ assert res == {"status": "eventful"}
+ stats = get_stats(swh_storage)
+ expected_stats = {
+ "content": 7,
+ "directory": 7,
+ "origin": 1,
+ "origin_visit": 1,
+ "release": 3,
+ "revision": 6,
+ "skipped_content": 0,
+ "snapshot": 1,
+ }
+
+ assert stats == expected_stats
+
+ # Remove the previously saved head
+ do_uncommit(repo_url)
+
+ loader = BazaarLoader(swh_storage, repo_url, directory=repo_url)
+ res = loader.load()
+ assert res == {"status": "eventful"}
+
+ # Everything is loaded correctly
+ stats = get_stats(swh_storage)
+ assert stats == {**expected_stats, "origin_visit": 1 + 1, "snapshot": 1 + 1}
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Dec 21 2024, 6:46 PM (11 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3222159
Attached To
D7059: Add incremental support for the bzr loader
Event Timeline
Log In to Comment