Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F7124631
D7130.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
6 KB
Subscribers
None
D7130.diff
View Options
diff --git a/swh/loader/bzr/loader.py b/swh/loader/bzr/loader.py
--- a/swh/loader/bzr/loader.py
+++ b/swh/loader/bzr/loader.py
@@ -8,6 +8,7 @@
"""
from datetime import datetime
from functools import lru_cache, partial
+import itertools
import os
from tempfile import mkdtemp
from typing import Dict, Iterator, List, NewType, Optional, Set, TypeVar, Union
@@ -18,8 +19,10 @@
from breezy.bzr import bzrdir
from breezy.bzr.branch import Branch as BzrBranch
from breezy.bzr.inventory import Inventory, InventoryEntry
+from breezy.bzr.inventorytree import InventoryTreeChange
from breezy.revision import NULL_REVISION
from breezy.revision import Revision as BzrRevision
+from breezy.tree import Tree
from swh.loader.core.loader import BaseLoader
from swh.loader.core.utils import clean_dangling_folders, clone_with_timeout
@@ -128,6 +131,21 @@
return default
+def sort_changes(change: InventoryTreeChange) -> str:
+ """Key function for sorting the changes by path.
+
+ Sorting allows us to group the folders together (for example "b", then "a/a",
+ then "a/b"). Reversing this sort in the `sorted()` call will make it
+ so the files appear before the folder ("a/a", then "a") if the folder has
+ changed. This removes a bug where the order of operations is:
+
+ - "a" goes from directory to file, removing all of its subtree
+ - "a/a" is removed, but our structure has already forgotten it"""
+ source_path, target_path = change.path
+ # Neither path can be the empty string
+ return source_path or target_path
+
+
class BazaarLoader(BaseLoader):
"""Loads a Bazaar repository"""
@@ -156,6 +174,9 @@
self._last_root = BzrDirectory()
self._tags: Optional[Dict[bytes, BzrRevisionId]] = None
self._head_revision_id: Optional[bytes] = None
+ # Remember the previous revision to only compute the delta between
+ # revisions
+ self._prev_revision: Optional[BzrRevision] = None
self._branch: Optional[BzrBranch] = None
# Revisions that are pointed to, but don't exist in the current branch
# Rare, but exist usually for cross-VCS references.
@@ -211,6 +232,9 @@
}
def _set_recorded_state(self, latest_snapshot: Snapshot) -> None:
+ if not latest_snapshot.branches:
+ # Last snapshot was empty
+ return
head = latest_snapshot.branches[b"trunk"]
bzr_head = self._get_extids_for_targets([head.target])[0].extid
self._latest_head = BzrRevisionId(bzr_head)
@@ -405,10 +429,45 @@
)
def store_directories(self, bzr_rev: BzrRevision) -> Sha1Git:
+ """Store a revision's directories."""
repo: repository.Repository = self.repo
inventory: Inventory = repo.get_inventory(bzr_rev.revision_id)
- self._store_directories_slow(bzr_rev, inventory)
- return self._store_tree(inventory)
+ if self._prev_revision is None:
+ self._store_directories_slow(bzr_rev, inventory)
+ return self._store_tree(bzr_rev)
+
+ old_tree = self._get_revision_tree(self._prev_revision.revision_id)
+ new_tree = self._get_revision_tree(bzr_rev.revision_id)
+
+ delta = new_tree.changes_from(old_tree)
+
+ if delta.renamed or delta.copied:
+ # Figuring out all nested and possibly conflicting renames is a lot
+ # of effort for very few revisions, just go the slow way
+ self._store_directories_slow(bzr_rev, inventory)
+ return self._store_tree(bzr_rev)
+
+ to_remove = sorted(
+ delta.removed + delta.missing, key=sort_changes, reverse=True
+ )
+ for change in to_remove:
+ if change.kind[0] == "directory":
+ # empty directories will delete themselves in `self._last_root`
+ continue
+ path = change.path[0]
+ del self._last_root[path.encode()]
+
+ # `delta.kind_changed` needs to happen before `delta.added` since a file
+ # could be added under a node that changed from directory to file at the
+ # same time, for example
+ for change in itertools.chain(delta.kind_changed, delta.added, delta.modified):
+ path = change.path[1]
+ entry = inventory.get_entry(change.file_id)
+ content = self.store_content(bzr_rev, path, entry)
+ self._last_root[path.encode()] = content
+
+ self._prev_revision = bzr_rev
+ return self._store_tree(bzr_rev)
def store_release(self, name: bytes, target: Sha1Git) -> Sha1Git:
"""Store a release given its name and its target.
@@ -510,12 +569,14 @@
assert self.repo is not None
return self.repo.get_graph().iter_ancestry([rev.revision_id])
- @lru_cache()
- def _get_revision_tree(self, rev: BzrRevisionId):
+ # We want to cache at most the current revision and the last, no need to
+ # take cache more than this.
+ @lru_cache(maxsize=2)
+ def _get_revision_tree(self, rev: BzrRevisionId) -> Tree:
assert self.repo is not None
return self.repo.revision_tree(rev)
- def _store_tree(self, inventory: Inventory) -> Sha1Git:
+ def _store_tree(self, bzr_rev: BzrRevision) -> Sha1Git:
"""Save the current in-memory tree to storage."""
directories: List[from_disk.Directory] = [self._last_root]
while directories:
@@ -528,14 +589,17 @@
if isinstance(item, from_disk.Directory)
]
)
- self._prev_inventory = inventory
+ self._prev_revision = bzr_rev
return self._last_root.hash
def _store_directories_slow(self, bzr_rev: BzrRevision, inventory: Inventory):
- """Store a revision directories given its hg nodeid.
+ """Store a revision's directories.
This is the slow variant: it does not use a diff from the last revision
- but lists all the files. A future patch will introduce a faster version.
+ but lists all the files. It is used for the first revision of a load
+ (the null revision for a full run, the last recorded head for an
+ incremental one) or for cases where the headaches of figuring out the
+ delta from the breezy primitives is not worth it.
"""
# Don't reuse the last root, we're listing everything anyway, and we
# could be keeping around deleted files
diff --git a/swh/loader/bzr/tests/test_loader.py b/swh/loader/bzr/tests/test_loader.py
--- a/swh/loader/bzr/tests/test_loader.py
+++ b/swh/loader/bzr/tests/test_loader.py
@@ -163,6 +163,10 @@
res = BazaarLoader(swh_storage, repo_url, directory=repo_url).load()
assert res == {"status": "uneventful"}
+ # Empty snapshot does not bother the incremental code
+ res = BazaarLoader(swh_storage, repo_url, directory=repo_url).load()
+ assert res == {"status": "uneventful"}
+
def test_renames(swh_storage, datadir, tmp_path):
archive_path = Path(datadir, "renames.tgz")
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Dec 21 2024, 3:08 PM (11 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3218822
Attached To
D7130: Add fast-path to the bzr loader
Event Timeline
Log In to Comment