D5687.id20396.diff
Attached to D5687: Make the Mercurial loader incremental
diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py
--- a/swh/loader/mercurial/from_disk.py
+++ b/swh/loader/mercurial/from_disk.py
@@ -8,7 +8,7 @@
import os
from shutil import rmtree
from tempfile import mkdtemp
-from typing import Deque, Dict, List, Optional, Tuple, TypeVar, Union
+from typing import Deque, Dict, List, Optional, Set, Tuple, TypeVar, Union
from swh.loader.core.loader import BaseLoader
from swh.loader.core.utils import clean_dangling_folders
@@ -162,6 +162,10 @@
# hg node id of the latest snapshot branch heads
# used to find what are the new revisions since last snapshot
self._latest_heads: List[bytes] = []
+ # hg node ids of all the tags recorded on previous runs
+ # Used to compute which tags need to be added, even across incremental runs
+ # that might separate a changeset introducing a tag from its target.
+ self._saved_tags: Set[bytes] = set()
self._load_status = "eventful"
# If set, will override the default value
@@ -209,27 +213,41 @@
latest_snapshot = snapshot_get_latest(self.storage, self.origin_url)
if latest_snapshot:
- self._set_latest_heads(latest_snapshot)
+ self._set_recorded_state(latest_snapshot)
- def _set_latest_heads(self, latest_snapshot: Snapshot) -> None:
+ def _set_recorded_state(self, latest_snapshot: Snapshot) -> None:
"""
Looks up the nodeid for all revisions in the snapshot via extid_get_from_target,
- and adds them to self._latest_heads.
+ and adds them to `self._latest_heads`.
+ Also looks up the currently saved releases ("tags" in Mercurial speak).
+ All tags are recorded so they can be compared at the end of the run, whereas
+ for revisions only the latest heads are needed.
"""
- # TODO: add support for releases
- snapshot_branches = [
- branch.target
- for branch in latest_snapshot.branches.values()
- if branch.target_type == TargetType.REVISION
- ]
+ heads = []
+ tags = []
+
+ for branch in latest_snapshot.branches.values():
+ if branch.target_type == TargetType.REVISION:
+ heads.append(branch.target)
+ elif branch.target_type == TargetType.RELEASE:
+ tags.append(branch.target)
- # Get all ExtIDs for revisions in the latest snapshot
- extids = self.storage.extid_get_from_target(
- identifiers.ObjectType.REVISION, snapshot_branches
+ self._latest_heads.extend(
+ extid.extid for extid in self._get_extids_for_targets(heads)
+ )
+ self._saved_tags.update(
+ extid.extid for extid in self._get_extids_for_targets(tags)
)
- # Filter out extids not specific to Mercurial
- extids = [extid for extid in extids if extid.extid_type == EXTID_TYPE]
+ def _get_extids_for_targets(self, targets: List[bytes]) -> List[ExtID]:
+ # Get all Mercurial ExtIDs for the targets in the latest snapshot
+ extids = [
+ extid
+ for extid in self.storage.extid_get_from_target(
+ identifiers.ObjectType.REVISION, targets
+ )
+ if extid.extid_type == EXTID_TYPE
+ ]
if extids:
# Filter out dangling extids, we need to load their target again
@@ -241,9 +259,7 @@
for extid in extids
if extid.target.object_id not in revisions_missing
]
-
- # Add the found nodeids to self.latest_heads
- self._latest_heads.extend(extid.extid for extid in extids)
+ return extids
def fetch_data(self) -> bool:
"""Fetch the data from the source the loader is currently loading
@@ -290,13 +306,9 @@
# and not the heads themselves
new_revs = repo.revs("not ::(%ld)", existing_heads)
- # for now, reload all revisions if there are new commits
- # otherwise the loader will crash on missing parents
- # incremental loading will come in next commits
if new_revs:
- return repo.revs("all()")
- else:
- return new_revs
+ self.log.info("New revisions found: %d", len(new_revs))
+ return new_revs
else:
return repo.revs("all()")
@@ -326,25 +338,24 @@
hg_nodeid: name for name, hg_nodeid in hgutil.branches(repo).items()
}
tags_by_name: Dict[bytes, HgNodeId] = repo.tags()
- tags_by_hg_nodeid: Dict[HgNodeId, bytes] = {
- hg_nodeid: name for name, hg_nodeid in tags_by_name.items()
- }
snapshot_branches: Dict[bytes, SnapshotBranch] = {}
- extids = []
-
- for hg_nodeid, revision_sha1git in self._revision_nodeid_to_sha1git.items():
- tag_name = tags_by_hg_nodeid.get(hg_nodeid)
-
- # tip is listed in the tags by the mercurial api
- # but its not a tag defined by the user in `.hgtags`
- if tag_name and tag_name != b"tip":
+ for tag_name, hg_nodeid in tags_by_name.items():
+ if tag_name == b"tip":
+ # tip is listed in the tags by the Mercurial API,
+ # but it's not a tag defined by the user in `.hgtags`
+ continue
+ if hg_nodeid not in self._saved_tags:
+ revision_sha1git = self.get_revision_id_from_hg_nodeid(hg_nodeid)
snapshot_branches[tag_name] = SnapshotBranch(
target=self.store_release(tag_name, revision_sha1git),
target_type=TargetType.RELEASE,
)
+ extids = []
+
+ for hg_nodeid, revision_sha1git in self._revision_nodeid_to_sha1git.items():
if hg_nodeid in branch_by_hg_nodeid:
name = branch_by_hg_nodeid[hg_nodeid]
snapshot_branches[name] = SnapshotBranch(
@@ -403,7 +414,16 @@
Returns:
the sha1_git of the revision.
"""
- return self._revision_nodeid_to_sha1git[hg_nodeid]
+
+ from_cache = self._revision_nodeid_to_sha1git.get(hg_nodeid)
+ if from_cache is not None:
+ return from_cache
+ # The parent was not loaded in this run, get it from storage
+ from_storage = self.storage.extid_get_from_extid(EXTID_TYPE, ids=[hg_nodeid])
+
+ msg = "Expected 1 match from storage for hg node %r, got %d"
+ assert len(from_storage) == 1, msg % (hg_nodeid, len(from_storage))
+ return from_storage[0].target.object_id
def get_revision_parents(self, rev_ctx: hgutil.BaseContext) -> Tuple[Sha1Git, ...]:
"""Return the git sha1 of the parent revisions.
@@ -420,7 +440,8 @@
# nullid is the value of a parent that does not exist
if parent_hg_nodeid == hgutil.NULLID:
continue
- parents.append(self.get_revision_id_from_hg_nodeid(parent_hg_nodeid))
+ revision_id = self.get_revision_id_from_hg_nodeid(parent_hg_nodeid)
+ parents.append(revision_id)
return tuple(parents)
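
A note on the incremental selection above: in Mercurial's revset language, `::x` denotes `x` together with all of its ancestors, so `not ::(%ld)` is exactly the set of changesets that were not reachable from the previously recorded heads. The sketch below restates that query with the reading spelled out; it assumes a `repo` object from Mercurial's internal API and `existing_heads` as a list of local revision numbers, both names taken from the patch.

    # Everything already visited: the recorded heads and all of their ancestors.
    already_loaded = repo.revs("::(%ld)", existing_heads)
    # Its complement: only the changesets added since the last snapshot.
    new_revs = repo.revs("not ::(%ld)", existing_heads)

The same set can be inspected from the command line with `hg log -r "not ::(REV)"`, where `REV` stands for a recorded head.
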
diff --git a/swh/loader/mercurial/tests/test_from_disk.py b/swh/loader/mercurial/tests/test_from_disk.py
--- a/swh/loader/mercurial/tests/test_from_disk.py
+++ b/swh/loader/mercurial/tests/test_from_disk.py
@@ -2,10 +2,11 @@
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-
from datetime import datetime
from hashlib import sha1
import os
+from pathlib import Path
+import subprocess
import attr
import pytest
@@ -434,3 +435,52 @@
assert actual_load_status == {"status": "eventful"}
assert_last_visit_matches(swh_storage, repo_url, status="partial", type="hg")
+
+
+def hg_strip(repo: str, revset: str) -> None:
+ """Removes `revset` and all of their descendants from the local repository."""
+ # Previously called `hg strip`, it was renamed to `hg debugstrip` in Mercurial 5.7
+ # because it's most likely not what most users want to do (they should use some kind
+ # of history-rewriting tool like `histedit` or `prune`).
+ # But here, it's exactly what we want to do.
+ subprocess.check_call(["hg", "debugstrip", revset], cwd=repo)
+
+
+def test_load_repo_with_new_commits(swh_storage, datadir, tmp_path):
+ archive_name = "hello"
+ archive_path = Path(datadir, f"{archive_name}.tgz")
+ json_path = Path(datadir, f"{archive_name}.json")
+ repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
+
+ # first load with missing commits
+ hg_strip(repo_url.replace("file://", ""), "tip")
+ loader = HgLoaderFromDisk(swh_storage, repo_url)
+ assert loader.load() == {"status": "eventful"}
+ assert get_stats(loader.storage) == {
+ "content": 2,
+ "directory": 2,
+ "origin": 1,
+ "origin_visit": 1,
+ "release": 0,
+ "revision": 2,
+ "skipped_content": 0,
+ "snapshot": 1,
+ }
+
+ # second load with all commits
+ repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
+ loader = HgLoaderFromDisk(swh_storage, repo_url)
+ checker = LoaderChecker(loader=loader, expected=ExpectedSwhids.load(json_path))
+
+ checker.check()
+
+ assert get_stats(loader.storage) == {
+ "content": 3,
+ "directory": 3,
+ "origin": 1,
+ "origin_visit": 2,
+ "release": 1,
+ "revision": 3,
+ "skipped_content": 0,
+ "snapshot": 2,
+ }
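
Taken together, the new test simulates an incremental visit: it strips the newest changeset before the first load, then re-extracts the full archive so the second load finds one new changeset plus the tag pointing at it, which is why the stats go from 2 to 3 revisions and from 0 to 1 release across the two snapshots. A condensed sketch of that flow, reusing the helper and fixture names from the test above:

    # First visit on a truncated repository: `hg_strip` drops `tip`, so only
    # two revisions (and no release) are recorded in the first snapshot.
    repo_url = prepare_repository_from_archive(archive_path, "hello", tmp_path)
    hg_strip(repo_url.replace("file://", ""), "tip")
    HgLoaderFromDisk(swh_storage, repo_url).load()

    # Second visit on the full repository: only the previously stripped
    # changeset, and the release built from its tag, should be new.
    repo_url = prepare_repository_from_archive(archive_path, "hello", tmp_path)
    HgLoaderFromDisk(swh_storage, repo_url).load()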