Page MenuHomeSoftware Heritage

D5687.id20396.diff
No OneTemporary

D5687.id20396.diff

diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py
--- a/swh/loader/mercurial/from_disk.py
+++ b/swh/loader/mercurial/from_disk.py
@@ -8,7 +8,7 @@
import os
from shutil import rmtree
from tempfile import mkdtemp
-from typing import Deque, Dict, List, Optional, Tuple, TypeVar, Union
+from typing import Deque, Dict, List, Optional, Set, Tuple, TypeVar, Union
from swh.loader.core.loader import BaseLoader
from swh.loader.core.utils import clean_dangling_folders
@@ -162,6 +162,10 @@
# hg node id of the latest snapshot branch heads
# used to find what are the new revisions since last snapshot
self._latest_heads: List[bytes] = []
+ # hg node ids of all the tags recorded on previous runs
+ # Used to compute which tags need to be added, even across incremental runs
+ # that might separate a changeset introducing a tag from its target.
+ self._saved_tags: Set[bytes] = set()
self._load_status = "eventful"
# If set, will override the default value
@@ -209,27 +213,41 @@
latest_snapshot = snapshot_get_latest(self.storage, self.origin_url)
if latest_snapshot:
- self._set_latest_heads(latest_snapshot)
+ self._set_recorded_state(latest_snapshot)
- def _set_latest_heads(self, latest_snapshot: Snapshot) -> None:
+ def _set_recorded_state(self, latest_snapshot: Snapshot) -> None:
"""
Looks up the nodeid for all revisions in the snapshot via extid_get_from_target,
- and adds them to self._latest_heads.
+ and adds them to `self._latest_heads`.
+ Also looks up the currently saved releases ("tags" in Mercurial speak).
+ The tags are all listed for easy comparison at the end, while only the latest
+ heads are needed for revisions.
"""
- # TODO: add support for releases
- snapshot_branches = [
- branch.target
- for branch in latest_snapshot.branches.values()
- if branch.target_type == TargetType.REVISION
- ]
+ heads = []
+ tags = []
+
+ for branch in latest_snapshot.branches.values():
+ if branch.target_type == TargetType.REVISION:
+ heads.append(branch.target)
+ elif branch.target_type == TargetType.RELEASE:
+ tags.append(branch.target)
- # Get all ExtIDs for revisions in the latest snapshot
- extids = self.storage.extid_get_from_target(
- identifiers.ObjectType.REVISION, snapshot_branches
+ self._latest_heads.extend(
+ extid.extid for extid in self._get_extids_for_targets(heads)
+ )
+ self._saved_tags.update(
+ extid.extid for extid in self._get_extids_for_targets(tags)
)
- # Filter out extids not specific to Mercurial
- extids = [extid for extid in extids if extid.extid_type == EXTID_TYPE]
+ def _get_extids_for_targets(self, targets: List[bytes]) -> List[ExtID]:
+ # Get all Mercurial ExtIDs for the targets in the latest snapshot
+ extids = [
+ extid
+ for extid in self.storage.extid_get_from_target(
+ identifiers.ObjectType.REVISION, targets
+ )
+ if extid.extid_type == EXTID_TYPE
+ ]
if extids:
# Filter out dangling extids, we need to load their target again
@@ -241,9 +259,7 @@
for extid in extids
if extid.target.object_id not in revisions_missing
]
-
- # Add the found nodeids to self.latest_heads
- self._latest_heads.extend(extid.extid for extid in extids)
+ return extids
def fetch_data(self) -> bool:
"""Fetch the data from the source the loader is currently loading
@@ -290,13 +306,9 @@
# and not the heads themselves
new_revs = repo.revs("not ::(%ld)", existing_heads)
- # for now, reload all revisions if there are new commits
- # otherwise the loader will crash on missing parents
- # incremental loading will come in next commits
if new_revs:
- return repo.revs("all()")
- else:
- return new_revs
+ self.log.info("New revisions found: %d", len(new_revs))
+ return new_revs
else:
return repo.revs("all()")
@@ -326,25 +338,24 @@
hg_nodeid: name for name, hg_nodeid in hgutil.branches(repo).items()
}
tags_by_name: Dict[bytes, HgNodeId] = repo.tags()
- tags_by_hg_nodeid: Dict[HgNodeId, bytes] = {
- hg_nodeid: name for name, hg_nodeid in tags_by_name.items()
- }
snapshot_branches: Dict[bytes, SnapshotBranch] = {}
- extids = []
-
- for hg_nodeid, revision_sha1git in self._revision_nodeid_to_sha1git.items():
- tag_name = tags_by_hg_nodeid.get(hg_nodeid)
-
- # tip is listed in the tags by the mercurial api
- # but its not a tag defined by the user in `.hgtags`
- if tag_name and tag_name != b"tip":
+ for tag_name, hg_nodeid in tags_by_name.items():
+ if tag_name == b"tip":
+            # tip is listed in the tags by the mercurial api
+            # but it's not a tag defined by the user in `.hgtags`
+ continue
+ if hg_nodeid not in self._saved_tags:
+ revision_sha1git = self.get_revision_id_from_hg_nodeid(hg_nodeid)
snapshot_branches[tag_name] = SnapshotBranch(
target=self.store_release(tag_name, revision_sha1git),
target_type=TargetType.RELEASE,
)
+ extids = []
+
+ for hg_nodeid, revision_sha1git in self._revision_nodeid_to_sha1git.items():
if hg_nodeid in branch_by_hg_nodeid:
name = branch_by_hg_nodeid[hg_nodeid]
snapshot_branches[name] = SnapshotBranch(
@@ -403,7 +414,16 @@
Returns:
the sha1_git of the revision.
"""
- return self._revision_nodeid_to_sha1git[hg_nodeid]
+
+ from_cache = self._revision_nodeid_to_sha1git.get(hg_nodeid)
+ if from_cache is not None:
+ return from_cache
+ # The parent was not loaded in this run, get it from storage
+ from_storage = self.storage.extid_get_from_extid(EXTID_TYPE, ids=[hg_nodeid])
+
+ msg = "Expected 1 match from storage for hg node %r, got %d"
+ assert len(from_storage) == 1, msg % (hg_nodeid, len(from_storage))
+ return from_storage[0].target.object_id
def get_revision_parents(self, rev_ctx: hgutil.BaseContext) -> Tuple[Sha1Git, ...]:
"""Return the git sha1 of the parent revisions.
@@ -420,7 +440,8 @@
# nullid is the value of a parent that does not exist
if parent_hg_nodeid == hgutil.NULLID:
continue
- parents.append(self.get_revision_id_from_hg_nodeid(parent_hg_nodeid))
+ revision_id = self.get_revision_id_from_hg_nodeid(parent_hg_nodeid)
+ parents.append(revision_id)
return tuple(parents)
diff --git a/swh/loader/mercurial/tests/test_from_disk.py b/swh/loader/mercurial/tests/test_from_disk.py
--- a/swh/loader/mercurial/tests/test_from_disk.py
+++ b/swh/loader/mercurial/tests/test_from_disk.py
@@ -2,10 +2,11 @@
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-
from datetime import datetime
from hashlib import sha1
import os
+from pathlib import Path
+import subprocess
import attr
import pytest
@@ -434,3 +435,52 @@
assert actual_load_status == {"status": "eventful"}
assert_last_visit_matches(swh_storage, repo_url, status="partial", type="hg")
+
+
+def hg_strip(repo: str, revset: str) -> None:
+ """Removes `revset` and all of their descendants from the local repository."""
+ # Previously called `hg strip`, it was renamed to `hg debugstrip` in Mercurial 5.7
+ # because it's most likely not what most users want to do (they should use some kind
+ # of history-rewriting tool like `histedit` or `prune`).
+ # But here, it's exactly what we want to do.
+ subprocess.check_call(["hg", "debugstrip", revset], cwd=repo)
+
+
+def test_load_repo_with_new_commits(swh_storage, datadir, tmp_path):
+ archive_name = "hello"
+ archive_path = Path(datadir, f"{archive_name}.tgz")
+ json_path = Path(datadir, f"{archive_name}.json")
+ repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
+
+ # first load with missing commits
+ hg_strip(repo_url.replace("file://", ""), "tip")
+ loader = HgLoaderFromDisk(swh_storage, repo_url)
+ assert loader.load() == {"status": "eventful"}
+ assert get_stats(loader.storage) == {
+ "content": 2,
+ "directory": 2,
+ "origin": 1,
+ "origin_visit": 1,
+ "release": 0,
+ "revision": 2,
+ "skipped_content": 0,
+ "snapshot": 1,
+ }
+
+ # second load with all commits
+ repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
+ loader = HgLoaderFromDisk(swh_storage, repo_url)
+ checker = LoaderChecker(loader=loader, expected=ExpectedSwhids.load(json_path),)
+
+ checker.check()
+
+ assert get_stats(loader.storage) == {
+ "content": 3,
+ "directory": 3,
+ "origin": 1,
+ "origin_visit": 2,
+ "release": 1,
+ "revision": 3,
+ "skipped_content": 0,
+ "snapshot": 2,
+ }

File Metadata

Mime Type
text/plain
Expires
Wed, Sep 17, 4:37 PM (7 h, 21 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3223058

Event Timeline