diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py index 5c81c35..27ad94b 100644 --- a/swh/loader/mercurial/from_disk.py +++ b/swh/loader/mercurial/from_disk.py @@ -1,676 +1,697 @@ # Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import deque from datetime import datetime import os from shutil import rmtree from tempfile import mkdtemp -from typing import Deque, Dict, List, Optional, Tuple, TypeVar, Union +from typing import Deque, Dict, List, Optional, Set, Tuple, TypeVar, Union from swh.loader.core.loader import BaseLoader from swh.loader.core.utils import clean_dangling_folders from swh.loader.mercurial.utils import get_minimum_env, parse_visit_date from swh.model import identifiers from swh.model.from_disk import Content, DentryPerms, Directory from swh.model.hashutil import hash_to_bytehex from swh.model.model import ( ExtID, ObjectType, Origin, Person, Release, Revision, RevisionType, Sha1Git, Snapshot, SnapshotBranch, TargetType, TimestampWithTimezone, ) from swh.model.model import Content as ModelContent from swh.storage.algos.snapshot import snapshot_get_latest from swh.storage.interface import StorageInterface from . import hgutil from .archive_extract import tmp_extract from .hgutil import HgFilteredSet, HgNodeId, HgSpanSet FLAG_PERMS = { b"l": DentryPerms.symlink, b"x": DentryPerms.executable_content, b"": DentryPerms.content, } # type: Dict[bytes, DentryPerms] TEMPORARY_DIR_PREFIX_PATTERN = "swh.loader.mercurial.from_disk" EXTID_TYPE = "hg-nodeid" T = TypeVar("T") class CorruptedRevision(ValueError): """Raised when a revision is corrupted.""" def __init__(self, hg_nodeid: HgNodeId) -> None: super().__init__(hg_nodeid.hex()) self.hg_nodeid = hg_nodeid class HgDirectory(Directory): """A more practical directory. - creates missing parent directories - removes empty directories """ def __setitem__(self, path: bytes, value: Union[Content, "HgDirectory"]) -> None: if b"/" in path: head, tail = path.split(b"/", 1) directory = self.get(head) if directory is None or isinstance(directory, Content): directory = HgDirectory() self[head] = directory directory[tail] = value else: super().__setitem__(path, value) def __delitem__(self, path: bytes) -> None: super().__delitem__(path) while b"/" in path: # remove empty parent directories path = path.rsplit(b"/", 1)[0] if len(self[path]) == 0: super().__delitem__(path) else: break def get( self, path: bytes, default: Optional[T] = None ) -> Optional[Union[Content, "HgDirectory", T]]: # TODO move to swh.model.from_disk.Directory try: return self[path] except KeyError: return default class HgLoaderFromDisk(BaseLoader): """Load a mercurial repository from a local directory.""" CONFIG_BASE_FILENAME = "loader/mercurial" visit_type = "hg" def __init__( self, storage: StorageInterface, url: str, directory: Optional[str] = None, logging_class: str = "swh.loader.mercurial.LoaderFromDisk", visit_date: Optional[datetime] = None, temp_directory: str = "/tmp", clone_timeout_seconds: int = 7200, content_cache_size: int = 10_000, max_content_size: Optional[int] = None, ): """Initialize the loader. Args: url: url of the repository. directory: directory of the local repository. logging_class: class of the loader logger.
visit_date: visit date of the repository temp_directory: directory in which temporary clones are created clone_timeout_seconds: timeout in seconds for the clone operation content_cache_size: size of the LRU cache of content hashes max_content_size: contents larger than this size are skipped """ super().__init__( storage=storage, logging_class=logging_class, max_content_size=max_content_size, ) self._temp_directory = temp_directory self._clone_timeout = clone_timeout_seconds self.origin_url = url self.visit_date = visit_date self.directory = directory self._repo: Optional[hgutil.Repository] = None self._revision_nodeid_to_sha1git: Dict[HgNodeId, Sha1Git] = {} self._repo_directory: Optional[str] = None # keeps the last processed hg nodeid # it is used for differential tree update by store_directories # NULLID is the parent of the first revision self._last_hg_nodeid = hgutil.NULLID # keeps the last revision tree # it is used for differential tree update by store_directories self._last_root = HgDirectory() # Cache the content hash across revisions to avoid recalculation. self._content_hash_cache: hgutil.LRUCacheDict = hgutil.LRUCacheDict( content_cache_size, ) # hg node id of the latest snapshot branch heads # used to find the new revisions since the last snapshot self._latest_heads: List[bytes] = [] + # hg node ids of all the tags recorded on previous runs + # Used to compute which tags need to be added, even across incremental runs + # that might separate a changeset introducing a tag from its target. + self._saved_tags: Set[bytes] = set() self._load_status = "eventful" # If set, will override the default value self._visit_status = None def pre_cleanup(self) -> None: """As a first step, will try and check for dangling data to clean up. This should do its best to avoid raising issues. """ clean_dangling_folders( self._temp_directory, pattern_check=TEMPORARY_DIR_PREFIX_PATTERN, log=self.log, ) self.old_environ = os.environ.copy() os.environ.clear() os.environ.update(get_minimum_env()) def cleanup(self) -> None: """Last step executed by the loader.""" os.environ.clear() os.environ.update(self.old_environ) if self._repo_directory and os.path.exists(self._repo_directory): self.log.debug(f"Cleaning up repository {self._repo_directory}") rmtree(self._repo_directory) def prepare_origin_visit(self) -> None: """First step executed by the loader to prepare origin and visit references. Set/update self.origin, and optionally self.origin_url, self.visit_date. """ self.origin = Origin(url=self.origin_url) def prepare(self) -> None: """Second step executed by the loader to prepare some state needed by the loader. """ # Set here to allow multiple calls to load on the same loader instance self._latest_heads = [] latest_snapshot = snapshot_get_latest(self.storage, self.origin_url) if latest_snapshot: - self._set_latest_heads(latest_snapshot) + self._set_recorded_state(latest_snapshot) - def _set_latest_heads(self, latest_snapshot: Snapshot) -> None: + def _set_recorded_state(self, latest_snapshot: Snapshot) -> None: """ Looks up the nodeid for all revisions in the snapshot via extid_get_from_target, - and adds them to self._latest_heads. + and adds them to `self._latest_heads`. + Also looks up the currently saved releases ("tags" in Mercurial speak). + The tags are all listed for easy comparison at the end, while only the latest + heads are needed for revisions.
""" - # TODO: add support for releases - snapshot_branches = [ - branch.target - for branch in latest_snapshot.branches.values() - if branch.target_type == TargetType.REVISION - ] + heads = [] + tags = [] + + for branch in latest_snapshot.branches.values(): + if branch.target_type == TargetType.REVISION: + heads.append(branch.target) + elif branch.target_type == TargetType.RELEASE: + tags.append(branch.target) - # Get all ExtIDs for revisions in the latest snapshot - extids = self.storage.extid_get_from_target( - identifiers.ObjectType.REVISION, snapshot_branches + self._latest_heads.extend( + extid.extid for extid in self._get_extids_for_targets(heads) + ) + self._saved_tags.update( + extid.extid for extid in self._get_extids_for_targets(tags) ) - # Filter out extids not specific to Mercurial - extids = [extid for extid in extids if extid.extid_type == EXTID_TYPE] + def _get_extids_for_targets(self, targets: List[bytes]) -> List[ExtID]: + # Get all Mercurial ExtIDs for the targets in the latest snapshot + extids = [ + extid + for extid in self.storage.extid_get_from_target( + identifiers.ObjectType.REVISION, targets + ) + if extid.extid_type == EXTID_TYPE + ] if extids: # Filter out dangling extids, we need to load their target again revisions_missing = self.storage.revision_missing( [extid.target.object_id for extid in extids] ) extids = [ extid for extid in extids if extid.target.object_id not in revisions_missing ] - - # Add the found nodeids to self.latest_heads - self._latest_heads.extend(extid.extid for extid in extids) + return extids def fetch_data(self) -> bool: """Fetch the data from the source the loader is currently loading Returns: a value that is interpreted as a boolean. If True, fetch_data needs to be called again to complete loading. """ if not self.directory: # no local repository self._repo_directory = mkdtemp( prefix=TEMPORARY_DIR_PREFIX_PATTERN, suffix=f"-{os.getpid()}", dir=self._temp_directory, ) self.log.debug( f"Cloning {self.origin_url} to {self.directory} " f"with timeout {self._clone_timeout} seconds" ) hgutil.clone(self.origin_url, self._repo_directory, self._clone_timeout) else: # existing local repository # Allow to load on disk repository without cloning # for testing purpose. 
self._repo_directory = self.directory self._repo = hgutil.repository(self._repo_directory) return False def get_hg_revs_to_load(self) -> Union[HgFilteredSet, HgSpanSet]: """Return the hg revision numbers to load.""" assert self._repo is not None repo: hgutil.Repository = self._repo if self._latest_heads: existing_heads = [] # heads that still exist in the repository for hg_nodeid in self._latest_heads: try: rev = repo[hg_nodeid].rev() existing_heads.append(rev) except KeyError: # the node does not exist anymore pass # select revisions that are not ancestors of heads # and not the heads themselves new_revs = repo.revs("not ::(%ld)", existing_heads) - # for now, reload all revisions if there are new commits - # otherwise the loader will crash on missing parents - # incremental loading will come in next commits if new_revs: - return repo.revs("all()") - else: - return new_revs + self.log.info("New revisions found: %d", len(new_revs)) + return new_revs else: return repo.revs("all()") def store_data(self): """Store fetched data in the database.""" revs = self.get_hg_revs_to_load() if not revs: self._load_status = "uneventful" return assert self._repo is not None repo = self._repo blacklisted_revs: List[int] = [] for rev in revs: if rev in blacklisted_revs: continue try: self.store_revision(repo[rev]) except CorruptedRevision as e: self._visit_status = "partial" self.log.warning("Corrupted revision %s", e) descendents = repo.revs("(%ld)::", [rev]) blacklisted_revs.extend(descendents) branch_by_hg_nodeid: Dict[HgNodeId, bytes] = { hg_nodeid: name for name, hg_nodeid in hgutil.branches(repo).items() } tags_by_name: Dict[bytes, HgNodeId] = repo.tags() - tags_by_hg_nodeid: Dict[HgNodeId, bytes] = { - hg_nodeid: name for name, hg_nodeid in tags_by_name.items() - } snapshot_branches: Dict[bytes, SnapshotBranch] = {} - extids = [] - - for hg_nodeid, revision_sha1git in self._revision_nodeid_to_sha1git.items(): - tag_name = tags_by_hg_nodeid.get(hg_nodeid) - - # tip is listed in the tags by the mercurial api - # but its not a tag defined by the user in `.hgtags` - if tag_name and tag_name != b"tip": + for tag_name, hg_nodeid in tags_by_name.items(): + if tag_name == b"tip": + # tip is listed in the tags by the mercurial api + # but it's not a tag defined by the user in `.hgtags` + continue + if hg_nodeid not in self._saved_tags: + revision_sha1git = self.get_revision_id_from_hg_nodeid(hg_nodeid) snapshot_branches[tag_name] = SnapshotBranch( target=self.store_release(tag_name, revision_sha1git), target_type=TargetType.RELEASE, ) + extids = [] + + for hg_nodeid, revision_sha1git in self._revision_nodeid_to_sha1git.items(): if hg_nodeid in branch_by_hg_nodeid: name = branch_by_hg_nodeid[hg_nodeid] snapshot_branches[name] = SnapshotBranch( target=revision_sha1git, target_type=TargetType.REVISION, ) # The tip is mapped to `HEAD` to match # the historical implementation if hg_nodeid == tags_by_name[b"tip"]: snapshot_branches[b"HEAD"] = SnapshotBranch( target=name, target_type=TargetType.ALIAS, ) if hg_nodeid not in self._latest_heads: revision_swhid = identifiers.CoreSWHID( object_type=identifiers.ObjectType.REVISION, object_id=revision_sha1git, ) extids.append( ExtID(extid_type=EXTID_TYPE, extid=hg_nodeid, target=revision_swhid) ) snapshot = Snapshot(branches=snapshot_branches) self.storage.snapshot_add([snapshot]) self.storage.extid_add(extids) self.flush() self.loaded_snapshot_id = snapshot.id def load_status(self) -> Dict[str, str]: """Detailed loading status. Defaults to logging an eventful load.
Returns: a dictionary that is eventually passed back as the task's result to the scheduler, allowing tuning of the task recurrence mechanism. """ return { "status": self._load_status, } def visit_status(self) -> str: """Allow overriding the visit status in case of partial load""" if self._visit_status is not None: return self._visit_status return super().visit_status() def get_revision_id_from_hg_nodeid(self, hg_nodeid: HgNodeId) -> Sha1Git: """Return the git sha1 of a revision given its hg nodeid. Args: hg_nodeid: the hg nodeid of the revision. Returns: the sha1_git of the revision. """ - return self._revision_nodeid_to_sha1git[hg_nodeid] + + from_cache = self._revision_nodeid_to_sha1git.get(hg_nodeid) + if from_cache is not None: + return from_cache + # The parent was not loaded in this run, get it from storage + from_storage = self.storage.extid_get_from_extid(EXTID_TYPE, ids=[hg_nodeid]) + + msg = "Expected 1 match from storage for hg node %r, got %d" + assert len(from_storage) == 1, msg % (hg_nodeid, len(from_storage)) + return from_storage[0].target.object_id def get_revision_parents(self, rev_ctx: hgutil.BaseContext) -> Tuple[Sha1Git, ...]: """Return the git sha1 of the parent revisions. Args: rev_ctx: the revision context. Returns: the sha1_git of the parent revisions. """ parents = [] for parent_ctx in rev_ctx.parents(): parent_hg_nodeid = parent_ctx.node() # nullid is the value of a parent that does not exist if parent_hg_nodeid == hgutil.NULLID: continue - parents.append(self.get_revision_id_from_hg_nodeid(parent_hg_nodeid)) + revision_id = self.get_revision_id_from_hg_nodeid(parent_hg_nodeid) + parents.append(revision_id) return tuple(parents) def store_revision(self, rev_ctx: hgutil.BaseContext) -> None: """Store a revision given its revision context. Args: rev_ctx: the revision context. """ hg_nodeid = rev_ctx.node() root_sha1git = self.store_directories(rev_ctx) # `Person.from_fullname` is compatible with mercurial's freeform author # as fullname is what is used in revision hash when available. author = Person.from_fullname(rev_ctx.user()) (timestamp, offset) = rev_ctx.date() # TimestampWithTimezone.from_dict will change name # as it accepts more than just dicts rev_date = TimestampWithTimezone.from_dict(int(timestamp)) extra_headers = [ (b"time_offset_seconds", str(offset).encode(),), ] for key, value in rev_ctx.extra().items(): # The default branch is skipped to match # the historical implementation if key == b"branch" and value == b"default": continue # transplant_source is converted to match # the historical implementation if key == b"transplant_source": value = hash_to_bytehex(value) extra_headers.append((key, value)) revision = Revision( author=author, date=rev_date, committer=author, committer_date=rev_date, type=RevisionType.MERCURIAL, directory=root_sha1git, message=rev_ctx.description(), extra_headers=tuple(extra_headers), synthetic=False, parents=self.get_revision_parents(rev_ctx), ) self._revision_nodeid_to_sha1git[hg_nodeid] = revision.id self.storage.revision_add([revision]) def store_release(self, name: bytes, target: Sha1Git) -> Sha1Git: """Store a release given its name and its target. A release corresponds to a user-defined tag in Mercurial. The Mercurial API has a `tip` tag that must be ignored. Args: name: name of the release. target: sha1_git of the target revision. Returns: the sha1_git of the stored release.
""" release = Release( name=name, target=target, target_type=ObjectType.REVISION, message=None, metadata=None, synthetic=False, author=Person(name=None, email=None, fullname=b""), date=None, ) self.storage.release_add([release]) return release.id def store_content(self, rev_ctx: hgutil.BaseContext, file_path: bytes) -> Content: """Store a revision content hg nodeid and file path. Content is a mix of file content at a given revision and its permissions found in the changeset's manifest. Args: rev_ctx: the he revision context. file_path: the hg path of the content. Returns: the sha1_git of the top level directory. """ hg_nodeid = rev_ctx.node() file_ctx = rev_ctx[file_path] try: file_nodeid = file_ctx.filenode() except hgutil.LookupError: # TODO # Raising CorruptedRevision avoid crashing the whole loading # but can lead to a lot of missing revisions. # SkippedContent could be used but need actual content to calculate its id. # Maybe the hg_nodeid can be used instead. # Another option could be to just ignore the missing content. # This point is left to future commits. # Check for other uses to apply the same logic there. raise CorruptedRevision(hg_nodeid) perms = FLAG_PERMS[file_ctx.flags()] # Key is file_nodeid + perms because permissions does not participate # in content hash in hg while it is the case in swh. cache_key = (file_nodeid, perms) sha1_git = self._content_hash_cache.get(cache_key) if sha1_git is None: try: data = file_ctx.data() except hgutil.error.RevlogError: # TODO # See above use of `CorruptedRevision` raise CorruptedRevision(hg_nodeid) content = ModelContent.from_data(data) self.storage.content_add([content]) sha1_git = content.sha1_git self._content_hash_cache[cache_key] = sha1_git # Here we make sure to return only necessary data. return Content({"sha1_git": sha1_git, "perms": perms}) def store_directories(self, rev_ctx: hgutil.BaseContext) -> Sha1Git: """Store a revision directories given its hg nodeid. Mercurial as no directory as in git. A Git like tree must be build from file paths to obtain each directory hash. Args: rev_ctx: the he revision context. Returns: the sha1_git of the top level directory. 
""" repo: hgutil.Repository = self._repo # mypy can't infer that repo is not None prev_ctx = repo[self._last_hg_nodeid] # TODO maybe do diff on parents try: status = prev_ctx.status(rev_ctx) except hgutil.error.LookupError: raise CorruptedRevision(rev_ctx.node()) for file_path in status.removed: try: del self._last_root[file_path] except KeyError: raise CorruptedRevision(rev_ctx.node()) for file_path in status.added: content = self.store_content(rev_ctx, file_path) self._last_root[file_path] = content for file_path in status.modified: content = self.store_content(rev_ctx, file_path) self._last_root[file_path] = content self._last_hg_nodeid = rev_ctx.node() directories: Deque[Directory] = deque([self._last_root]) while directories: directory = directories.pop() self.storage.directory_add([directory.to_model()]) directories.extend( [item for item in directory.values() if isinstance(item, Directory)] ) return self._last_root.hash class HgArchiveLoaderFromDisk(HgLoaderFromDisk): """Mercurial loader for repository wrapped within tarballs.""" def __init__( self, storage: StorageInterface, url: str, visit_date: Optional[datetime] = None, archive_path: str = None, temp_directory: str = "/tmp", max_content_size: Optional[int] = None, ): super().__init__( storage=storage, url=url, visit_date=visit_date, logging_class="swh.loader.mercurial.ArchiveLoaderFromDisk", temp_directory=temp_directory, max_content_size=max_content_size, ) self.archive_extract_temp_dir = None self.archive_path = archive_path def prepare(self): """Extract the archive instead of cloning.""" self.archive_extract_temp_dir = tmp_extract( archive=self.archive_path, dir=self._temp_directory, prefix=TEMPORARY_DIR_PREFIX_PATTERN, suffix=f".dump-{os.getpid()}", log=self.log, source=self.origin_url, ) repo_name = os.listdir(self.temp_dir)[0] self.directory = os.path.join(self.archive_extract_temp_dir, repo_name) super().prepare() # Allow direct usage of the loader from the command line with # `python -m swh.loader.mercurial.from_disk $ORIGIN_URL` if __name__ == "__main__": import logging import click logging.basicConfig( level=logging.DEBUG, format="%(asctime)s %(process)d %(message)s" ) @click.command() @click.option("--origin-url", help="origin url") @click.option("--hg-directory", help="Path to mercurial repository to load") @click.option("--visit-date", default=None, help="Visit date") def main(origin_url, hg_directory, visit_date): from swh.storage import get_storage storage = get_storage(cls="memory") return HgLoaderFromDisk( storage, origin_url, directory=hg_directory, visit_date=parse_visit_date(visit_date), ).load() main() diff --git a/swh/loader/mercurial/tests/test_from_disk.py b/swh/loader/mercurial/tests/test_from_disk.py index 1a9ed5f..991b374 100644 --- a/swh/loader/mercurial/tests/test_from_disk.py +++ b/swh/loader/mercurial/tests/test_from_disk.py @@ -1,436 +1,486 @@ # Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information - from datetime import datetime from hashlib import sha1 import os +from pathlib import Path +import subprocess import attr import pytest from swh.loader.mercurial.utils import parse_visit_date from swh.loader.tests import ( assert_last_visit_matches, check_snapshot, get_stats, prepare_repository_from_archive, ) from swh.model.from_disk import Content, DentryPerms from swh.model.hashutil import hash_to_bytes, 
hash_to_hex from swh.model.identifiers import ObjectType from swh.model.model import RevisionType, Snapshot, SnapshotBranch, TargetType from swh.storage import get_storage from swh.storage.algos.snapshot import snapshot_get_latest from ..from_disk import HgDirectory, HgLoaderFromDisk from .loader_checker import ExpectedSwhids, LoaderChecker VISIT_DATE = parse_visit_date("2016-05-03 15:16:32+00") assert VISIT_DATE is not None def random_content() -> Content: """Create minimal content object.""" data = str(datetime.now()).encode() return Content({"sha1_git": sha1(data).digest(), "perms": DentryPerms.content}) def test_hg_directory_creates_missing_directories(): directory = HgDirectory() directory[b"path/to/some/content"] = random_content() def test_hg_directory_get(): content = random_content() directory = HgDirectory() assert directory.get(b"path/to/content") is None assert directory.get(b"path/to/content", content) == content directory[b"path/to/content"] = content assert directory.get(b"path/to/content") == content def test_hg_directory_deletes_empty_directories(): directory = HgDirectory() content = random_content() directory[b"path/to/content"] = content directory[b"path/to/some/deep/content"] = random_content() del directory[b"path/to/some/deep/content"] assert directory.get(b"path/to/some/deep") is None assert directory.get(b"path/to/some") is None assert directory.get(b"path/to/content") == content def test_hg_directory_when_directory_replaces_file(): directory = HgDirectory() directory[b"path/to/some"] = random_content() directory[b"path/to/some/content"] = random_content() # Those tests assert expectations on repository loading # by reading expected values from associated json files # produced by the `swh-hg-identify` command line utility. # # It has more granularity than historical tests. # Assertions will tell if the error comes from the directories, # revisions or releases rather than only checking the snapshot. # # With more work it should even be possible to know which part # of an object is faulty. def test_examples(swh_storage, datadir, tmp_path): for archive_name in ("hello", "transplant", "the-sandbox", "example"): archive_path = os.path.join(datadir, f"{archive_name}.tgz") json_path = os.path.join(datadir, f"{archive_name}.json") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) LoaderChecker( loader=HgLoaderFromDisk(swh_storage, repo_url), expected=ExpectedSwhids.load(json_path), ).check() # This test has been adapted from the historical `HgBundle20Loader` tests # to ensure compatibility of `HgLoaderFromDisk`. # Hashes have been produced by copy-pasting the result of the implementation # to prevent regressions.
def test_loader_hg_new_visit_no_release(swh_storage, datadir, tmp_path): """Eventful visit should yield 1 snapshot""" archive_name = "the-sandbox" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) loader = HgLoaderFromDisk(swh_storage, url=repo_url) assert loader.load() == {"status": "eventful"} tip_revision_develop = "a9c4534552df370f43f0ef97146f393ef2f2a08c" tip_revision_default = "70e750bb046101fdced06f428e73fee471509c56" expected_snapshot = Snapshot( id=hash_to_bytes("3b8fe58e467deb7597b12a5fd3b2c096b8c02028"), branches={ b"develop": SnapshotBranch( target=hash_to_bytes(tip_revision_develop), target_type=TargetType.REVISION, ), b"default": SnapshotBranch( target=hash_to_bytes(tip_revision_default), target_type=TargetType.REVISION, ), b"HEAD": SnapshotBranch(target=b"develop", target_type=TargetType.ALIAS,), }, ) assert_last_visit_matches( loader.storage, repo_url, status="full", type="hg", snapshot=expected_snapshot.id, ) check_snapshot(expected_snapshot, loader.storage) stats = get_stats(loader.storage) assert stats == { "content": 2, "directory": 3, "origin": 1, "origin_visit": 1, "release": 0, "revision": 58, "skipped_content": 0, "snapshot": 1, } # This test has been adapted from the historical `HgBundle20Loader` tests # to ensure compatibility of `HgLoaderFromDisk`. # Hashes have been produced by copy-pasting the result of the implementation # to prevent regressions. def test_loader_hg_new_visit_with_release(swh_storage, datadir, tmp_path): """Eventful visit with release should yield 1 snapshot""" archive_name = "hello" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) loader = HgLoaderFromDisk(swh_storage, url=repo_url, visit_date=VISIT_DATE,) actual_load_status = loader.load() assert actual_load_status == {"status": "eventful"} # then stats = get_stats(loader.storage) assert stats == { "content": 3, "directory": 3, "origin": 1, "origin_visit": 1, "release": 1, "revision": 3, "skipped_content": 0, "snapshot": 1, } # cf. test_loader.org for an explanation of where those hashes come from tip_release = hash_to_bytes("515c4d72e089404356d0f4b39d60f948b8999140") release = loader.storage.release_get([tip_release])[0] assert release is not None tip_revision_default = hash_to_bytes("c3dbe4fbeaaa98dd961834e4007edb3efb0e2a27") revision = loader.storage.revision_get([tip_revision_default])[0] assert revision is not None expected_snapshot = Snapshot( id=hash_to_bytes("d35668e02e2ba4321dc951cd308cf883786f918a"), branches={ b"default": SnapshotBranch( target=tip_revision_default, target_type=TargetType.REVISION, ), b"0.1": SnapshotBranch(target=tip_release, target_type=TargetType.RELEASE,), b"HEAD": SnapshotBranch(target=b"default", target_type=TargetType.ALIAS,), }, ) check_snapshot(expected_snapshot, loader.storage) assert_last_visit_matches( loader.storage, repo_url, type=RevisionType.MERCURIAL.value, status="full", snapshot=expected_snapshot.id, ) # This test has been adapted from the historical `HgBundle20Loader` tests # to ensure compatibility of `HgLoaderFromDisk`. # Hashes have been produced by copy-pasting the result of the implementation # to prevent regressions. def test_visit_repository_with_transplant_operations(swh_storage, datadir, tmp_path): """Visiting a mercurial repository with transplant operations within should yield a snapshot as well.
""" archive_name = "transplant" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) loader = HgLoaderFromDisk(swh_storage, url=repo_url, visit_date=VISIT_DATE,) # load hg repository actual_load_status = loader.load() assert actual_load_status == {"status": "eventful"} # collect swh revisions assert_last_visit_matches( loader.storage, repo_url, type=RevisionType.MERCURIAL.value, status="full" ) revisions = [] snapshot = snapshot_get_latest(loader.storage, repo_url) for branch in snapshot.branches.values(): if branch.target_type.value != "revision": continue revisions.append(branch.target) # extract original changesets info and the transplant sources hg_changesets = set() transplant_sources = set() for rev in loader.storage.revision_log(revisions): extids = list( loader.storage.extid_get_from_target(ObjectType.REVISION, [rev["id"]]) ) assert len(extids) == 1 hg_changesets.add(hash_to_hex(extids[0].extid)) for k, v in rev["extra_headers"]: if k == b"transplant_source": transplant_sources.add(v.decode("ascii")) # check extracted data are valid assert len(hg_changesets) > 0 assert len(transplant_sources) > 0 assert transplant_sources <= hg_changesets def _partial_copy_storage( old_storage, origin_url: str, mechanism: str, copy_revisions: bool ): """Create a new storage, and only copy ExtIDs or head revisions to it.""" new_storage = get_storage(cls="memory") snapshot = snapshot_get_latest(old_storage, origin_url) assert snapshot heads = [branch.target for branch in snapshot.branches.values()] if mechanism == "extid": extids = old_storage.extid_get_from_target(ObjectType.REVISION, heads) new_storage.extid_add(extids) if copy_revisions: # copy revisions, but erase their metadata to make sure the loader doesn't # fallback to revision.metadata["nodeid"] revisions = [ attr.evolve(rev, metadata={}) for rev in old_storage.revision_get(heads) if rev ] new_storage.revision_add(revisions) else: assert mechanism == "same storage" return old_storage # copy origin, visit, status new_storage.origin_add(old_storage.origin_get([origin_url])) visit = old_storage.origin_visit_get_latest(origin_url) new_storage.origin_visit_add([visit]) statuses = old_storage.origin_visit_status_get(origin_url, visit.visit).results new_storage.origin_visit_status_add(statuses) new_storage.snapshot_add([snapshot]) return new_storage @pytest.mark.parametrize("mechanism", ("extid", "same storage")) def test_load_unchanged_repo_should_be_uneventful( swh_storage, datadir, tmp_path, mechanism ): """Checks the loader can find which revisions it already loaded, using ExtIDs.""" archive_name = "hello" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) repo_path = repo_url.replace("file://", "") loader = HgLoaderFromDisk(swh_storage, repo_path) assert loader.load() == {"status": "eventful"} assert get_stats(loader.storage) == { "content": 3, "directory": 3, "origin": 1, "origin_visit": 1, "release": 1, "revision": 3, "skipped_content": 0, "snapshot": 1, } old_storage = swh_storage # Create a new storage, and only copy ExtIDs or head revisions to it. # This should be enough for the loader to know revisions were already loaded new_storage = _partial_copy_storage( old_storage, repo_path, mechanism=mechanism, copy_revisions=True ) # Create a new loader (to start with a clean slate, eg. 
remove the caches), # with the new, partial, storage loader = HgLoaderFromDisk(new_storage, repo_path) assert loader.load() == {"status": "uneventful"} if mechanism == "same storage": # Should have all the objects assert get_stats(loader.storage) == { "content": 3, "directory": 3, "origin": 1, "origin_visit": 2, "release": 1, "revision": 3, "skipped_content": 0, "snapshot": 1, } else: # Should have only the objects we directly inserted from the test, plus # a new visit assert get_stats(loader.storage) == { "content": 0, "directory": 0, "origin": 1, "origin_visit": 2, "release": 0, "revision": 1, "skipped_content": 0, "snapshot": 1, } def test_load_unchanged_repo__dangling_extid(swh_storage, datadir, tmp_path): """Checks the loader will load revisions targeted by an ExtID if the revisions are missing from the storage""" archive_name = "hello" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) repo_path = repo_url.replace("file://", "") loader = HgLoaderFromDisk(swh_storage, repo_path) assert loader.load() == {"status": "eventful"} assert get_stats(loader.storage) == { "content": 3, "directory": 3, "origin": 1, "origin_visit": 1, "release": 1, "revision": 3, "skipped_content": 0, "snapshot": 1, } old_storage = swh_storage # Create a new storage, and only copy ExtIDs or head revisions to it. # This should be enough for the loader to know revisions were already loaded new_storage = _partial_copy_storage( old_storage, repo_path, mechanism="extid", copy_revisions=False ) # Create a new loader (to start with a clean slate, e.g. remove the caches), # with the new, partial, storage loader = HgLoaderFromDisk(new_storage, repo_path) assert get_stats(loader.storage) == { "content": 0, "directory": 0, "origin": 1, "origin_visit": 1, "release": 0, "revision": 0, "skipped_content": 0, "snapshot": 1, } assert loader.load() == {"status": "eventful"} assert get_stats(loader.storage) == { "content": 3, "directory": 3, "origin": 1, "origin_visit": 2, "release": 1, "revision": 3, "skipped_content": 0, "snapshot": 1, } def test_missing_filelog_should_not_crash(swh_storage, datadir, tmp_path): archive_name = "missing-filelog" archive_path = os.path.join(datadir, f"{archive_name}.tgz") repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) directory = repo_url.replace("file://", "") loader = HgLoaderFromDisk( storage=swh_storage, url=repo_url, directory=directory, # specify directory to avoid clone visit_date=VISIT_DATE, ) actual_load_status = loader.load() assert actual_load_status == {"status": "eventful"} assert_last_visit_matches(swh_storage, repo_url, status="partial", type="hg") + + +def hg_strip(repo: str, revset: str) -> None: + """Removes the revisions matched by `revset`, and all of their descendants, from the local repository.""" + # Previously called `hg strip`, it was renamed to `hg debugstrip` in Mercurial 5.7 + # because it's most likely not what most users want to do (they should use some kind + # of history-rewriting tool like `histedit` or `prune`). + # But here, it's exactly what we want to do.
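+ # check_call raises CalledProcessError if the strip fails, failing the test early.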
+ subprocess.check_call(["hg", "debugstrip", revset], cwd=repo) + + +def test_load_repo_with_new_commits(swh_storage, datadir, tmp_path): + archive_name = "hello" + archive_path = Path(datadir, f"{archive_name}.tgz") + json_path = Path(datadir, f"{archive_name}.json") + repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) + + # first load with missing commits + hg_strip(repo_url.replace("file://", ""), "tip") + loader = HgLoaderFromDisk(swh_storage, repo_url) + assert loader.load() == {"status": "eventful"} + assert get_stats(loader.storage) == { + "content": 2, + "directory": 2, + "origin": 1, + "origin_visit": 1, + "release": 0, + "revision": 2, + "skipped_content": 0, + "snapshot": 1, + } + + # second load with all commits + repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path) + loader = HgLoaderFromDisk(swh_storage, repo_url) + checker = LoaderChecker(loader=loader, expected=ExpectedSwhids.load(json_path),) + + checker.check() + + assert get_stats(loader.storage) == { + "content": 3, + "directory": 3, + "origin": 1, + "origin_visit": 2, + "release": 1, + "revision": 3, + "skipped_content": 0, + "snapshot": 2, + }
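For readers unfamiliar with revsets, here is a minimal sketch of the selection logic that `get_hg_revs_to_load` now relies on. The repository path and the choice of `tip` as the only previously recorded head are illustrative assumptions, not part of this patch:

from swh.loader.mercurial import hgutil

# Open the repository the same way the loader does.
repo = hgutil.repository("/tmp/repo")  # hypothetical local clone

# Pretend the previous snapshot recorded only `tip` as a branch head.
existing_heads = [repo[b"tip"].rev()]

# Select the revisions that are neither a recorded head nor one of its
# ancestors, i.e. exactly the changesets added since the last visit.
new_revs = repo.revs("not ::(%ld)", existing_heads)
print(len(new_revs))  # 0 here: tip's ancestors already cover the whole history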