D6240.id22696.diff

diff --git a/swh/loader/mercurial/from_disk.py b/swh/loader/mercurial/from_disk.py
--- a/swh/loader/mercurial/from_disk.py
+++ b/swh/loader/mercurial/from_disk.py
@@ -8,7 +8,7 @@
 import os
 from shutil import rmtree
 from tempfile import mkdtemp
-from typing import Deque, Dict, List, Optional, Set, Tuple, TypeVar, Union
+from typing import Deque, Dict, Iterator, List, Optional, Set, Tuple, TypeVar, Union
 
 from swh.loader.core.loader import BaseLoader
 from swh.loader.core.utils import clean_dangling_folders
@@ -36,7 +36,7 @@
 
 from . import hgutil
 from .archive_extract import tmp_extract
-from .hgutil import HgFilteredSet, HgNodeId, HgSpanSet
+from .hgutil import HgNodeId
 
 FLAG_PERMS = {
     b"l": DentryPerms.symlink,
@@ -289,6 +289,28 @@
         ]
         return extids
 
+    def _get_extids_for_hgnodes(self, hgnode_ids: List[bytes]) -> List[ExtID]:
+        """Get all Mercurial ExtIDs that match the given hg node ids and point to
+        a known revision.
+
+        """
+        extids = [
+            extid
+            for extid in self.storage.extid_get_from_extid(EXTID_TYPE, hgnode_ids)
+            if extid.extid_version == EXTID_VERSION
+        ]
+        if extids:
+            # Filter out dangling extids, we need to load their target again
+            revisions_missing = self.storage.revision_missing(
+                [extid.target.object_id for extid in extids]
+            )
+            extids = [
+                extid
+                for extid in extids
+                if extid.target.object_id not in revisions_missing
+            ]
+        return extids
+
     def fetch_data(self) -> bool:
         """Fetch the data from the source the loader is currently loading
 
@@ -317,35 +339,55 @@
 
         return False
 
-    def get_hg_revs_to_load(self) -> Union[HgFilteredSet, HgSpanSet]:
-        """Return the hg revision numbers to load."""
+    def _new_revs(self, heads: List[bytes]):
+        """Return unseen revisions, that is, revisions that are not ancestors of
+        the given heads (nor the heads themselves)."""
+        assert self._repo is not None
+        existing_heads = []  # heads that still exist in the repository
+
+        for hg_nodeid in heads:
+            try:
+                rev = self._repo[hg_nodeid].rev()
+                existing_heads.append(rev)
+            except KeyError:  # the node does not exist anymore
+                pass
+
+        # select revisions that are not ancestors of heads
+        # and not the heads themselves
+        new_revs = self._repo.revs("not ::(%ld)", existing_heads)
+
+        if new_revs:
+            self.log.info("New revisions found: %d", len(new_revs))
+
+        return new_revs
+
+    def get_hg_revs_to_load(self) -> Iterator[int]:
+        """Yield the hg revision numbers to load.
+
+        """
         assert self._repo is not None
         repo: hgutil.Repository = self._repo
+
+        seen_revs: Set[int] = set()
+        # 1. Use the heads of the latest snapshot to filter out already seen revisions
        if self._latest_heads:
-            existing_heads = []  # heads that still exist in the repository
-            for hg_nodeid in self._latest_heads:
-                try:
-                    rev = repo[hg_nodeid].rev()
-                    existing_heads.append(rev)
-                except KeyError:  # the node does not exist anymore
-                    pass
-
-            # select revisions that are not ancestors of heads
-            # and not the heads themselves
-            new_revs = repo.revs("not ::(%ld)", existing_heads)
-
-            if new_revs:
-                self.log.info("New revisions found: %d", len(new_revs))
-            return new_revs
-        else:
-            return repo.revs("all()")
+            for rev in self._new_revs(self._latest_heads):
+                seen_revs.add(rev)
+                yield rev
+
+        # 2. Then filter out the remaining revisions through the extid mappings
+        # shared across hg origins
+        revs_left = repo.revs("all() - ::(%ld)", seen_revs)
+        hg_nodeids = [repo[nodeid].node() for nodeid in revs_left]
+        for rev in self._new_revs(
+            [extid.extid for extid in self._get_extids_for_hgnodes(hg_nodeids)]
+        ):
+            yield rev
 
     def store_data(self):
         """Store fetched data in the database."""
         revs = self.get_hg_revs_to_load()
-        if not revs:
-            self._load_status = "uneventful"
-            return
+        length_ingested_revs = 0
 
         assert self._repo is not None
         repo = self._repo
@@ -356,14 +398,15 @@
                 continue
             try:
                 self.store_revision(repo[rev])
+                length_ingested_revs += 1
             except CorruptedRevision as e:
                 self._visit_status = "partial"
                 self.log.warning("Corrupted revision %s", e)
                 descendents = repo.revs("(%ld)::", [rev])
                 ignored_revs.update(descendents)
 
-        if len(ignored_revs) == len(revs):
-            # The repository is completely broken, nothing can be loaded
+        if length_ingested_revs == 0:
+            # Nothing was ingested: the repository is either empty or entirely broken
             self._load_status = "uneventful"
             return
 
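
Aside on the revsets driving the hunks above: "not ::(%ld)" selects revisions that are neither the given revisions nor their ancestors, and "all() - ::(%ld)" applies the same exclusion to the whole repository. A minimal standalone sketch of that behavior using Mercurial's Python API; the repository path and the choice of heads are purely illustrative:

    from mercurial import hg, ui as uimod

    repo = hg.repository(uimod.ui.load(), b"/tmp/example-repo")

    # Pretend these heads were recorded by a previous visit's snapshot.
    heads = [repo[b"tip"].node()]
    head_revs = [repo[n].rev() for n in heads]

    # "not ::(%ld)": revisions that are neither the heads nor their ancestors,
    # i.e. everything not yet seen (what _new_revs computes).
    new_revs = repo.revs("not ::(%ld)", head_revs)

    # "all() - ::(%ld)": what remains of the repository once the revisions
    # above are excluded (the input to the extid pass in get_hg_revs_to_load).
    revs_left = repo.revs("all() - ::(%ld)", list(new_revs))

    for rev in new_revs:
        print(rev, repo[rev].hex())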
diff --git a/swh/loader/mercurial/tests/test_from_disk.py b/swh/loader/mercurial/tests/test_from_disk.py
--- a/swh/loader/mercurial/tests/test_from_disk.py
+++ b/swh/loader/mercurial/tests/test_from_disk.py
@@ -687,3 +687,61 @@
 
     loader = HgLoaderFromDisk(swh_storage, repo_path)
     assert loader.load() == {"status": "uneventful"}
+
+
+def test_loader_hg_extid_filtering(swh_storage, datadir, tmp_path):
+    """The first visit of a fork should filter out revisions already seen via extids
+
+    """
+    archive_name = "the-sandbox"
+    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
+    repo_url = prepare_repository_from_archive(archive_path, archive_name, tmp_path)
+
+    loader = HgLoaderFromDisk(swh_storage, url=repo_url)
+
+    assert loader.load() == {"status": "eventful"}
+    stats = get_stats(loader.storage)
+    expected_stats = {
+        "content": 2,
+        "directory": 3,
+        "origin": 1,
+        "origin_visit": 1,
+        "release": 0,
+        "revision": 58,
+        "skipped_content": 0,
+        "snapshot": 1,
+    }
+    assert stats == expected_stats
+
+    visit_status = assert_last_visit_matches(
+        loader.storage, repo_url, status="full", type="hg",
+    )
+
+    # Make a fork of the first repository we ingested
+    fork_url = prepare_repository_from_archive(
+        archive_path, "the-sandbox-reloaded", tmp_path
+    )
+    loader2 = HgLoaderFromDisk(
+        swh_storage, url=fork_url, directory=str(tmp_path / archive_name)
+    )
+
+    assert loader2.load() == {"status": "uneventful"}
+
+    stats = get_stats(loader.storage)
+    expected_stats2 = expected_stats.copy()
+    expected_stats2.update(
+        {"origin": 1 + 1, "origin_visit": 1 + 1,}
+    )
+    assert stats == expected_stats2
+
+    visit_status2 = assert_last_visit_matches(
+        loader.storage, fork_url, status="full", type="hg",
+    )
+
+    assert visit_status.snapshot is not None
+    assert visit_status2.snapshot is None
+
+    # FIXME: This is consistent with filtering out data already seen in a previous
+    # snapshot of a given origin, but here the fork origin ends up with no snapshot
+    # at all. It should probably get one, otherwise forks of fully archived
+    # repositories never reference a snapshot.
